// ceph/src/mon/OSDMonitor.cc (imported from ceph 15.2.8)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes owned by this service; advertised via
// get_store_prefixes() so sync/trim know which keyspaces are ours.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
  - note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
133 struct OSDMemCache : public PriorityCache::PriCache {
134 OSDMonitor *osdmon;
135 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
136 int64_t committed_bytes = 0;
137 double cache_ratio = 0;
138
139 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
140
141 virtual uint64_t _get_used_bytes() const = 0;
142
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri, uint64_t total_cache) const {
145 int64_t assigned = get_cache_bytes(pri);
146
147 switch (pri) {
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1:
150 {
151 int64_t request = _get_used_bytes();
152 return (request > assigned) ? request - assigned : 0;
153 }
154 default:
155 break;
156 }
157 return -EOPNOTSUPP;
158 }
159
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
161 return cache_bytes[pri];
162 }
163
164 virtual int64_t get_cache_bytes() const {
165 int64_t total = 0;
166
167 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
168 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
169 total += get_cache_bytes(pri);
170 }
171 return total;
172 }
173
174 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
175 cache_bytes[pri] = bytes;
176 }
177 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
178 cache_bytes[pri] += bytes;
179 }
180 virtual int64_t commit_cache_size(uint64_t total_cache) {
181 committed_bytes = PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache);
183 return committed_bytes;
184 }
185 virtual int64_t get_committed_size() const {
186 return committed_bytes;
187 }
188 virtual double get_cache_ratio() const {
189 return cache_ratio;
190 }
191 virtual void set_cache_ratio(double ratio) {
192 cache_ratio = ratio;
193 }
194 virtual string get_cache_name() const = 0;
195 };
196
197 struct IncCache : public OSDMemCache {
198 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
199
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon->inc_osd_cache.get_bytes();
202 }
203
204 virtual string get_cache_name() const {
205 return "OSDMap Inc Cache";
206 }
207
208 uint64_t _get_num_osdmaps() const {
209 return osdmon->inc_osd_cache.get_size();
210 }
211 };
212
213 struct FullCache : public OSDMemCache {
214 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
215
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon->full_osd_cache.get_bytes();
218 }
219
220 virtual string get_cache_name() const {
221 return "OSDMap Full Cache";
222 }
223
224 uint64_t _get_num_osdmaps() const {
225 return osdmon->full_osd_cache.get_size();
226 }
227 };
228
// Shared with the priority cache manager; created in the OSDMonitor
// constructor.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Caps on per-pool application metadata (application count, keys per
// application, and key/value string length).
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
251 bool is_unmanaged_snap_op_permitted(CephContext* cct,
252 const KeyServer& key_server,
253 const EntityName& entity_name,
254 const MonCap& mon_caps,
255 const entity_addr_t& peer_socket_addr,
256 const std::string* pool_name)
257 {
258 typedef std::map<std::string, std::string> CommandArgs;
259
260 if (mon_caps.is_capable(
261 cct, entity_name, "osd",
262 "osd pool op unmanaged-snap",
263 (pool_name == nullptr ?
264 CommandArgs{} /* pool DNE, require unrestricted cap */ :
265 CommandArgs{{"poolname", *pool_name}}),
266 false, true, false,
267 peer_socket_addr)) {
268 return true;
269 }
270
271 AuthCapsInfo caps_info;
272 if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
273 caps_info)) {
274 dout(10) << "unable to locate OSD cap data for " << entity_name
275 << " in auth db" << dendl;
276 return false;
277 }
278
279 string caps_str;
280 if (caps_info.caps.length() > 0) {
281 auto p = caps_info.caps.cbegin();
282 try {
283 decode(caps_str, p);
284 } catch (const buffer::error &err) {
285 derr << "corrupt OSD cap data for " << entity_name << " in auth db"
286 << dendl;
287 return false;
288 }
289 }
290
291 OSDCap osd_cap;
292 if (!osd_cap.parse(caps_str, nullptr)) {
293 dout(10) << "unable to parse OSD cap data for " << entity_name
294 << " in auth db" << dendl;
295 return false;
296 }
297
298 // if the entity has write permissions in one or all pools, permit
299 // usage of unmanaged-snapshots
300 if (osd_cap.allow_all()) {
301 return true;
302 }
303
304 for (auto& grant : osd_cap.grants) {
305 if (grant.profile.is_valid()) {
306 for (auto& profile_grant : grant.profile_grants) {
307 if (is_osd_writable(profile_grant, pool_name)) {
308 return true;
309 }
310 }
311 } else if (is_osd_writable(grant, pool_name)) {
312 return true;
313 }
314 }
315
316 return false;
317 }
318
319 } // anonymous namespace
320
// Record that pg 'ps' of this pool reached 'last_epoch_clean'.
// Maintains three pieces of state:
//  - epoch_by_pg: per-pg last clean epoch (0 = never reported)
//  - floor: minimum last-clean epoch over reported pgs
//  - next_missing: first pg index that has not yet reported
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow on demand; new slots start at 0 (= not yet reported)
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this pg may have been the minimum; rescan for the new floor
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported pgs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
352 void LastEpochClean::remove_pool(uint64_t pool)
353 {
354 report_by_pool.erase(pool);
355 }
356
357 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
358 {
359 auto& lec = report_by_pool[pg.pool()];
360 return lec.report(pg.ps(), last_epoch_clean);
361 }
362
363 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364 {
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379 }
380
381 void LastEpochClean::dump(Formatter *f) const
382 {
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393 }
394
395 class C_UpdateCreatingPGs : public Context {
396 public:
397 OSDMonitor *osdmon;
398 utime_t start;
399 epoch_t epoch;
400 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
401 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
402 void finish(int r) override {
403 if (r >= 0) {
404 utime_t end = ceph_clock_now();
405 dout(10) << "osdmap epoch " << epoch << " mapping took "
406 << (end - start) << " seconds" << dendl;
407 osdmon->update_creating_pgs();
408 osdmon->check_pg_creates_subs();
409 }
410 }
411 };
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
		<< "(" << mon->get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD paxos service.  Registers this object as a config
// observer and sizes the inc/full osdmap LRU caches; if priority-cache
// sizing fails we fall back to the plain mon_osd_cache_size without
// autotuning.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  // file-scope cache shims handed to the priority cache manager later
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445 const char **OSDMonitor::get_tracked_conf_keys() const
446 {
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454 }
455
456 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
457 const std::set<std::string> &changed)
458 {
459 dout(10) << __func__ << " " << changed << dendl;
460
461 if (changed.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
463 }
464 if (changed.count("mon_memory_target") ||
465 changed.count("rocksdb_cache_size")) {
466 int r = _update_mon_cache_settings();
467 if (r < 0) {
468 derr << __func__ << " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
473 << dendl;
474 }
475 }
476 }
477
478 void OSDMonitor::_set_cache_autotuning()
479 {
480 if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
481 // Disable cache autotuning
482 std::lock_guard l(balancer_lock);
483 pcm = nullptr;
484 }
485
486 if (g_conf()->mon_memory_autotune && pcm == nullptr) {
487 int r = register_cache_with_pcm();
488 if (r < 0) {
489 dout(10) << __func__
490 << " Error while registering osdmon caches with pcm."
491 << " Cache auto tuning not enabled."
492 << dendl;
493 mon_memory_autotune = false;
494 } else {
495 mon_memory_autotune = true;
496 }
497 }
498 }
499
// Apply new mon_memory_target / rocksdb_cache_size values from config.
// Validates the sizes, recomputes the kv/inc/full cache ratios (rolling
// the member values back on failure), and when autotuning is active
// re-tunes the pcm with the new min/max/target.
// Returns 0 on success, -EINVAL on invalid sizes or missing caches.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if ratio setup fails
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // same sizing logic as register_cache_with_pcm(): reserve 'base' plus
  // expected fragmentation, the rest becomes the cache ceiling
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
559
// Read cache sizing knobs from config when autotuning is enabled and
// give the inc/full osdmap LRU caches their initial (minimum) sizes.
// Returns -EINVAL when mon_memory_target or mon_osd_cache_size_min are
// not positive; returns 0 (and does nothing) when autotuning is off.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // NOTE: reuses the osd_* knobs for base size and expected
    // fragmentation — there are no mon-specific equivalents
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
584
585 bool OSDMonitor::_have_pending_crush()
586 {
587 return pending_inc.crush.length() > 0;
588 }
589
// Crush map of the committed osdmap (ignores any pending changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
595 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
596 {
597 bufferlist bl;
598 if (pending_inc.crush.length())
599 bl = pending_inc.crush;
600 else
601 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
602
603 auto p = bl.cbegin();
604 newcrush.decode(p);
605 }
606
// Build the very first osdmap (epoch 1) for a new cluster and stash it
// in pending_inc as a full map.  Uses a pre-seeded "mkfs" osdmap from
// the store when present, otherwise builds a simple default map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
666 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
667 {
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
670 s.insert(OSD_METADATA_PREFIX);
671 s.insert(OSD_SNAP_PREFIX);
672 }
673
// Bring the in-memory osdmap up to the last paxos-committed epoch:
// locate/repair the latest stashed full map, load it, then replay all
// newer incrementals, writing the regenerated full maps back to the
// store.  Finally refresh dependent state (creating pgs, subs, msgr
// features, down->out tracking).  'need_bootstrap' is unused here.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job is now against a stale epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first full_<v> key that exists
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE(review): each dout(20)..dendl pair below appears to rely on
	// the scope opened by the dout macro staying open until dendl, so
	// the two 'jf' declarations do not collide — confirm against
	// common/dout.h before restructuring.
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction once it grows large, to bound
    // memory usage while replaying many incrementals
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
	// could be created *or* destroyed, but we can safely drop it
	osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile down_pending_out with the map: track newly-down osds,
  // forget osds that came back up or went out
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
912 int OSDMonitor::register_cache_with_pcm()
913 {
914 if (mon_memory_target <= 0 || mon_memory_min <= 0) {
915 derr << __func__ << " Invalid memory size specified for mon caches."
916 << " Caches will not be auto-tuned."
917 << dendl;
918 return -EINVAL;
919 }
920 uint64_t base = mon_memory_base;
921 double fragmentation = mon_memory_fragmentation;
922 // For calculating total target memory, consider rocksdb cache size.
923 uint64_t target = mon_memory_target;
924 uint64_t min = mon_memory_min;
925 uint64_t max = min;
926
927 // Apply the same logic as in bluestore to set the max amount
928 // of memory to use for cache. Assume base memory for OSDMaps
929 // and then add in some overhead for fragmentation.
930 uint64_t ltarget = (1.0 - fragmentation) * target;
931 if (ltarget > base + min) {
932 max = ltarget - base;
933 }
934
935 rocksdb_binned_kv_cache = mon->store->get_priority_cache();
936 if (!rocksdb_binned_kv_cache) {
937 derr << __func__ << " not using rocksdb" << dendl;
938 return -EINVAL;
939 }
940
941 int r = _set_cache_ratios();
942 if (r < 0) {
943 derr << __func__ << " Cache ratios for pcm could not be set."
944 << " Review the kv (rocksdb) and mon_memory_target sizes."
945 << dendl;
946 return -EINVAL;
947 }
948
949 pcm = std::make_shared<PriorityCache::Manager>(
950 cct, min, max, target, true);
951 pcm->insert("kv", rocksdb_binned_kv_cache, true);
952 pcm->insert("inc", inc_cache, true);
953 pcm->insert("full", full_cache, true);
954 dout(1) << __func__ << " pcm target: " << target
955 << " pcm max: " << max
956 << " pcm min: " << min
957 << " inc_osd_cache size: " << inc_osd_cache.get_size()
958 << dendl;
959 return 0;
960 }
961
962 int OSDMonitor::_set_cache_ratios()
963 {
964 double old_cache_kv_ratio = cache_kv_ratio;
965
966 // Set the cache ratios for kv(rocksdb), inc and full caches
967 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
968 if (cache_kv_ratio >= 1.0) {
969 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
970 << ") must be in range [0,<1.0]."
971 << dendl;
972 cache_kv_ratio = old_cache_kv_ratio;
973 return -EINVAL;
974 }
975 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
976 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
977 inc_cache->set_cache_ratio(cache_inc_ratio);
978 full_cache->set_cache_ratio(cache_full_ratio);
979
980 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
981 << " inc ratio " << cache_inc_ratio
982 << " full ratio " << cache_full_ratio
983 << dendl;
984 return 0;
985 }
986
987 void OSDMonitor::start_mapping()
988 {
989 // initiate mapping job
990 if (mapping_job) {
991 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
992 << dendl;
993 mapping_job->abort();
994 }
995 if (!osdmap.get_pools().empty()) {
996 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
997 mapping_job = mapping.start_update(osdmap, mapper,
998 g_conf()->mon_osd_mapping_pgs_per_chunk);
999 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1000 << " at " << fin->start << dendl;
1001 mapping_job->set_finish_event(fin);
1002 } else {
1003 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1004 mapping_job = nullptr;
1005 }
1006 }
1007
1008 void OSDMonitor::update_msgr_features()
1009 {
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025 }
1026
1027 void OSDMonitor::on_active()
1028 {
1029 update_logger();
1030
1031 if (mon->is_leader()) {
1032 mon->clog->debug() << "osdmap " << osdmap;
1033 if (!priority_convert) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert = true;
1037 }
1038 } else {
1039 list<MonOpRequestRef> ls;
1040 take_all_failures(ls);
1041 while (!ls.empty()) {
1042 MonOpRequestRef op = ls.front();
1043 op->mark_osdmon_event(__func__);
1044 dispatch(op);
1045 ls.pop_front();
1046 }
1047 }
1048 start_mapping();
1049 }
1050
void OSDMonitor::on_restart()
{
  // Forget all recorded OSD report times; they predate the restart and are
  // no longer meaningful.
  last_osd_report.clear();
}
1055
1056 void OSDMonitor::on_shutdown()
1057 {
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069 }
1070
1071 void OSDMonitor::update_logger()
1072 {
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079 }
1080
void OSDMonitor::create_pending()
{
  // Begin staging the next incremental (epoch + 1) against the committed
  // osdmap.  Any previously staged-but-uncommitted state is discarded.
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // If any full-ness ratio in the committed map is unset/invalid, seed it
    // from config.  Config values > 1.0 are treated as percentages and
    // scaled down to a [0,1] ratio.
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      // Copy the pool into the pending map first (if not already staged)
      // so we modify the staged copy, not the committed one.
      if (pending_inc.new_pools.count(pool_id) == 0) {
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // Stage the rewritten crush map in the pending incremental.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1150
// Recompute the set of PGs that are being created, based on the staged
// incremental @c inc and the resulting map @c nextmap.  Works on (and
// returns) a private copy of creating_pgs; the caller is responsible for
// persisting/installing the result.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // Snapshot the shared creating_pgs under its lock; everything below
    // operates on the local copy.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // Scan both the committed pools and any pools newly added by this
    // incremental for PGs that still need to be created.
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // Drop creating-PG state for pools deleted by this incremental.
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    // e.g. PGs belonging to pools that no longer exist in nextmap.
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // Move queued per-pool PG ranges into the active creating set, capped at
  // mon_osd_max_creating_pgs in-flight creations (minimum 1).
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // Take as many PGs from this pool's [start,end) range as the cap allows.
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      // Current up/acting sets for this PG in the next map.
      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        // Existing entry: detect whether the old->new map transition starts
        // a new peering interval, and if so update history/past_intervals.
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          // Record a split if pg_num changed across the transition.
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
// Pre-populate pg_temp entries in pending_inc for PGs whose mapping will
// change with this incremental.  Either primes every PG ("all") or only the
// PGs touching "interesting" OSDs, depending on what changed and on a cost
// estimate; the work is bounded by mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // A new crush map or newly-up OSDs can move mappings anywhere: prime all.
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // OSDs going down (state bit set while currently up) are interesting.
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // Weight reductions are interesting; a weight increase forces "all".
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  // Nothing changed that warrants priming.
  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate the per-OSD work (PGs on one OSD times the OSD count); fall
    // back to "all" if that would exceed the configured fraction of PGs.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // Materialize the post-incremental map to compute the new mappings.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // Prime every PG via a parallel job, bounded by the configured timeout.
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // Prime only the PGs acting on the interesting OSDs, checking the time
    // budget every `chunk` PGs and deduplicating shared PGs via did_pgs.
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1440
// Stage a pg_temp entry for @c pgid in pending_inc, preserving the current
// acting set while the mapping transitions to what @c next computes.
// A series of early returns skips PGs for which priming cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  // Still-creating PGs have no established acting set worth preserving.
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // Mapping under the current (committed) map...
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // ...and under the next (pending) map.
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  // An empty acting vector here means "remove the pg_temp mapping".
  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // Serialized because this may run from parallel mapper threads.
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace does not overwrite an existing entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489 /**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494 {
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
1506 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
1520 if (g_conf()->mon_osd_prime_pg_temp) {
1521 maybe_prime_pg_temp();
1522 }
1523 }
1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
1541 ++p;
1542 }
1543 }
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
1548 if (i.first >= osdmap.max_osd) {
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
1552 break;
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
1557 break;
1558 }
1559 }
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1616 }
1617 }
1618
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
1628 &nearfull_pool_ids);
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
1669 }
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1704 }
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
        // make sure FLAG_FULL is truly set, so we are safe not
        // to set an extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1748 }
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
        // make sure FLAG_FULL is truly set, so we are safe not
        // to set an extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 }
1795
1796 // min_compat_client?
1797 if (!tmp.require_min_compat_client) {
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
1800 << "required " << mv << dendl;
1801 mon->clog->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv;
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
1816 }
1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1818 }
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
1826 }
1827 }
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
1911 }
1912
1913 // tell me about it
1914 for (auto i = pending_inc.new_state.begin();
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP) {
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1922 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1923 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1925 set_default_laggy_params(i->first);
1926 }
1927 }
1928 }
1929 if (s & CEPH_OSD_EXISTS)
1930 dout(2) << " osd." << i->first << " DNE" << dendl;
1931 }
1932 for (auto i = pending_inc.new_up_client.begin();
1933 i != pending_inc.new_up_client.end();
1934 ++i) {
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1937 }
1938 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1939 i != pending_inc.new_weight.end();
1940 ++i) {
1941 if (i->second == CEPH_OSD_OUT) {
1942 dout(2) << " osd." << i->first << " OUT" << dendl;
1943 } else if (i->second == CEPH_OSD_IN) {
1944 dout(2) << " osd." << i->first << " IN" << dendl;
1945 } else {
1946 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1947 }
1948 }
1949
1950 // features for osdmap and its incremental
1951 uint64_t features;
1952
1953 // encode full map and determine its crc
1954 OSDMap tmp;
1955 {
1956 tmp.deepish_copy_from(osdmap);
1957 tmp.apply_incremental(pending_inc);
1958
1959 // determine appropriate features
1960 features = tmp.get_encoding_features();
1961 dout(10) << __func__ << " encoding full map with "
1962 << tmp.require_osd_release
1963 << " features " << features << dendl;
1964
1965 // the features should be a subset of the mon quorum's features!
1966 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1967
1968 bufferlist fullbl;
1969 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1970 pending_inc.full_crc = tmp.get_crc();
1971
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t, pending_inc.epoch, fullbl);
1976 }
1977
1978 // encode
1979 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1980 bufferlist bl;
1981 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1982
1983 dout(20) << " full_crc " << tmp.get_crc()
1984 << " inc_crc " << pending_inc.inc_crc << dendl;
1985
1986 /* put everything in the transaction */
1987 put_version(t, pending_inc.epoch, bl);
1988 put_last_committed(t, pending_inc.epoch);
1989
1990 // metadata, too!
1991 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1992 p != pending_metadata.end();
1993 ++p)
1994 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1995 for (set<int>::iterator p = pending_metadata_rm.begin();
1996 p != pending_metadata_rm.end();
1997 ++p)
1998 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1999 pending_metadata.clear();
2000 pending_metadata_rm.clear();
2001
2002 // purged_snaps
2003 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2004 !pending_inc.new_purged_snaps.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2007 bufferlist v;
2008 encode(pending_inc.new_purged_snaps, v);
2009 t->put(OSD_SNAP_PREFIX, k, v);
2010 }
2011 for (auto& i : pending_inc.new_purged_snaps) {
2012 for (auto q = i.second.begin();
2013 q != i.second.end();
2014 ++q) {
2015 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2016 pending_inc.epoch,
2017 t);
2018 }
2019 }
2020 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2021 for (auto snap : snaps) {
2022 insert_purged_snap_update(pool, snap, snap + 1,
2023 pending_inc.epoch,
2024 t);
2025 }
2026 }
2027
2028 // health
2029 health_check_map_t next;
2030 tmp.check_health(cct, &next);
2031 encode_health(next, t);
2032 }
2033
2034 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2035 {
2036 bufferlist bl;
2037 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2038 if (r < 0)
2039 return r;
2040 try {
2041 auto p = bl.cbegin();
2042 decode(m, p);
2043 }
2044 catch (buffer::error& e) {
2045 if (err)
2046 *err << "osd." << osd << " metadata is corrupt";
2047 return -EIO;
2048 }
2049 return 0;
2050 }
2051
2052 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2053 {
2054 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2055 if (osdmap.is_up(osd)) {
2056 map<string,string> meta;
2057 load_metadata(osd, meta, nullptr);
2058 auto p = meta.find(field);
2059 if (p == meta.end()) {
2060 (*out)["unknown"]++;
2061 } else {
2062 (*out)[p->second]++;
2063 }
2064 }
2065 }
2066 }
2067
2068 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2069 {
2070 map<string,int> by_val;
2071 count_metadata(field, &by_val);
2072 f->open_object_section(field.c_str());
2073 for (auto& p : by_val) {
2074 f->dump_int(p.first.c_str(), p.second);
2075 }
2076 f->close_section();
2077 }
2078
2079 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2080 {
2081 map<string, string> metadata;
2082 int r = load_metadata(osd, metadata, nullptr);
2083 if (r < 0)
2084 return r;
2085
2086 auto it = metadata.find("osd_objectstore");
2087 if (it == metadata.end())
2088 return -ENOENT;
2089 *type = it->second;
2090 return 0;
2091 }
2092
2093 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2094 const pg_pool_t &pool,
2095 ostream *err)
2096 {
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set<int> checked_osds;
2100 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2101 vector<int> up, acting;
2102 pg_t pgid(ps, pool_id);
2103 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2104 for (int osd : up) {
2105 if (checked_osds.find(osd) != checked_osds.end())
2106 continue;
2107 string objectstore_type;
2108 int r = get_osd_objectstore_type(osd, &objectstore_type);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r < 0 || objectstore_type == "bluestore") {
2111 checked_osds.insert(osd);
2112 continue;
2113 }
2114 *err << "osd." << osd << " uses " << objectstore_type;
2115 return false;
2116 }
2117 }
2118 return true;
2119 }
2120
2121 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2122 {
2123 map<string,string> m;
2124 if (int r = load_metadata(osd, m, err))
2125 return r;
2126 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2127 f->dump_string(p->first.c_str(), p->second);
2128 return 0;
2129 }
2130
2131 void OSDMonitor::print_nodes(Formatter *f)
2132 {
2133 // group OSDs by their hosts
2134 map<string, list<int> > osds; // hostname => osd
2135 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2136 map<string, string> m;
2137 if (load_metadata(osd, m, NULL)) {
2138 continue;
2139 }
2140 map<string, string>::iterator hostname = m.find("hostname");
2141 if (hostname == m.end()) {
2142 // not likely though
2143 continue;
2144 }
2145 osds[hostname->second].push_back(osd);
2146 }
2147
2148 dump_services(f, osds, "osd");
2149 }
2150
2151 void OSDMonitor::share_map_with_random_osd()
2152 {
2153 if (osdmap.get_num_up_osds() == 0) {
2154 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2155 return;
2156 }
2157
2158 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2159 if (!s) {
2160 dout(10) << __func__ << " no up osd on our session map" << dendl;
2161 return;
2162 }
2163
2164 dout(10) << "committed, telling random " << s->name
2165 << " all about it" << dendl;
2166
2167 // get feature of the peer
2168 // use quorum_con_features, if it's an anonymous connection.
2169 uint64_t features = s->con_features ? s->con_features :
2170 mon->get_quorum_con_features();
2171 // whatev, they'll request more if they need it
2172 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2173 s->con->send_message(m);
2174 // NOTE: do *not* record osd has up to this epoch (as we do
2175 // elsewhere) as they may still need to request older values.
2176 }
2177
// Compute the highest osdmap version we may safely trim the store to.
// Returns 0 whenever trimming must be blocked entirely.
version_t OSDMonitor::get_trim_to() const
{
  // no quorum, no authority to trim
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // do not trim while pgs are still being created; the creating-pgs
    // machinery still needs the older maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  // debug escape hatch: operator explicitly disabled trimming
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
               " ('mon_debug_block_osdmap_trim' set to 'true')"
            << dendl;
    return 0;
  }

  {
    // floor: oldest epoch any in/up osd may still need (see
    // get_min_last_epoch_clean())
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // operator override (debug/repair): force trimming up to a fixed epoch,
    // but never past the last committed version
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs maps in the store
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only worth trimming if the floor is above what we already trimmed to
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2220
2221 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2222 {
2223 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch : osd_epochs) {
2227 if (osd_epoch.second < floor &&
2228 osdmap.is_in(osd_epoch.first)) {
2229 floor = osd_epoch.second;
2230 }
2231 }
2232 return floor;
2233 }
2234
// Called while trimming up to version `first`: re-write the full map for
// `first` into the trim transaction so the store always retains a full map
// at its oldest committed version, and keep the prune manifest consistent
// with the new first-committed version.
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
                                   version_t first)
{
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if this trim goes past the first pinned map, the prune manifest must
  // drop the now-trimmed pins
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2248
2249
2250 /* full osdmap prune
2251 *
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2253 */
2254
// Synchronize the in-memory osdmap prune manifest with the store:
// drop it if the store no longer has one, otherwise (re)load and decode it.
// Aborts if a manifest exists in the store but cannot be read.
void OSDMonitor::load_osdmap_manifest()
{
  bool store_has_manifest =
    mon->store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;  // nothing on disk, nothing in memory — nothing to do
    }

    // manifest was removed from the store (e.g. fully trimmed); forget it
    dout(20) << __func__
             << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
           << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // the existence check above succeeded, so a read failure here means
    // the store is in a bad state; don't limp along
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
           << osdmap_manifest.get_first_pinned()
           << " .. "
           << osdmap_manifest.get_last_pinned()
           << ")"
           << dendl;
}
2291
// Decide whether a full-osdmap prune pass should run now, based on how many
// committed epochs we hold versus the configured prune thresholds and how
// far the manifest has already pinned.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never prune into the most recent min_osdmap_epochs maps
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // an earlier prune already pinned up to (or past) the prunable range
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // less than one full prune interval is available past the last pin
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2351
2352 void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx,
2354 version_t first)
2355 {
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest.get_last_pinned()
2360 << dendl;
2361
2362 osdmap_manifest_t manifest = osdmap_manifest;
2363
2364 if (!manifest.is_pinned(first)) {
2365 manifest.pin(first);
2366 }
2367
2368 set<version_t>::iterator p_end = manifest.pinned.find(first);
2369 set<version_t>::iterator p = manifest.pinned.begin();
2370 manifest.pinned.erase(p, p_end);
2371 ceph_assert(manifest.get_first_pinned() == first);
2372
2373 if (manifest.get_last_pinned() == first+1 ||
2374 manifest.pinned.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2377 // again.
2378 tx->erase(get_service_name(), "osdmap_manifest");
2379 return;
2380 }
2381
2382 bufferlist bl;
2383 manifest.encode(bl);
2384 tx->put(get_service_name(), "osdmap_manifest", bl);
2385 }
2386
// Initialize a prune pass: validate the in-memory/on-disk manifest
// invariants and seed `manifest` with the first pin — either the first
// committed version (fresh prune) or the last previously pinned version
// (resuming an earlier prune).
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
             << " first_pinned " << osdmap_manifest.get_first_pinned()
             << " last_pinned " << osdmap_manifest.get_last_pinned()
             << dendl;

    // resume where the previous prune stopped
    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2421
2422 bool OSDMonitor::_prune_sanitize_options() const
2423 {
2424 uint64_t prune_interval =
2425 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min =
2427 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2428 uint64_t txsize =
2429 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2430
2431 bool r = true;
2432
2433 if (prune_interval == 0) {
2434 derr << __func__
2435 << " prune is enabled BUT prune interval is zero; abort."
2436 << dendl;
2437 r = false;
2438 } else if (prune_interval == 1) {
2439 derr << __func__
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2442 << dendl;
2443 r = false;
2444 }
2445 if (prune_min == 0) {
2446 derr << __func__
2447 << " prune is enabled BUT prune min is zero; abort."
2448 << dendl;
2449 r = false;
2450 }
2451 if (prune_interval > prune_min) {
2452 derr << __func__
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2456 << dendl;
2457 r = false;
2458 }
2459
2460 if (txsize < prune_interval - 1) {
2461 derr << __func__
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2464 << "); abort." << dendl;
2465 r = false;
2466 }
2467 return r;
2468 }
2469
2470 bool OSDMonitor::is_prune_enabled() const {
2471 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2472 }
2473
2474 bool OSDMonitor::is_prune_supported() const {
2475 return mon->get_required_mon_features().contains_any(
2476 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2477 }
2478
/** do_prune
 *
 * Run one pass of full-osdmap pruning: pin maps every `prune_interval`
 * epochs and erase the full maps in between, up to `txsize` removals,
 * then persist the updated manifest in `tx`.
 *
 * See doc/dev/mon-osdmap-prune.rst for the overall design.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  // bail out unless pruning is on, options are sane, and thresholds are met
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  // work on a copy; osdmap_manifest itself is only updated via the store
  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
          << " lc (" << first << " .. " << last << ")"
          << " last_pinned " << last_pinned
          << " interval " << prune_interval
          << " last_to_pin " << last_to_pin
          << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon->store->combine_strings("full", v);
    return mon->store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
            << " setting txsize to removal interval size ("
            << removal_interval << " versions"
            << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // not enough room past the last pin for one more full interval
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
             << " last_pinned " << last_pinned
             << " next_pinned " << next_pinned
             << " num_pruned " << num_pruned
             << " removal interval (" << (last_pinned+1)
             << ".." << (next_pinned-1) << ")"
             << " txsize " << txsize << dendl;

    // both interval endpoints must still exist as full maps in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon->store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval was available
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2607
2608
2609 // -------------
2610
// Paxos read-path entry point: handle read-only requests (and requests
// that turn out to need no update) directly.  Returns true if the message
// was fully handled here; false to forward it to prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply -EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates: each preprocess_* may answer stale/duplicate requests
    // without going through a paxos proposal
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // dispatcher should never route other message types here
    ceph_abort();
    return true;
  }
}
2666
// Paxos write-path entry point: apply state-changing requests to
// pending_inc.  Returns true if the change should be proposed; messages
// reach here only after preprocess_query() returned false.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply -EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // dispatcher should never route other message types here
    ceph_abort();
  }

  return false;
}
2718
2719 bool OSDMonitor::should_propose(double& delay)
2720 {
2721 dout(10) << "should_propose" << dendl;
2722
2723 // if full map, propose immediately! any subsequent changes will be clobbered.
2724 if (pending_inc.fullmap.length())
2725 return true;
2726
2727 // adjust osd weights?
2728 if (!osd_weight.empty() &&
2729 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2730 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2731 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2732 delay = 0.0;
2733 osd_weight.clear();
2734 return true;
2735 }
2736
2737 return PaxosService::should_propose(delay);
2738 }
2739
2740
2741
2742 // ---------------------------
2743 // READs
2744
// Answer an MMonGetOSDMap: reply with the requested full and incremental
// maps, bounded by both a message-count and a byte budget so a single
// reply cannot grow unbounded.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode maps with the peer's features; fall back to quorum features
  // for sessions without recorded connection features
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // shared budgets across both loops: stop at `max` maps or `max_bytes`
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps for the requested range, clamped to what we hold
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incremental maps, same clamping and budgets
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer our overall range so it can request what's missing
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2781
2782
2783 // ---------------------------
2784 // UPDATEs
2785
2786 // failure --
2787
2788 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2789 // check permissions
2790 MonSession *session = op->get_session();
2791 if (!session)
2792 return true;
2793 if (!session->is_capable("osd", MON_CAP_X)) {
2794 dout(0) << "got MOSDFailure from entity with insufficient caps "
2795 << session->caps << dendl;
2796 return true;
2797 }
2798 if (fsid != mon->monmap->fsid) {
2799 dout(0) << "check_source: on fsid " << fsid
2800 << " != " << mon->monmap->fsid << dendl;
2801 return true;
2802 }
2803 return false;
2804 }
2805
2806
// Vet an MOSDFailure report.  Returns true when the message is handled
// here (invalid, stale, or duplicate — possibly after sending the reporter
// newer maps); returns false when the failure is new and must go through
// prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    // the reporter must exist, match the address we have on file, and be
    // up (unless it is reporting a failure recovery rather than a failure)
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      // bring the stale reporter up to date
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird? target already down — nothing to do
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // the reported address must match the map's, else the report refers to
  // an older incarnation of this osd id
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported? the osd came back up after the reporter's epoch
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // honour nodown and similar restrictions
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2878
// Completion context used for MOSDMarkMeDown requests that asked for an
// ack: once the proposal commits, echo the mark-me-down message back to
// the osd as the acknowledgement.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // committed: reply with an equivalent MOSDMarkMeDown carrying
      // request_ack=false so the ack itself is not acked in turn
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false)); // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // proposal did not go through (e.g. interrupted); retry the op
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2907
// Vet an MOSDMarkMeDown request.  Returns true when the request is handled
// here (invalid source, stale, or blocked by nodown) — acking immediately
// if an ack was requested; returns false to proceed to
// prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the osd must exist, be up, and match the address we have on file
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    // bring the stale sender up to date
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even though we're dropping the request, honour the ack so the osd
  // does not wait forever
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2946
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  // Queue the self-requested down state in the pending increment.
  // Returns true so a proposal is triggered; the ack (if requested) is
  // sent by C_AckMarkedDown once the proposal commits.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() guarantees these still hold
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  // new_state bits are XORed into the osd's state (see prepare_boot), so
  // CEPH_OSD_UP here flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
2962
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  // Read-only vetting of an osd's "I am dead" declaration.  Returns true
  // if handled (dropped) here, false to pass on to prepare_mark_me_dead().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon->no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon->no_reply(op);
    return true;
  }

  // an osd may only declare itself dead if the map already shows it down;
  // otherwise bring the sender up to date and drop the request
  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
	    << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon->no_reply(op);
    return true;
  }

  return false;
}
2992
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  // Record the osd's self-declared dead_epoch in the pending xinfo.
  // Returns true to trigger a proposal; on success no reply is sent.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() only lets through osds that are down
  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // copy-on-write the xinfo entry into the pending increment before editing
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
    ));
  return true;
}
3018
3019 bool OSDMonitor::can_mark_down(int i)
3020 {
3021 if (osdmap.is_nodown(i)) {
3022 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3023 << "will not mark it down" << dendl;
3024 return false;
3025 }
3026
3027 int num_osds = osdmap.get_num_osds();
3028 if (num_osds == 0) {
3029 dout(5) << __func__ << " no osds" << dendl;
3030 return false;
3031 }
3032 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3033 float up_ratio = (float)up / (float)num_osds;
3034 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3035 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3036 << g_conf()->mon_osd_min_up_ratio
3037 << ", will not mark osd." << i << " down" << dendl;
3038 return false;
3039 }
3040 return true;
3041 }
3042
3043 bool OSDMonitor::can_mark_up(int i)
3044 {
3045 if (osdmap.is_noup(i)) {
3046 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3047 << "will not mark it up" << dendl;
3048 return false;
3049 }
3050
3051 return true;
3052 }
3053
3054 /**
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3057 */
3058 bool OSDMonitor::can_mark_out(int i)
3059 {
3060 if (osdmap.is_noout(i)) {
3061 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3062 << "will not mark it out" << dendl;
3063 return false;
3064 }
3065
3066 int num_osds = osdmap.get_num_osds();
3067 if (num_osds == 0) {
3068 dout(5) << __func__ << " no osds" << dendl;
3069 return false;
3070 }
3071 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3072 float in_ratio = (float)in / (float)num_osds;
3073 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3074 if (i >= 0)
3075 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3076 << g_conf()->mon_osd_min_in_ratio
3077 << ", will not mark osd." << i << " out" << dendl;
3078 else
3079 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3080 << g_conf()->mon_osd_min_in_ratio
3081 << ", will not mark osds out" << dendl;
3082 return false;
3083 }
3084
3085 return true;
3086 }
3087
3088 bool OSDMonitor::can_mark_in(int i)
3089 {
3090 if (osdmap.is_noin(i)) {
3091 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3092 << "will not mark it in" << dendl;
3093 return false;
3094 }
3095
3096 return true;
3097 }
3098
3099 bool OSDMonitor::check_failures(utime_t now)
3100 {
3101 bool found_failure = false;
3102 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3103 p != failure_info.end();
3104 ++p) {
3105 if (can_mark_down(p->first)) {
3106 found_failure |= check_failure(now, p->first, p->second);
3107 }
3108 }
3109 return found_failure;
3110 }
3111
3112 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3113 {
3114 // already pending failure?
3115 if (pending_inc.new_state.count(target_osd) &&
3116 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3117 dout(10) << " already pending failure" << dendl;
3118 return true;
3119 }
3120
3121 set<string> reporters_by_subtree;
3122 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3123 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3124 utime_t max_failed_since = fi.get_failed_since();
3125 utime_t failed_for = now - max_failed_since;
3126
3127 utime_t grace = orig_grace;
3128 double my_grace = 0, peer_grace = 0;
3129 double decay_k = 0;
3130 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3131 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3132 decay_k = ::log(.5) / halflife;
3133
3134 // scale grace period based on historical probability of 'lagginess'
3135 // (false positive failures due to slowness).
3136 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3137 double decay = exp((double)failed_for * decay_k);
3138 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3139 << " failed_for " << failed_for << " decay " << decay << dendl;
3140 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3141 grace += my_grace;
3142 }
3143
3144 // consider the peers reporting a failure a proxy for a potential
3145 // 'subcluster' over the overall cluster that is similarly
3146 // laggy. this is clearly not true in all cases, but will sometimes
3147 // help us localize the grace correction to a subset of the system
3148 // (say, a rack with a bad switch) that is unhappy.
3149 ceph_assert(fi.reporters.size());
3150 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3151 // get the parent bucket whose type matches with "reporter_subtree_level".
3152 // fall back to OSD if the level doesn't exist.
3153 if (osdmap.exists(p->first)) {
3154 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3155 if (auto iter = reporter_loc.find(reporter_subtree_level);
3156 iter == reporter_loc.end()) {
3157 reporters_by_subtree.insert("osd." + to_string(p->first));
3158 } else {
3159 reporters_by_subtree.insert(iter->second);
3160 }
3161 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3162 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3163 utime_t elapsed = now - xi.down_stamp;
3164 double decay = exp((double)elapsed * decay_k);
3165 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3166 }
3167 ++p;
3168 } else {
3169 fi.cancel_report(p->first);;
3170 p = fi.reporters.erase(p);
3171 }
3172 }
3173
3174 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3175 peer_grace /= (double)fi.reporters.size();
3176 grace += peer_grace;
3177 }
3178
3179 dout(10) << " osd." << target_osd << " has "
3180 << fi.reporters.size() << " reporters, "
3181 << grace << " grace (" << orig_grace << " + " << my_grace
3182 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3183 << dendl;
3184
3185 if (failed_for >= grace &&
3186 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3187 dout(1) << " we have enough reporters to mark osd." << target_osd
3188 << " down" << dendl;
3189 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3190
3191 mon->clog->info() << "osd." << target_osd << " failed ("
3192 << osdmap.crush->get_full_location_ordered_string(
3193 target_osd)
3194 << ") ("
3195 << (int)reporters_by_subtree.size()
3196 << " reporters from different "
3197 << reporter_subtree_level << " after "
3198 << failed_for << " >= grace " << grace << ")";
3199 return true;
3200 }
3201 return false;
3202 }
3203
void OSDMonitor::force_failure(int target_osd, int by)
{
  // Unconditionally queue target_osd as failed (used for "immediate"
  // failure reports such as connection refused), bypassing the grace and
  // reporter-count heuristics in check_failure().  'by' is the reporting
  // osd, used only for the cluster log message.
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // new_state bits are XORed into the osd state, so CEPH_OSD_UP marks down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // copy-on-write the xinfo entry so we can record the dead epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon->clog->info() << "osd." << target_osd << " failed ("
		    << osdmap.crush->get_full_location_ordered_string(target_osd)
		    << ") (connection refused reported by osd." << by << ")";
  return;
}
3225
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  // Handle a failure report (or its cancellation) from a peer osd.
  // Returns true if the pending map was changed and a proposal is needed.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocessing guarantees the target is still the live instance
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // add_report returns any previous op from the same reporter, which we
    // must release so it does not leak in the op tracker
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3292
3293 void OSDMonitor::process_failures()
3294 {
3295 map<int,failure_info_t>::iterator p = failure_info.begin();
3296 while (p != failure_info.end()) {
3297 if (osdmap.is_up(p->first)) {
3298 ++p;
3299 } else {
3300 dout(10) << "process_failures osd." << p->first << dendl;
3301 list<MonOpRequestRef> ls;
3302 p->second.take_report_messages(ls);
3303 failure_info.erase(p++);
3304
3305 while (!ls.empty()) {
3306 MonOpRequestRef o = ls.front();
3307 if (o) {
3308 o->mark_event(__func__);
3309 MOSDFailure *m = o->get_req<MOSDFailure>();
3310 send_latest(o, m->get_epoch());
3311 mon->no_reply(o);
3312 }
3313 ls.pop_front();
3314 }
3315 }
3316 }
3317 }
3318
3319 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3320 {
3321 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3322
3323 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3324 p != failure_info.end();
3325 ++p) {
3326 p->second.take_report_messages(ls);
3327 }
3328 failure_info.clear();
3329 }
3330
3331 int OSDMonitor::get_grace_interval_threshold()
3332 {
3333 int halflife = g_conf()->mon_osd_laggy_halflife;
3334 // Scale the halflife period (default: 1_hr) by
3335 // a factor (48) to calculate the threshold.
3336 int grace_threshold_factor = 48;
3337 return halflife * grace_threshold_factor;
3338 }
3339
3340 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3341 {
3342 int grace_interval_threshold_secs = get_grace_interval_threshold();
3343 if (last_failed_interval > grace_interval_threshold_secs) {
3344 dout(1) << " last_failed_interval " << last_failed_interval
3345 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3346 << dendl;
3347 return true;
3348 }
3349 return false;
3350 }
3351
3352 void OSDMonitor::set_default_laggy_params(int target_osd)
3353 {
3354 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3355 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3356 }
3357 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3358 xi.down_stamp = pending_inc.modified;
3359 xi.laggy_probability = 0.0;
3360 xi.laggy_interval = 0;
3361 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3362 }
3363
3364
3365 // boot --
3366
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  // Read-only vetting of an osd boot message.  Returns true if the boot
  // was fully handled here (ignored, or answered as a duplicate); false
  // to forward it to prepare_boot().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // an existing osd slot with a different uuid cannot be reused
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up interval?
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3489
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  // Apply a vetted osd boot to the pending map: mark a stale prior
  // instance down first if needed, otherwise record the new instance's
  // addrs, uuid, weight, metadata, clean interval and laggy stats.
  // Always returns true (a proposal is needed, or we wait on one).
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once pending changes apply (new_state bits are XORed in)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the down state commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: decay them on a clean boot (boot_epoch == 0),
    // otherwise fold in the observed down interval
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval =  g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight it had before it was auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3644
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  // Finish a boot: optionally log it to the cluster log, then send the
  // osd the maps from its current epoch onward.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << "_booted " << m->get_orig_source_inst()
	  << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
		      << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
3659
3660
3661 // -------------
3662 // full
3663
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  // Read-only vetting of an osd's full/backfillfull/nearfull state report.
  // Returns true if handled here (ignored or already in sync), false to
  // forward to prepare_full().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these bits may be toggled by MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // if the committed map already reflects the requested state, just reply
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3714
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  // Queue the osd's requested full/backfillfull/nearfull bits in the
  // pending map and reply once the proposal commits.  Always returns true.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // compute the effective state after pending changes (new_state is XORed)
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // clear any pending toggles of these bits, then set exactly the XOR
    // needed to go from the committed state to the wanted state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3752
3753 // -------------
3754 // alive
3755
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  // Read-only vetting of an osd's up_thru (alive) request.  Returns true
  // if handled here (ignored or already satisfied), false to forward it
  // to prepare_alive().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // the sender must still be the live instance of that osd
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // up_thru already at or past the wanted epoch? just reply with the map
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3794
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  // Queue an up_thru update for the osd and reply with the map once the
  // proposal commits.  Always returns true.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  if (0) {  // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3812
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  // Answer op by sending the requester the maps from epoch e onward.
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3821
3822 // pg_created
// pg_created
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  // Read-only vetting of a "pg created" notification.  Returns true if
  // dropped here (no session / insufficient caps); false so the message
  // is forwarded to the leader's prepare_pg_created().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  mon->no_reply(op);  // never replied to, even when accepted
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3842
bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
{
  // Record that a pg finished creating.  Returns true (proposal needed)
  // after queueing the pgid; false if the sender is not a live osd.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto src = m->get_orig_source();
  auto from = src.num();
  // only accept from the currently-up instance of an osd
  if (!src.is_osd() ||
      !mon->osdmon()->osdmap.is_up(from) ||
      !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
    return false;
  }
  pending_created_pgs.push_back(m->pgid);
  return true;
}
3860
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  // Read-only vetting of a "pg ready to merge" notification.  Returns
  // true if dropped here (bad caps, unknown pool, or the pgid no longer
  // matches the pool's pg_num/pg_num_pending state); false to forward it
  // to prepare_pg_ready_to_merge().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge source must be exactly the last pg of the pool
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // and a merge for it must actually be pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3900
// Apply an MOSDPGReadyToMerge to the pending pool state: either commit the
// pg_num decrement (merge) or back the attempt off. Always returns true
// (a pending change or a deferred retry is queued).
3901 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
3902 {
3903 op->mark_osdmon_event(__func__);
3904 auto m = op->get_req<MOSDPGReadyToMerge>();
3905 dout(10) << __func__ << " " << *m << dendl;
3906 pg_pool_t p;
// Start from the pending pool if one is already queued this epoch, else the
// committed pool, so we re-validate against the most recent view.
3907 if (pending_inc.new_pools.count(m->pgid.pool()))
3908 p = pending_inc.new_pools[m->pgid.pool()];
3909 else
3910 p = *osdmap.get_pg_pool(m->pgid.pool());
// The preprocess checks ran against the committed map; a concurrent pending
// change may have invalidated them — retry after the current proposal lands.
3911 if (p.get_pg_num() != m->pgid.ps() + 1 ||
3912 p.get_pg_num_pending() > m->pgid.ps()) {
3913 dout(10) << __func__
3914 << " race with concurrent pg_num[_pending] update, will retry"
3915 << dendl;
3916 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
3917 return true;
3918 }
3919
3920 if (m->ready) {
3921 p.dec_pg_num(m->pgid,
3922 pending_inc.epoch,
3923 m->source_version,
3924 m->target_version,
3925 m->last_epoch_started,
3926 m->last_epoch_clean);
3927 p.last_change = pending_inc.epoch;
3928 } else {
3929 // back off the merge attempt!
3930 p.set_pg_num_pending(p.get_pg_num());
3931 }
3932
3933 // force pre-nautilus clients to resend their ops, since they
3934 // don't understand pg_num_pending changes form a new interval
3935 p.last_force_op_resend_prenautilus = pending_inc.epoch;
3936
3937 pending_inc.new_pools[m->pgid.pool()] = p;
3938
// Test hook: with configured probability, bounce pg_num back up via a
// self-directed "osd pool set pg_num_actual" command to exercise merge races.
3939 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
3940 if (m->ready &&
3941 prob > 0 &&
3942 prob > (double)(rand() % 1000)/1000.0) {
3943 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
3944 auto n = new MMonCommand(mon->monmap->get_fsid());
3945 n->set_connection(m->get_connection());
3946 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
3947 osdmap.get_pool_name(m->pgid.pool()) +
3948 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
3949 stringify(m->pgid.ps() + 1) + "\"}" };
3950 MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
3951 nop->set_type_service();
3952 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
3953 } else {
3954 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3955 }
3956 return true;
3957 }
3958
3959
3960 // -------------
3961 // pg_temp changes
3962
// Fast-path filter for MOSDPGTemp: drop messages from unauthorized or
// down/stale senders, and reply immediately when every requested pg_temp
// entry is either already in effect or obsolete. Returns false when at
// least one entry needs a map change (handled by prepare_pgtemp).
3963 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
3964 {
3965 auto m = op->get_req<MOSDPGTemp>();
3966 dout(10) << "preprocess_pgtemp " << *m << dendl;
3967 mempool::osdmap::vector<int> empty;
3968 int from = m->get_orig_source().num();
3969 size_t ignore_cnt = 0;
3970
3971 // check caps
3972 MonSession *session = op->get_session();
3973 if (!session)
3974 goto ignore;
3975 if (!session->is_capable("osd", MON_CAP_X)) {
3976 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3977 << session->caps << dendl;
3978 goto ignore;
3979 }
3980
// Sender must be an up OSD at the address the osdmap knows about.
3981 if (!osdmap.is_up(from) ||
3982 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3983 dout(7) << "ignoring pgtemp message from down "
3984 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3985 << dendl;
3986 goto ignore;
3987 }
3988
// Forced updates bypass the no-op filtering below and always go to prepare.
3989 if (m->forced) {
3990 return false;
3991 }
3992
3993 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
3994 dout(20) << " " << p->first
3995 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
3996 << " -> " << p->second << dendl;
3997
3998 // does the pool exist?
3999 if (!osdmap.have_pg_pool(p->first.pool())) {
4000 /*
4001 * 1. If the osdmap does not have the pool, it means the pool has been
4002 * removed in-between the osd sending this message and us handling it.
4003 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4004 * not exist in the pending either, as the osds would not send a
4005 * message about a pool they know nothing about (yet).
4006 * 3. However, if the pool does exist in the pending, then it must be a
4007 * new pool, and not relevant to this message (see 1).
4008 */
4009 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4010 << ": pool has been removed" << dendl;
4011 ignore_cnt++;
4012 continue;
4013 }
4014
4015 int acting_primary = -1;
4016 osdmap.pg_to_up_acting_osds(
4017 p->first, nullptr, nullptr, nullptr, &acting_primary);
4018 if (acting_primary != from) {
4019 /* If the source isn't the primary based on the current osdmap, we know
4020 * that the interval changed and that we can discard this message.
4021 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4022 * which of two pg temp mappings on the same pg is more recent.
4023 */
4024 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4025 << ": primary has changed" << dendl;
4026 ignore_cnt++;
4027 continue;
4028 }
4029
4030 // removal?
4031 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4032 osdmap.primary_temp->count(p->first)))
4033 return false;
4034 // change?
4035 // NOTE: we assume that this will clear pg_primary, so consider
4036 // an existing pg_primary field to imply a change
4037 if (p->second.size() &&
4038 (osdmap.pg_temp->count(p->first) == 0 ||
4039 osdmap.pg_temp->get(p->first) != p->second ||
4040 osdmap.primary_temp->count(p->first)))
4041 return false;
4042 }
4043
4044 // should we ignore all the pgs?
4045 if (ignore_cnt == m->pg_temp.size())
4046 goto ignore;
4047
// Nothing to change; ack with the current map epoch so the OSD stops asking.
4048 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4049 _reply_map(op, m->map_epoch);
4050 return true;
4051
4052 ignore:
4053 mon->no_reply(op);
4054 return true;
4055 }
4056
// Queue a pending up_thru bump for osd.<from> if 'up_thru' is newer than both
// the committed value and any already-pending value, so the osd doesn't have
// to request it separately.
4057 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4058 {
4059 epoch_t old_up_thru = osdmap.get_up_thru(from);
// A pending update supersedes the committed map's value.
4060 auto ut = pending_inc.new_up_thru.find(from);
4061 if (ut != pending_inc.new_up_thru.end()) {
4062 old_up_thru = ut->second;
4063 }
4064 if (up_thru > old_up_thru) {
4065 // set up_thru too, so the osd doesn't have to ask again
4066 pending_inc.new_up_thru[from] = up_thru;
4067 }
4068 }
4069
// Apply an MOSDPGTemp to the pending incremental: record the requested
// pg_temp mappings (skipping pools that are gone or going away), clear any
// primary_temp for those pgs, and bump the sender's up_thru. Replies with
// the map once the proposal commits.
4070 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4071 {
4072 op->mark_osdmon_event(__func__);
4073 auto m = op->get_req<MOSDPGTemp>();
4074 int from = m->get_orig_source().num();
4075 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4076 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4077 uint64_t pool = p->first.pool();
// Skip pools queued for deletion in this same pending increment.
4078 if (pending_inc.old_pools.count(pool)) {
4079 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4080 << ": pool pending removal" << dendl;
4081 continue;
4082 }
4083 if (!osdmap.have_pg_pool(pool)) {
4084 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4085 << ": pool has been removed" << dendl;
4086 continue;
4087 }
4088 pending_inc.new_pg_temp[p->first] =
4089 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4090
4091 // unconditionally clear pg_primary (until this message can encode
4092 // a change for that, too.. at which point we need to also fix
4093 // preprocess_pg_temp)
4094 if (osdmap.primary_temp->count(p->first) ||
4095 pending_inc.new_primary_temp.count(p->first))
4096 pending_inc.new_primary_temp[p->first] = -1;
4097 }
4098
4099 // set up_thru too, so the osd doesn't have to ask again
4100 update_up_thru(from, m->map_epoch);
4101
4102 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4103 return true;
4104 }
4105
4106
4107 // ---
4108
// Fast-path filter for MRemoveSnaps: check caps, and if every listed snap is
// already removed (or the pool is gone), handle the message here — replying
// only to octopus+ peers, which expect an MRemoveSnaps echo. Returns false
// when some snap still needs to be queued for removal (prepare path).
4109 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4110 {
4111 op->mark_osdmon_event(__func__);
4112 auto m = op->get_req<MRemoveSnaps>();
4113 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4114
4115 // check privilege, ignore if failed
4116 MonSession *session = op->get_session();
// Default to no reply; an explicit reply is sent below only for octopus+.
4117 mon->no_reply(op);
4118 if (!session)
4119 goto ignore;
4120 if (!session->caps.is_capable(
4121 cct,
4122 session->entity_name,
4123 "osd", "osd pool rmsnap", {}, true, true, false,
4124 session->get_peer_socket_addr())) {
4125 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4126 << session->caps << dendl;
4127 goto ignore;
4128 }
4129
4130 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4131 q != m->snaps.end();
4132 ++q) {
4133 if (!osdmap.have_pg_pool(q->first)) {
4134 dout(10) << " ignoring removed_snaps " << q->second
4135 << " on non-existent pool " << q->first << dendl;
4136 continue;
4137 }
4138 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4139 for (vector<snapid_t>::iterator p = q->second.begin();
4140 p != q->second.end();
4141 ++p) {
// A snap beyond snap_seq or not yet marked removed needs a real update.
4142 if (*p > pi->get_snap_seq() ||
4143 !_is_removed_snap(q->first, *p)) {
4144 return false;
4145 }
4146 }
4147 }
4148
// Everything already removed: octopus+ senders still expect an ack.
4149 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4150 auto reply = make_message<MRemoveSnaps>();
4151 reply->snaps = m->snaps;
4152 mon->send_reply(op, reply.detach());
4153 }
4154
4155 ignore:
4156 return true;
4157 }
4158
// Apply an MRemoveSnaps to the pending incremental: queue each not-yet-removed
// snap for deletion (pre-octopus maps also track it in the pool's
// removed_snaps set), advance snap_seq/snap_epoch, and ack octopus+ senders
// once the proposal commits.
4159 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4160 {
4161 op->mark_osdmon_event(__func__);
4162 auto m = op->get_req<MRemoveSnaps>();
4163 dout(7) << "prepare_remove_snaps " << *m << dendl;
4164
4165 for (auto& [pool, snaps] : m->snaps) {
4166 if (!osdmap.have_pg_pool(pool)) {
4167 dout(10) << " ignoring removed_snaps " << snaps
4168 << " on non-existent pool " << pool << dendl;
4169 continue;
4170 }
4171
4172 pg_pool_t& pi = osdmap.pools[pool];
4173 for (auto s : snaps) {
// Skip snaps already removed in the committed map OR already queued in the
// pending pool/removed-snaps state, so each snap is processed exactly once.
4174 if (!_is_removed_snap(pool, s) &&
4175 (!pending_inc.new_pools.count(pool) ||
4176 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4177 (!pending_inc.new_removed_snaps.count(pool) ||
4178 !pending_inc.new_removed_snaps[pool].contains(s))) {
4179 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
// Pre-octopus OSDs consume the per-pool removed_snaps interval set.
4180 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4181 newpi->removed_snaps.insert(s);
4182 dout(10) << " pool " << pool << " removed_snaps added " << s
4183 << " (now " << newpi->removed_snaps << ")" << dendl;
4184 }
4185 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
4186 if (s > newpi->get_snap_seq()) {
4187 dout(10) << " pool " << pool << " snap_seq "
4188 << newpi->get_snap_seq() << " -> " << s << dendl;
4189 newpi->set_snap_seq(s);
4190 }
4191 newpi->set_snap_epoch(pending_inc.epoch);
4192 dout(10) << " added pool " << pool << " snap " << s
4193 << " to removed_snaps queue" << dendl;
4194 pending_inc.new_removed_snaps[pool].insert(s);
4195 }
4196 }
4197 }
4198
// Octopus+ peers expect an MRemoveSnaps echo after the map change commits.
4199 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4200 auto reply = make_message<MRemoveSnaps>();
4201 reply->snaps = m->snaps;
4202 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4203 }
4204
4205 return true;
4206 }
4207
// Answer an MMonGetPurgedSnaps query directly from the mon store: scan the
// "purged_epoch_<hex>" keys in (m->start, m->last] and return the per-epoch
// purged-snap interval sets. Read-only; always fully handled here.
4208 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4209 {
4210 op->mark_osdmon_event(__func__);
4211 auto m = op->get_req<MMonGetPurgedSnaps>();
4212 dout(7) << __func__ << " " << *m << dendl;
4213
4214 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4215
4216 string k = make_purged_snap_epoch_key(m->start);
4217 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4218 it->upper_bound(k);
// Tracks the highest epoch actually returned; reported back in the reply.
4219 unsigned long epoch = m->last;
4220 while (it->valid()) {
// Stop once we leave the purged_epoch_* key range.
4221 if (it->key().find("purged_epoch_") != 0) {
4222 break;
4223 }
// NOTE(review): this 'k' shadows the outer 'k' above — harmless but
// confusing; the outer key is only needed for the initial upper_bound.
4224 string k = it->key();
4225 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4226 if (n != 1) {
4227 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4228 } else if (epoch > m->last) {
4229 break;
4230 } else {
4231 bufferlist bl = it->value();
4232 auto p = bl.cbegin();
4233 auto &v = r[epoch];
4234 try {
4235 ceph::decode(v, p);
4236 } catch (buffer::error& e) {
4237 derr << __func__ << " unable to parse value for key '" << it->key()
4238 << "': \n";
4239 bl.hexdump(*_dout);
4240 *_dout << dendl;
4241 }
// Reuse sscanf's return as a rough size estimate for this entry.
4242 n += 4 + v.size() * 16;
4243 }
// NOTE(review): 'n' is re-initialized each iteration, so this ~1 MiB cap
// bounds a single entry's estimated size, not the cumulative reply size —
// confirm whether a cumulative limit was intended.
4244 if (n > 1048576) {
4245 // impose a semi-arbitrary limit to message size
4246 break;
4247 }
4248 it->next();
4249 }
4250
4251 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4252 reply->purged_snaps.swap(r);
4253 mon->send_reply(op, reply.detach());
4254
4255 return true;
4256 }
4257
4258 // osd beacon
// Fast-path check for OSD beacons: drop unauthenticated/unprivileged senders,
// otherwise always forward to the leader (return false) so it can track
// beacon liveness.
4259 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4260 {
4261 op->mark_osdmon_event(__func__);
4262 // check caps
4263 auto session = op->get_session();
// Beacons never get a direct reply.
4264 mon->no_reply(op);
4265 if (!session) {
4266 dout(10) << __func__ << " no monitor session!" << dendl;
4267 return true;
4268 }
4269 if (!session->is_capable("osd", MON_CAP_X)) {
4270 derr << __func__ << " received from entity "
4271 << "with insufficient privileges " << session->caps << dendl;
4272 return true;
4273 }
4274 // Always forward the beacon to the leader, even if they are the same as
4275 // the old one. The leader will mark as down osds that haven't sent
4276 // beacon for a few minutes.
4277 return false;
4278 }
4279
// Process an OSD beacon on the leader: refresh the sender's last-report time
// and epoch, feed per-pg last-epoch-clean data, and (only if the beacon
// advances last_purged_snaps_scrub) queue an osd_xinfo update. Returns true
// only when pending state changed.
4280 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4281 {
4282 op->mark_osdmon_event(__func__);
4283 const auto beacon = op->get_req<MOSDBeacon>();
4284 const auto src = beacon->get_orig_source();
4285 dout(10) << __func__ << " " << *beacon
4286 << " from " << src << dendl;
4287 int from = src.num();
4288
// Only accept beacons from up OSDs at their registered address.
4289 if (!src.is_osd() ||
4290 !osdmap.is_up(from) ||
4291 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4292 if (src.is_osd() && !osdmap.is_up(from)) {
4293 // share some new maps with this guy in case it may not be
4294 // aware of its own deadness...
4295 send_latest(op, beacon->version+1);
4296 }
4297 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
4298 return false;
4299 }
4300
// Liveness bookkeeping used by the leader to mark silent OSDs down.
4301 last_osd_report[from] = ceph_clock_now();
4302 osd_epochs[from] = beacon->version;
4303
4304 for (const auto& pg : beacon->pgs) {
4305 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
4306 }
4307
4308 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4309 beacon->last_purged_snaps_scrub) {
4310 if (pending_inc.new_xinfo.count(from) == 0) {
4311 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4312 }
4313 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4314 beacon->last_purged_snaps_scrub;
4315 return true;
4316 } else {
4317 return false;
4318 }
4319 }
4320
4321 // ---------------
4322 // map helpers
4323
// Send osdmaps to the requester: the full current map when start == 0,
// otherwise incrementals from 'start' up to the current epoch.
4324 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4325 {
4326 op->mark_osdmon_event(__func__);
4327 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4328 << " start " << start << dendl;
4329 if (start == 0)
4330 send_full(op);
4331 else
4332 send_incremental(op, start);
4333 }
4334
4335
// Build an MOSDMap carrying the current full osdmap, encoded for the given
// peer feature set. Caller owns the returned message.
4336 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4337 {
4338 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4339 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
// Advertise the range of epochs this monitor can still serve.
4340 r->oldest_map = get_first_committed();
4341 r->newest_map = osdmap.get_epoch();
4342 return r;
4343 }
4344
// Build an MOSDMap containing incremental maps for epochs [from, to],
// substituting a full map for any epoch whose incremental is no longer
// stored. Caller owns the returned message.
4345 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
4346 {
4347 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4348 << std::hex << features << std::dec << dendl;
4349 MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
4350 m->oldest_map = get_first_committed();
4351 m->newest_map = osdmap.get_epoch();
4352
// Walk newest-to-oldest; 'e > 0' also guards against epoch_t underflow.
4353 for (epoch_t e = to; e >= from && e > 0; e--) {
4354 bufferlist bl;
4355 int err = get_version(e, features, bl);
4356 if (err == 0) {
4357 ceph_assert(bl.length());
4358 // if (get_version(e, bl) > 0) {
4359 dout(20) << "build_incremental inc " << e << " "
4360 << bl.length() << " bytes" << dendl;
4361 m->incremental_maps[e] = bl;
4362 } else {
// Incremental missing (trimmed): fall back to the full map for this epoch.
4363 ceph_assert(err == -ENOENT);
4364 ceph_assert(!bl.length());
4365 get_version_full(e, features, bl);
4366 if (bl.length() > 0) {
4367 //else if (get_version("full", e, bl) > 0) {
4368 dout(20) << "build_incremental full " << e << " "
4369 << bl.length() << " bytes" << dendl;
4370 m->maps[e] = bl;
4371 } else {
4372 ceph_abort(); // we should have all maps.
4373 }
4374 }
4375 }
4376 return m;
4377 }
4378
// Reply to the request with the current full osdmap, encoded for the
// requester's connection features.
4379 void OSDMonitor::send_full(MonOpRequestRef op)
4380 {
4381 op->mark_osdmon_event(__func__);
4382 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4383 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
4384 }
4385
// Send incremental maps from 'first' for a routed or direct request. If the
// session is proxied through another monitor, delegate the actual map send
// to that monitor via MRoute instead of sending from here.
4386 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4387 {
4388 op->mark_osdmon_event(__func__);
4389
4390 MonSession *s = op->get_session();
4391 ceph_assert(s);
4392
4393 if (s->proxy_con) {
4394 // oh, we can tell the other mon to do it
4395 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4396 << first << dendl;
4397 MRoute *r = new MRoute(s->proxy_tid, NULL);
4398 r->send_osdmap_first = first;
4399 s->proxy_con->send_message(r);
4400 op->mark_event("reply: send routed send_osdmap_first reply");
4401 } else {
4402 // do it ourselves
4403 send_incremental(first, s, false, op);
4404 }
4405 }
4406
// Send maps [first, current] to a session, tracking session->osd_epoch so we
// never resend epochs the peer already has. If 'first' predates the oldest
// committed epoch, a full map is sent first to re-baseline the peer. When
// 'req' is set we send exactly one reply batch and return; when 'onetime'
// is set we send at most one batch on the session connection.
4407 void OSDMonitor::send_incremental(epoch_t first,
4408 MonSession *session,
4409 bool onetime,
4410 MonOpRequestRef req)
4411 {
4412 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
4413 << " to " << session->name << dendl;
4414
4415 // get feature of the peer
4416 // use quorum_con_features, if it's an anonymous connection.
4417 uint64_t features = session->con_features ? session->con_features :
4418 mon->get_quorum_con_features();
4419
// Skip epochs the session is already known to have.
4420 if (first <= session->osd_epoch) {
4421 dout(10) << __func__ << " " << session->name << " should already have epoch "
4422 << session->osd_epoch << dendl;
4423 first = session->osd_epoch + 1;
4424 }
4425
// Requested range was trimmed: start from a full map at the oldest
// committed epoch, then continue with incrementals.
4426 if (first < get_first_committed()) {
4427 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
4428 m->oldest_map = get_first_committed();
4429 m->newest_map = osdmap.get_epoch();
4430
4431 first = get_first_committed();
4432 bufferlist bl;
4433 int err = get_version_full(first, features, bl);
4434 ceph_assert(err == 0);
4435 ceph_assert(bl.length());
4436 dout(20) << "send_incremental starting with base full "
4437 << first << " " << bl.length() << " bytes" << dendl;
4438 m->maps[first] = bl;
4439
4440 if (req) {
// One reply per request: the peer will re-request the remainder.
4441 mon->send_reply(req, m);
4442 session->osd_epoch = first;
4443 return;
4444 } else {
4445 session->con->send_message(m);
4446 session->osd_epoch = first;
4447 }
4448 first++;
4449 }
4450
// Stream incrementals in batches of osd_map_message_max epochs.
4451 while (first <= osdmap.get_epoch()) {
4452 epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
4453 osdmap.get_epoch());
4454 MOSDMap *m = build_incremental(first, last, features);
4455
4456 if (req) {
4457 // send some maps. it may not be all of them, but it will get them
4458 // started.
4459 mon->send_reply(req, m);
4460 } else {
4461 session->con->send_message(m);
4462 first = last + 1;
4463 }
4464 session->osd_epoch = last;
// Request-driven and onetime sends stop after a single batch.
4465 if (onetime || req)
4466 break;
4467 }
4468 }
4469
// Fetch the incremental map for 'ver' encoded with the quorum's features.
4470 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4471 {
4472 return get_version(ver, mon->get_quorum_con_features(), bl);
4473 }
4474
// Re-encode an incremental map blob in place for an older peer: decode it,
// then encode with the intersection of the peer's features and the
// incremental's canonical encode_features (including any embedded full map
// and crush map).
4475 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4476 {
4477 OSDMap::Incremental inc;
4478 auto q = bl.cbegin();
4479 inc.decode(q);
4480 // always encode with subset of osdmap's canonical features
4481 uint64_t f = features & inc.encode_features;
4482 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4483 << dendl;
4484 bl.clear();
4485 if (inc.fullmap.length()) {
4486 // embedded full map?
4487 OSDMap m;
4488 m.decode(inc.fullmap);
4489 inc.fullmap.clear();
4490 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4491 }
4492 if (inc.crush.length()) {
4493 // embedded crush map
4494 CrushWrapper c;
4495 auto p = inc.crush.cbegin();
4496 c.decode(p);
4497 inc.crush.clear();
4498 c.encode(inc.crush, f);
4499 }
4500 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4501 }
4502
// Re-encode a full map blob in place using the intersection of the peer's
// features and the map's own canonical encoding features.
4503 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4504 {
4505 OSDMap m;
4506 auto q = bl.cbegin();
4507 m.decode(q);
4508 // always encode with subset of osdmap's canonical features
4509 uint64_t f = features & m.get_encoding_features();
4510 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4511 << dendl;
4512 bl.clear();
4513 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4514 }
4515
// Fetch the incremental map for 'ver', re-encoding for the requested feature
// set when it differs significantly from the quorum's, with results cached
// per (version, significant-features) key.
4516 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4517 {
4518 uint64_t significant_features = OSDMap::get_significant_features(features);
4519 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4520 return 0;
4521 }
4522 int ret = PaxosService::get_version(ver, bl);
4523 if (ret < 0) {
4524 return ret;
4525 }
4526 // NOTE: this check is imprecise; the OSDMap encoding features may
4527 // be a subset of the latest mon quorum features, but worst case we
4528 // reencode once and then cache the (identical) result under both
4529 // feature masks.
4530 if (significant_features !=
4531 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4532 reencode_incremental_map(bl, features);
4533 }
4534 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4535 return 0;
4536 }
4537
// Load and decode the incremental map for 'ver'. Asserts that the version is
// available and non-empty; always returns 0 on the success path.
4538 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4539 {
4540 bufferlist inc_bl;
4541 int err = get_version(ver, inc_bl);
4542 ceph_assert(err == 0);
4543 ceph_assert(inc_bl.length());
4544
4545 auto p = inc_bl.cbegin();
4546 inc.decode(p);
4547 dout(10) << __func__ << " "
4548 << " epoch " << inc.epoch
4549 << " inc_crc " << inc.inc_crc
4550 << " full_crc " << inc.full_crc
4551 << " encode_features " << inc.encode_features << dendl;
4552 return 0;
4553 }
4554
// Reconstruct a full osdmap for 'ver' when the stored full map has been
// trimmed: start from the closest pinned full map at or below 'ver' (or a
// newer cached full map), then replay incrementals up to 'ver' and encode
// the result into 'bl'. Returns -ENOENT if no pinned base exists.
4555 int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
4556 {
4557 dout(10) << __func__ << " ver " << ver << dendl;
4558
4559 version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
4560 if (closest_pinned == 0) {
4561 return -ENOENT;
4562 }
4563 if (closest_pinned > ver) {
4564 dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
4565 }
4566 ceph_assert(closest_pinned <= ver);
4567
4568 dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
4569
4570 // get osdmap incremental maps and apply on top of this one.
4571 bufferlist osdm_bl;
4572 bool has_cached_osdmap = false;
// Prefer the newest cached full map in (closest_pinned, ver) as the base,
// to replay as few incrementals as possible.
4573 for (version_t v = ver-1; v >= closest_pinned; --v) {
4574 if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
4575 &osdm_bl)) {
4576 dout(10) << __func__ << " found map in cache ver " << v << dendl;
4577 closest_pinned = v;
4578 has_cached_osdmap = true;
4579 break;
4580 }
4581 }
4582
4583 if (!has_cached_osdmap) {
4584 int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
4585 if (err != 0) {
4586 derr << __func__ << " closest pinned map ver " << closest_pinned
4587 << " not available! error: " << cpp_strerror(err) << dendl;
4588 }
4589 ceph_assert(err == 0);
4590 }
4591
4592 ceph_assert(osdm_bl.length());
4593
4594 OSDMap osdm;
4595 osdm.decode(osdm_bl);
4596
4597 dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
4598 << " e" << osdm.epoch
4599 << " crc " << osdm.get_crc()
4600 << " -- applying incremental maps." << dendl;
4601
// Remember the last incremental's encode features so the final full map is
// encoded the same way the epoch originally was.
4602 uint64_t encode_features = 0;
4603 for (version_t v = closest_pinned + 1; v <= ver; ++v) {
4604 dout(20) << __func__ << " applying inc epoch " << v << dendl;
4605
4606 OSDMap::Incremental inc;
4607 int err = get_inc(v, inc);
4608 ceph_assert(err == 0);
4609
4610 encode_features = inc.encode_features;
4611
4612 err = osdm.apply_incremental(inc);
4613 ceph_assert(err == 0);
4614
4615 // this block performs paranoid checks on map retrieval
4616 if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
4617 inc.full_crc != 0) {
4618
4619 uint64_t f = encode_features;
4620 if (!f) {
4621 f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
4622 }
4623
4624 // encode osdmap to force calculating crcs
4625 bufferlist tbl;
4626 osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
4627 // decode osdmap to compare crcs with what's expected by incremental
4628 OSDMap tosdm;
4629 tosdm.decode(tbl);
4630
4631 if (tosdm.get_crc() != inc.full_crc) {
4632 derr << __func__
4633 << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
4634 << ", expected " << inc.full_crc << ")" << dendl;
4635 ceph_abort_msg("osdmap crc mismatch");
4636 }
4637 }
4638
4639 // note: we cannot add the recently computed map to the cache, as is,
4640 // because we have not encoded the map into a bl.
4641 }
4642
4643 if (!encode_features) {
4644 dout(10) << __func__
4645 << " last incremental map didn't have features;"
4646 << " defaulting to quorum's or all" << dendl;
4647 encode_features =
4648 (mon->quorum_con_features ? mon->quorum_con_features : -1);
4649 }
4650 osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
4651
4652 return 0;
4653 }
4654
// Fetch the full map for 'ver' encoded with the quorum's features.
4655 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4656 {
4657 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4658 }
4659
// Fetch the full map for 'ver': try the per-(version, significant-features)
// cache, then the store, then rebuild from a pinned map plus incrementals;
// re-encode for the requested features when they differ from the quorum's.
4660 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4661 bufferlist& bl)
4662 {
4663 uint64_t significant_features = OSDMap::get_significant_features(features);
4664 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4665 return 0;
4666 }
4667 int ret = PaxosService::get_version_full(ver, bl);
4668 if (ret == -ENOENT) {
4669 // build map?
4670 ret = get_full_from_pinned_map(ver, bl);
4671 }
4672 if (ret < 0) {
4673 return ret;
4674 }
4675 // NOTE: this check is imprecise; the OSDMap encoding features may
4676 // be a subset of the latest mon quorum features, but worst case we
4677 // reencode once and then cache the (identical) result under both
4678 // feature masks.
4679 if (significant_features !=
4680 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4681 reencode_full_map(bl, features);
4682 }
4683 full_osd_cache.add_bytes({ver, significant_features}, bl);
4684 return 0;
4685 }
4686
// Queue every address in 'av' for blacklisting until 'until'. The address
// type is normalized per release (ANY on nautilus+, LEGACY before) so the
// entry matches how clients are identified. Returns the epoch in which the
// pending blacklist will take effect.
4687 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4688 {
4689 dout(10) << "blacklist " << av << " until " << until << dendl;
// 'a' is a copy by value: mutating its type does not touch the caller's vec.
4690 for (auto a : av.v) {
4691 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4692 a.set_type(entity_addr_t::TYPE_ANY);
4693 } else {
4694 a.set_type(entity_addr_t::TYPE_LEGACY);
4695 }
4696 pending_inc.new_blacklist[a] = until;
4697 }
4698 return pending_inc.epoch;
4699 }
4700
// Single-address variant: normalize the address type per release and queue
// it for blacklisting until 'until'. Returns the pending epoch.
4701 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4702 {
4703 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4704 a.set_type(entity_addr_t::TYPE_ANY);
4705 } else {
4706 a.set_type(entity_addr_t::TYPE_LEGACY);
4707 }
4708 dout(10) << "blacklist " << a << " until " << until << dendl;
4709 pending_inc.new_blacklist[a] = until;
4710 return pending_inc.epoch;
4711 }
4712
4713
// Walk all "osdmap" subscriptions and push any epochs each subscriber is
// missing. No-op before the first committed map.
4714 void OSDMonitor::check_osdmap_subs()
4715 {
4716 dout(10) << __func__ << dendl;
4717 if (!osdmap.get_epoch()) {
4718 return;
4719 }
4720 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4721 if (osdmap_subs == mon->session_map.subs.end()) {
4722 return;
4723 }
// Advance the iterator before servicing: check_osdmap_sub() may remove the
// current (onetime) subscription from the container.
4724 auto p = osdmap_subs->second->begin();
4725 while (!p.end()) {
4726 auto sub = *p;
4727 ++p;
4728 check_osdmap_sub(sub);
4729 }
4730 }
4731
// Service a single osdmap subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either remove a onetime sub or
// advance its cursor past the current epoch.
4732 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4733 {
4734 dout(10) << __func__ << " " << sub << " next " << sub->next
4735 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4736 if (sub->next <= osdmap.get_epoch()) {
4737 if (sub->next >= 1)
4738 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4739 else
4740 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4741 if (sub->onetime)
4742 mon->session_map.remove_sub(sub);
4743 else
4744 sub->next = osdmap.get_epoch() + 1;
4745 }
4746 }
4747
// Service every "osd_pg_creates" subscription while holding the session map
// lock. Skipped entirely when no OSDs are up.
4748 void OSDMonitor::check_pg_creates_subs()
4749 {
4750 if (!osdmap.get_num_up_osds()) {
4751 return;
4752 }
// All up OSDs must speak the stateful-sub protocol for this path.
4753 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4754 mon->with_session_map([this](const MonSessionMap& session_map) {
4755 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4756 if (pg_creates_subs == session_map.subs.end()) {
4757 return;
4758 }
4759 for (auto sub : *pg_creates_subs->second) {
4760 check_pg_creates_sub(sub);
4761 }
4762 });
4763 }
4764
// Service a single pg-creates subscription: send pending pg-create messages
// to the subscribing OSD (only while it is up) and advance the sub cursor to
// the epoch returned by send_pg_creates().
4765 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4766 {
4767 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4768 ceph_assert(sub->type == "osd_pg_creates");
4769 // only send these if the OSD is up. we will check_subs() when they do
4770 // come up so they will get the creates then.
4771 if (sub->session->name.is_osd() &&
4772 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4773 sub->next = send_pg_creates(sub->session->name.num(),
4774 sub->session->con.get(),
4775 sub->next);
4776 }
4777 }
4778
// Enable (or update) an application tag on a pool in the pending increment.
// With an empty app_key only the application name is registered; otherwise
// the key/value is set — 'force' overwrites an existing key, while the
// non-force path inserts only if the application entry is absent.
// Caller must hold paxos plugged and the service writeable.
4779 void OSDMonitor::do_application_enable(int64_t pool_id,
4780 const std::string &app_name,
4781 const std::string &app_key,
4782 const std::string &app_value,
4783 bool force)
4784 {
4785 ceph_assert(paxos->is_plugged() && is_writeable());
4786
4787 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4788 << dendl;
4789
4790 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
4791
4792 auto pp = osdmap.get_pg_pool(pool_id);
4793 ceph_assert(pp != nullptr);
4794
// Work on a copy, starting from any already-pending version of the pool.
4795 pg_pool_t p = *pp;
4796 if (pending_inc.new_pools.count(pool_id)) {
4797 p = pending_inc.new_pools[pool_id];
4798 }
4799
4800 if (app_key.empty()) {
4801 p.application_metadata.insert({app_name, {}});
4802 } else {
4803 if (force) {
4804 p.application_metadata[app_name][app_key] = app_value;
4805 } else {
// map::insert is a no-op if app_name already exists (non-destructive).
4806 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4807 }
4808 }
4809 p.last_change = pending_inc.epoch;
4810 pending_inc.new_pools[pool_id] = p;
4811 }
4812
// Set a pool option in the pending increment, seeding the pending pool entry
// from the committed pool only if one isn't already queued (try_emplace).
4813 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4814 pool_opts_t::key_t opt,
4815 pool_opts_t::value_t val)
4816 {
4817 auto p = pending_inc.new_pools.try_emplace(
4818 pool_id, *osdmap.get_pg_pool(pool_id));
4819 p.first->second.opts.set(opt, val);
4820 }
4821
// Scan 'pools' for pools whose pgs still need to be created and queue them in
// 'creating_pgs'. Skips pools already created, with invalid crush rules,
// unchanged since the last scan, or pending removal. Returns the number of
// pools queued.
4822 unsigned OSDMonitor::scan_for_creating_pgs(
4823 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4824 const mempool::osdmap::set<int64_t>& removed_pools,
4825 utime_t modified,
4826 creating_pgs_t* creating_pgs) const
4827 {
4828 unsigned queued = 0;
4829 for (auto& p : pools) {
4830 int64_t poolid = p.first;
4831 if (creating_pgs->created_pools.count(poolid)) {
4832 dout(10) << __func__ << " already created " << poolid << dendl;
4833 continue;
4834 }
4835 const pg_pool_t& pool = p.second;
4836 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4837 pool.get_type(), pool.get_size());
// Pools without a usable crush rule cannot be mapped, so don't queue them.
4838 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4839 continue;
4840
4841 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4842 const auto created = pool.get_last_change();
4843 if (last_scan_epoch && created <= last_scan_epoch) {
4844 dout(10) << __func__ << " no change in pool " << poolid
4845 << " " << pool << dendl;
4846 continue;
4847 }
4848 if (removed_pools.count(poolid)) {
4849 dout(10) << __func__ << " pool is being removed: " << poolid
4850 << " " << pool << dendl;
4851 continue;
4852 }
4853 dout(10) << __func__ << " queueing pool create for " << poolid
4854 << " " << pool << dendl;
4855 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4856 created, modified);
4857 queued++;
4858 }
4859 return queued;
4860 }
4861
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: for each pg still
// being created, determine its current acting primary and the epoch to tag
// the create message with — keeping the old epoch when the primary is
// unchanged, or bumping to the current mapping epoch when the target moved.
4862 void OSDMonitor::update_creating_pgs()
4863 {
4864 dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
4865 << creating_pgs.queue.size() << " pools in queue" << dendl;
4866 decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
4867 std::lock_guard<std::mutex> l(creating_pgs_lock);
4868 for (const auto& pg : creating_pgs.pgs) {
4869 int acting_primary = -1;
4870 auto pgid = pg.first;
4871 if (!osdmap.pg_exists(pgid)) {
4872 dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
4873 << dendl;
4874 continue;
4875 }
4876 auto mapped = pg.second.create_epoch;
4877 dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
4878 spg_t spgid(pgid);
4879 mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
4880 // check the previous creating_pgs, look for the target to whom the pg was
4881 // previously mapped
4882 for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
4883 const auto last_acting_primary = pgs_by_epoch.first;
4884 for (auto& pgs: pgs_by_epoch.second) {
4885 if (pgs.second.count(spgid)) {
// Same primary as before: keep the original epoch so the create message
// is not needlessly re-tagged (and re-sent) to the same OSD.
4886 if (last_acting_primary == acting_primary) {
4887 mapped = pgs.first;
4888 } else {
4889 dout(20) << __func__ << " " << pgid << " "
4890 << " acting_primary:" << last_acting_primary
4891 << " -> " << acting_primary << dendl;
4892 // note epoch if the target of the create message changed.
4893 mapped = mapping.get_epoch();
4894 }
4895 break;
4896 } else {
4897 // newly creating
4898 mapped = mapping.get_epoch();
4899 }
4900 }
4901 }
4902 dout(10) << __func__ << " will instruct osd." << acting_primary
4903 << " to create " << pgid << "@" << mapped << dendl;
4904 new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
4905 }
4906 creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
4907 creating_pgs_epoch = mapping.get_epoch();
4908 }
4909
// Send pending pg-create messages to osd.<osd> over the given
// connection, covering epochs >= 'next' in creating_pgs_by_osd_epoch.
// Returns the epoch the osd's subscription is current through (last
// epoch sent + 1), or 'next' unchanged when there is nothing to send
// or the creating_pgs index is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // pre-nautilus osds only understand the legacy MOSDPGCreate message
  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// lazily allocate the (single) legacy message
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	// lazily allocate the (single) new-style message
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // at most one of m/oldm was populated above
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4981
4982 // TICK
4983
4984
// Periodic monitor housekeeping.  On every monitor: reload the osdmap
// manifest and let the priority cache manager retune/rebalance memory.
// On the leader only: mark osds down on beacon timeout, process
// failure reports, auto-mark long-down (or destroyed) osds out, expire
// blacklist entries, prune purged snaps, refresh pool status, and
// propose a new map if any of that produced a pending change.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which is leader-only work
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down that have not sent a beacon recently
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      // how long this osd has been down
      utime_t down = now;
      down -= i->second;
      // advance now: we may erase down_pending_out[o] below
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// live-but-down osds use the (possibly laggy-scaled) grace period;
	// destroyed osds use their own fixed interval
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      // osd was marked out (or is no longer down/in): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5149
5150 void OSDMonitor::_set_new_cache_sizes()
5151 {
5152 uint64_t cache_size = 0;
5153 int64_t inc_alloc = 0;
5154 int64_t full_alloc = 0;
5155 int64_t kv_alloc = 0;
5156
5157 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5158 cache_size = pcm->get_tuned_mem();
5159 inc_alloc = inc_cache->get_committed_size();
5160 full_alloc = full_cache->get_committed_size();
5161 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5162 }
5163
5164 inc_osd_cache.set_bytes(inc_alloc);
5165 full_osd_cache.set_bytes(full_alloc);
5166
5167 dout(1) << __func__ << " cache_size:" << cache_size
5168 << " inc_alloc: " << inc_alloc
5169 << " full_alloc: " << full_alloc
5170 << " kv_alloc: " << kv_alloc
5171 << dendl;
5172 }
5173
// Mark down any up osd that has not reported within
// mon_osd_report_timeout seconds.  'last_osd_report' records when each
// osd was last heard from; osds with no entry have their timer started
// here, and entries for nonexistent osds are dropped.  Returns true if
// at least one osd was newly marked down (so the caller should propose
// the pending map).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE: new_state entries are applied as an xor of state bits,
	// so setting CEPH_OSD_UP here flips the osd's up flag off,
	// marking it down (matching the log messages above).
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5212
5213 static void dump_cpu_list(Formatter *f, const char *name,
5214 const string& strlist)
5215 {
5216 cpu_set_t cpu_set;
5217 size_t cpu_set_size;
5218 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5219 return;
5220 }
5221 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5222 f->open_array_section(name);
5223 for (auto cpu : cpus) {
5224 f->dump_int("cpu", cpu);
5225 }
5226 f->close_section();
5227 }
5228
// Dump the monitor's osdmap-related state into the given formatter:
// the full osdmap, per-osd metadata, the last-clean-epoch bookkeeping,
// the committed osdmap version range, the crush map, and the osdmap
// manifest when one has been loaded.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // per-osd metadata for every existing osd; errors are ignored
  // (NULL error stream) so one bad osd doesn't abort the dump
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5277
namespace {
  // The set of properties that "osd pool get" knows how to report.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the members of 'first' that do not also appear in 'second'
  // (plain set difference; neither argument is modified).
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> kept;
      for (const auto choice : first) {
	if (second.count(choice) == 0) {
	  // hint: we iterate in ascending order, so the end is right
	  kept.insert(kept.end(), choice);
	}
      }
      return kept;
    }
}
5311
5312
5313 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5314 {
5315 op->mark_osdmon_event(__func__);
5316 auto m = op->get_req<MMonCommand>();
5317 int r = 0;
5318 bufferlist rdata;
5319 stringstream ss, ds;
5320
5321 cmdmap_t cmdmap;
5322 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5323 string rs = ss.str();
5324 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5325 return true;
5326 }
5327
5328 MonSession *session = op->get_session();
5329 if (!session) {
5330 derr << __func__ << " no session" << dendl;
5331 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5332 return true;
5333 }
5334
5335 string prefix;
5336 cmd_getval(cmdmap, "prefix", prefix);
5337
5338 string format;
5339 cmd_getval(cmdmap, "format", format, string("plain"));
5340 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5341
5342 if (prefix == "osd stat") {
5343 if (f) {
5344 f->open_object_section("osdmap");
5345 osdmap.print_summary(f.get(), ds, "", true);
5346 f->close_section();
5347 f->flush(rdata);
5348 } else {
5349 osdmap.print_summary(nullptr, ds, "", true);
5350 rdata.append(ds);
5351 }
5352 }
5353 else if (prefix == "osd dump" ||
5354 prefix == "osd tree" ||
5355 prefix == "osd tree-from" ||
5356 prefix == "osd ls" ||
5357 prefix == "osd getmap" ||
5358 prefix == "osd getcrushmap" ||
5359 prefix == "osd ls-tree" ||
5360 prefix == "osd info") {
5361 string val;
5362
5363 epoch_t epoch = 0;
5364 int64_t epochnum;
5365 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5366 epoch = epochnum;
5367
5368 bufferlist osdmap_bl;
5369 int err = get_version_full(epoch, osdmap_bl);
5370 if (err == -ENOENT) {
5371 r = -ENOENT;
5372 ss << "there is no map for epoch " << epoch;
5373 goto reply;
5374 }
5375 ceph_assert(err == 0);
5376 ceph_assert(osdmap_bl.length());
5377
5378 OSDMap *p;
5379 if (epoch == osdmap.get_epoch()) {
5380 p = &osdmap;
5381 } else {
5382 p = new OSDMap;
5383 p->decode(osdmap_bl);
5384 }
5385
5386 auto sg = make_scope_guard([&] {
5387 if (p != &osdmap) {
5388 delete p;
5389 }
5390 });
5391
5392 if (prefix == "osd dump") {
5393 stringstream ds;
5394 if (f) {
5395 f->open_object_section("osdmap");
5396 p->dump(f.get());
5397 f->close_section();
5398 f->flush(ds);
5399 } else {
5400 p->print(ds);
5401 }
5402 rdata.append(ds);
5403 if (!f)
5404 ds << " ";
5405 } else if (prefix == "osd ls") {
5406 if (f) {
5407 f->open_array_section("osds");
5408 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5409 if (osdmap.exists(i)) {
5410 f->dump_int("osd", i);
5411 }
5412 }
5413 f->close_section();
5414 f->flush(ds);
5415 } else {
5416 bool first = true;
5417 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5418 if (osdmap.exists(i)) {
5419 if (!first)
5420 ds << "\n";
5421 first = false;
5422 ds << i;
5423 }
5424 }
5425 }
5426 rdata.append(ds);
5427 } else if (prefix == "osd info") {
5428 int64_t osd_id;
5429 bool do_single_osd = true;
5430 if (!cmd_getval(cmdmap, "id", osd_id)) {
5431 do_single_osd = false;
5432 }
5433
5434 if (do_single_osd && !osdmap.exists(osd_id)) {
5435 ss << "osd." << osd_id << " does not exist";
5436 r = -EINVAL;
5437 goto reply;
5438 }
5439
5440 if (f) {
5441 if (do_single_osd) {
5442 osdmap.dump_osd(osd_id, f.get());
5443 } else {
5444 osdmap.dump_osds(f.get());
5445 }
5446 f->flush(ds);
5447 } else {
5448 if (do_single_osd) {
5449 osdmap.print_osd(osd_id, ds);
5450 } else {
5451 osdmap.print_osds(ds);
5452 }
5453 }
5454 rdata.append(ds);
5455 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5456 string bucket;
5457 if (prefix == "osd tree-from") {
5458 cmd_getval(cmdmap, "bucket", bucket);
5459 if (!osdmap.crush->name_exists(bucket)) {
5460 ss << "bucket '" << bucket << "' does not exist";
5461 r = -ENOENT;
5462 goto reply;
5463 }
5464 int id = osdmap.crush->get_item_id(bucket);
5465 if (id >= 0) {
5466 ss << "\"" << bucket << "\" is not a bucket";
5467 r = -EINVAL;
5468 goto reply;
5469 }
5470 }
5471
5472 vector<string> states;
5473 cmd_getval(cmdmap, "states", states);
5474 unsigned filter = 0;
5475 for (auto& s : states) {
5476 if (s == "up") {
5477 filter |= OSDMap::DUMP_UP;
5478 } else if (s == "down") {
5479 filter |= OSDMap::DUMP_DOWN;
5480 } else if (s == "in") {
5481 filter |= OSDMap::DUMP_IN;
5482 } else if (s == "out") {
5483 filter |= OSDMap::DUMP_OUT;
5484 } else if (s == "destroyed") {
5485 filter |= OSDMap::DUMP_DESTROYED;
5486 } else {
5487 ss << "unrecognized state '" << s << "'";
5488 r = -EINVAL;
5489 goto reply;
5490 }
5491 }
5492 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5493 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5494 ss << "cannot specify both 'in' and 'out'";
5495 r = -EINVAL;
5496 goto reply;
5497 }
5498 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5499 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5500 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5501 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5502 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5503 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5504 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5505 r = -EINVAL;
5506 goto reply;
5507 }
5508 if (f) {
5509 f->open_object_section("tree");
5510 p->print_tree(f.get(), NULL, filter, bucket);
5511 f->close_section();
5512 f->flush(ds);
5513 } else {
5514 p->print_tree(NULL, &ds, filter, bucket);
5515 }
5516 rdata.append(ds);
5517 } else if (prefix == "osd getmap") {
5518 rdata.append(osdmap_bl);
5519 ss << "got osdmap epoch " << p->get_epoch();
5520 } else if (prefix == "osd getcrushmap") {
5521 p->crush->encode(rdata, mon->get_quorum_con_features());
5522 ss << p->get_crush_version();
5523 } else if (prefix == "osd ls-tree") {
5524 string bucket_name;
5525 cmd_getval(cmdmap, "name", bucket_name);
5526 set<int> osds;
5527 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5528 if (r == -ENOENT) {
5529 ss << "\"" << bucket_name << "\" does not exist";
5530 goto reply;
5531 } else if (r < 0) {
5532 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5533 goto reply;
5534 }
5535
5536 if (f) {
5537 f->open_array_section("osds");
5538 for (auto &i : osds) {
5539 if (osdmap.exists(i)) {
5540 f->dump_int("osd", i);
5541 }
5542 }
5543 f->close_section();
5544 f->flush(ds);
5545 } else {
5546 bool first = true;
5547 for (auto &i : osds) {
5548 if (osdmap.exists(i)) {
5549 if (!first)
5550 ds << "\n";
5551 first = false;
5552 ds << i;
5553 }
5554 }
5555 }
5556
5557 rdata.append(ds);
5558 }
5559 } else if (prefix == "osd getmaxosd") {
5560 if (f) {
5561 f->open_object_section("getmaxosd");
5562 f->dump_unsigned("epoch", osdmap.get_epoch());
5563 f->dump_int("max_osd", osdmap.get_max_osd());
5564 f->close_section();
5565 f->flush(rdata);
5566 } else {
5567 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5568 rdata.append(ds);
5569 }
5570 } else if (prefix == "osd utilization") {
5571 string out;
5572 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5573 if (f)
5574 f->flush(rdata);
5575 else
5576 rdata.append(out);
5577 r = 0;
5578 goto reply;
5579 } else if (prefix == "osd find") {
5580 int64_t osd;
5581 if (!cmd_getval(cmdmap, "id", osd)) {
5582 ss << "unable to parse osd id value '"
5583 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5584 r = -EINVAL;
5585 goto reply;
5586 }
5587 if (!osdmap.exists(osd)) {
5588 ss << "osd." << osd << " does not exist";
5589 r = -ENOENT;
5590 goto reply;
5591 }
5592 string format;
5593 cmd_getval(cmdmap, "format", format);
5594 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5595 f->open_object_section("osd_location");
5596 f->dump_int("osd", osd);
5597 f->dump_object("addrs", osdmap.get_addrs(osd));
5598 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5599
5600 // try to identify host, pod/container name, etc.
5601 map<string,string> m;
5602 load_metadata(osd, m, nullptr);
5603 if (auto p = m.find("hostname"); p != m.end()) {
5604 f->dump_string("host", p->second);
5605 }
5606 for (auto& k : {
5607 "pod_name", "pod_namespace", // set by rook
5608 "container_name" // set by cephadm, ceph-ansible
5609 }) {
5610 if (auto p = m.find(k); p != m.end()) {
5611 f->dump_string(k, p->second);
5612 }
5613 }
5614
5615 // crush is helpful too
5616 f->open_object_section("crush_location");
5617 map<string,string> loc = osdmap.crush->get_full_location(osd);
5618 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5619 f->dump_string(p->first.c_str(), p->second);
5620 f->close_section();
5621 f->close_section();
5622 f->flush(rdata);
5623 } else if (prefix == "osd metadata") {
5624 int64_t osd = -1;
5625 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5626 !cmd_getval(cmdmap, "id", osd)) {
5627 ss << "unable to parse osd id value '"
5628 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 if (osd >= 0 && !osdmap.exists(osd)) {
5633 ss << "osd." << osd << " does not exist";
5634 r = -ENOENT;
5635 goto reply;
5636 }
5637 string format;
5638 cmd_getval(cmdmap, "format", format);
5639 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5640 if (osd >= 0) {
5641 f->open_object_section("osd_metadata");
5642 f->dump_unsigned("id", osd);
5643 r = dump_osd_metadata(osd, f.get(), &ss);
5644 if (r < 0)
5645 goto reply;
5646 f->close_section();
5647 } else {
5648 r = 0;
5649 f->open_array_section("osd_metadata");
5650 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5651 if (osdmap.exists(i)) {
5652 f->open_object_section("osd");
5653 f->dump_unsigned("id", i);
5654 r = dump_osd_metadata(i, f.get(), NULL);
5655 if (r == -EINVAL || r == -ENOENT) {
5656 // Drop error, continue to get other daemons' metadata
5657 dout(4) << "No metadata for osd." << i << dendl;
5658 r = 0;
5659 } else if (r < 0) {
5660 // Unexpected error
5661 goto reply;
5662 }
5663 f->close_section();
5664 }
5665 }
5666 f->close_section();
5667 }
5668 f->flush(rdata);
5669 } else if (prefix == "osd versions") {
5670 if (!f)
5671 f.reset(Formatter::create("json-pretty"));
5672 count_metadata("ceph_version", f.get());
5673 f->flush(rdata);
5674 r = 0;
5675 } else if (prefix == "osd count-metadata") {
5676 if (!f)
5677 f.reset(Formatter::create("json-pretty"));
5678 string field;
5679 cmd_getval(cmdmap, "property", field);
5680 count_metadata(field, f.get());
5681 f->flush(rdata);
5682 r = 0;
5683 } else if (prefix == "osd numa-status") {
5684 TextTable tbl;
5685 if (f) {
5686 f->open_array_section("osds");
5687 } else {
5688 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5689 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5690 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5691 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5692 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5693 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5694 }
5695 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5696 if (osdmap.exists(i)) {
5697 map<string,string> m;
5698 ostringstream err;
5699 if (load_metadata(i, m, &err) < 0) {
5700 continue;
5701 }
5702 string host;
5703 auto p = m.find("hostname");
5704 if (p != m.end()) {
5705 host = p->second;
5706 }
5707 if (f) {
5708 f->open_object_section("osd");
5709 f->dump_int("osd", i);
5710 f->dump_string("host", host);
5711 for (auto n : { "network_numa_node", "objectstore_numa_node",
5712 "numa_node" }) {
5713 p = m.find(n);
5714 if (p != m.end()) {
5715 f->dump_int(n, atoi(p->second.c_str()));
5716 }
5717 }
5718 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5719 p = m.find(n);
5720 if (p != m.end()) {
5721 list<string> ls = get_str_list(p->second, ",");
5722 f->open_array_section(n);
5723 for (auto node : ls) {
5724 f->dump_int("node", atoi(node.c_str()));
5725 }
5726 f->close_section();
5727 }
5728 }
5729 for (auto n : { "numa_node_cpus" }) {
5730 p = m.find(n);
5731 if (p != m.end()) {
5732 dump_cpu_list(f.get(), n, p->second);
5733 }
5734 }
5735 f->close_section();
5736 } else {
5737 tbl << i;
5738 tbl << host;
5739 p = m.find("network_numa_nodes");
5740 if (p != m.end()) {
5741 tbl << p->second;
5742 } else {
5743 tbl << "-";
5744 }
5745 p = m.find("objectstore_numa_nodes");
5746 if (p != m.end()) {
5747 tbl << p->second;
5748 } else {
5749 tbl << "-";
5750 }
5751 p = m.find("numa_node");
5752 auto q = m.find("numa_node_cpus");
5753 if (p != m.end() && q != m.end()) {
5754 tbl << p->second;
5755 tbl << q->second;
5756 } else {
5757 tbl << "-";
5758 tbl << "-";
5759 }
5760 tbl << TextTable::endrow;
5761 }
5762 }
5763 }
5764 if (f) {
5765 f->close_section();
5766 f->flush(rdata);
5767 } else {
5768 rdata.append(stringify(tbl));
5769 }
5770 } else if (prefix == "osd map") {
5771 string poolstr, objstr, namespacestr;
5772 cmd_getval(cmdmap, "pool", poolstr);
5773 cmd_getval(cmdmap, "object", objstr);
5774 cmd_getval(cmdmap, "nspace", namespacestr);
5775
5776 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5777 if (pool < 0) {
5778 ss << "pool " << poolstr << " does not exist";
5779 r = -ENOENT;
5780 goto reply;
5781 }
5782 object_locator_t oloc(pool, namespacestr);
5783 object_t oid(objstr);
5784 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5785 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5786 vector<int> up, acting;
5787 int up_p, acting_p;
5788 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5789
5790 string fullobjname;
5791 if (!namespacestr.empty())
5792 fullobjname = namespacestr + string("/") + oid.name;
5793 else
5794 fullobjname = oid.name;
5795 if (f) {
5796 f->open_object_section("osd_map");
5797 f->dump_unsigned("epoch", osdmap.get_epoch());
5798 f->dump_string("pool", poolstr);
5799 f->dump_int("pool_id", pool);
5800 f->dump_stream("objname") << fullobjname;
5801 f->dump_stream("raw_pgid") << pgid;
5802 f->dump_stream("pgid") << mpgid;
5803 f->open_array_section("up");
5804 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5805 f->dump_int("osd", *p);
5806 f->close_section();
5807 f->dump_int("up_primary", up_p);
5808 f->open_array_section("acting");
5809 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5810 f->dump_int("osd", *p);
5811 f->close_section();
5812 f->dump_int("acting_primary", acting_p);
5813 f->close_section(); // osd_map
5814 f->flush(rdata);
5815 } else {
5816 ds << "osdmap e" << osdmap.get_epoch()
5817 << " pool '" << poolstr << "' (" << pool << ")"
5818 << " object '" << fullobjname << "' ->"
5819 << " pg " << pgid << " (" << mpgid << ")"
5820 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5821 << pg_vector_string(acting) << ", p" << acting_p << ")";
5822 rdata.append(ds);
5823 }
5824
5825 } else if (prefix == "pg map") {
5826 pg_t pgid;
5827 string pgidstr;
5828 cmd_getval(cmdmap, "pgid", pgidstr);
5829 if (!pgid.parse(pgidstr.c_str())) {
5830 ss << "invalid pgid '" << pgidstr << "'";
5831 r = -EINVAL;
5832 goto reply;
5833 }
5834 vector<int> up, acting;
5835 if (!osdmap.have_pg_pool(pgid.pool())) {
5836 ss << "pg '" << pgidstr << "' does not exist";
5837 r = -ENOENT;
5838 goto reply;
5839 }
5840 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5841 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5842 if (f) {
5843 f->open_object_section("pg_map");
5844 f->dump_unsigned("epoch", osdmap.get_epoch());
5845 f->dump_stream("raw_pgid") << pgid;
5846 f->dump_stream("pgid") << mpgid;
5847 f->open_array_section("up");
5848 for (auto osd : up) {
5849 f->dump_int("up_osd", osd);
5850 }
5851 f->close_section();
5852 f->open_array_section("acting");
5853 for (auto osd : acting) {
5854 f->dump_int("acting_osd", osd);
5855 }
5856 f->close_section();
5857 f->close_section();
5858 f->flush(rdata);
5859 } else {
5860 ds << "osdmap e" << osdmap.get_epoch()
5861 << " pg " << pgid << " (" << mpgid << ")"
5862 << " -> up " << up << " acting " << acting;
5863 rdata.append(ds);
5864 }
5865 goto reply;
5866
5867 } else if (prefix == "osd lspools") {
5868 if (f)
5869 f->open_array_section("pools");
5870 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5871 p != osdmap.pools.end();
5872 ++p) {
5873 if (f) {
5874 f->open_object_section("pool");
5875 f->dump_int("poolnum", p->first);
5876 f->dump_string("poolname", osdmap.pool_name[p->first]);
5877 f->close_section();
5878 } else {
5879 ds << p->first << ' ' << osdmap.pool_name[p->first];
5880 if (next(p) != osdmap.pools.end()) {
5881 ds << '\n';
5882 }
5883 }
5884 }
5885 if (f) {
5886 f->close_section();
5887 f->flush(ds);
5888 }
5889 rdata.append(ds);
5890 } else if (prefix == "osd blacklist ls") {
5891 if (f)
5892 f->open_array_section("blacklist");
5893
5894 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5895 p != osdmap.blacklist.end();
5896 ++p) {
5897 if (f) {
5898 f->open_object_section("entry");
5899 f->dump_string("addr", p->first.get_legacy_str());
5900 f->dump_stream("until") << p->second;
5901 f->close_section();
5902 } else {
5903 stringstream ss;
5904 string s;
5905 ss << p->first << " " << p->second;
5906 getline(ss, s);
5907 s += "\n";
5908 rdata.append(s);
5909 }
5910 }
5911 if (f) {
5912 f->close_section();
5913 f->flush(rdata);
5914 }
5915 ss << "listed " << osdmap.blacklist.size() << " entries";
5916
5917 } else if (prefix == "osd pool ls") {
5918 string detail;
5919 cmd_getval(cmdmap, "detail", detail);
5920 if (!f && detail == "detail") {
5921 ostringstream ss;
5922 osdmap.print_pools(ss);
5923 rdata.append(ss.str());
5924 } else {
5925 if (f)
5926 f->open_array_section("pools");
5927 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5928 it != osdmap.get_pools().end();
5929 ++it) {
5930 if (f) {
5931 if (detail == "detail") {
5932 f->open_object_section("pool");
5933 f->dump_int("pool_id", it->first);
5934 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5935 it->second.dump(f.get());
5936 f->close_section();
5937 } else {
5938 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5939 }
5940 } else {
5941 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5942 }
5943 }
5944 if (f) {
5945 f->close_section();
5946 f->flush(rdata);
5947 }
5948 }
5949
5950 } else if (prefix == "osd crush get-tunable") {
5951 string tunable;
5952 cmd_getval(cmdmap, "tunable", tunable);
5953 ostringstream rss;
5954 if (f)
5955 f->open_object_section("tunable");
5956 if (tunable == "straw_calc_version") {
5957 if (f)
5958 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5959 else
5960 rss << osdmap.crush->get_straw_calc_version() << "\n";
5961 } else {
5962 r = -EINVAL;
5963 goto reply;
5964 }
5965 if (f) {
5966 f->close_section();
5967 f->flush(rdata);
5968 } else {
5969 rdata.append(rss.str());
5970 }
5971 r = 0;
5972
5973 } else if (prefix == "osd pool get") {
5974 string poolstr;
5975 cmd_getval(cmdmap, "pool", poolstr);
5976 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5977 if (pool < 0) {
5978 ss << "unrecognized pool '" << poolstr << "'";
5979 r = -ENOENT;
5980 goto reply;
5981 }
5982
5983 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5984 string var;
5985 cmd_getval(cmdmap, "var", var);
5986
5987 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5988 const choices_map_t ALL_CHOICES = {
5989 {"size", SIZE},
5990 {"min_size", MIN_SIZE},
5991 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
5992 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5993 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
5994 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5995 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5996 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5997 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5998 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5999 {"use_gmt_hitset", USE_GMT_HITSET},
6000 {"target_max_objects", TARGET_MAX_OBJECTS},
6001 {"target_max_bytes", TARGET_MAX_BYTES},
6002 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6003 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6004 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6005 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6006 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6007 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6008 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6009 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6010 {"fast_read", FAST_READ},
6011 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6012 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6013 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6014 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6015 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6016 {"recovery_priority", RECOVERY_PRIORITY},
6017 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6018 {"scrub_priority", SCRUB_PRIORITY},
6019 {"compression_mode", COMPRESSION_MODE},
6020 {"compression_algorithm", COMPRESSION_ALGORITHM},
6021 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6022 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6023 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6024 {"csum_type", CSUM_TYPE},
6025 {"csum_max_block", CSUM_MAX_BLOCK},
6026 {"csum_min_block", CSUM_MIN_BLOCK},
6027 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6028 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6029 {"pg_num_min", PG_NUM_MIN},
6030 {"target_size_bytes", TARGET_SIZE_BYTES},
6031 {"target_size_ratio", TARGET_SIZE_RATIO},
6032 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6033 };
6034
6035 typedef std::set<osd_pool_get_choices> choices_set_t;
6036
6037 const choices_set_t ONLY_TIER_CHOICES = {
6038 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6039 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6040 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6041 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6042 MIN_READ_RECENCY_FOR_PROMOTE,
6043 MIN_WRITE_RECENCY_FOR_PROMOTE,
6044 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6045 };
6046 const choices_set_t ONLY_ERASURE_CHOICES = {
6047 EC_OVERWRITES, ERASURE_CODE_PROFILE
6048 };
6049
6050 choices_set_t selected_choices;
6051 if (var == "all") {
6052 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6053 it != ALL_CHOICES.end(); ++it) {
6054 selected_choices.insert(it->second);
6055 }
6056
6057 if(!p->is_tier()) {
6058 selected_choices = subtract_second_from_first(selected_choices,
6059 ONLY_TIER_CHOICES);
6060 }
6061
6062 if(!p->is_erasure()) {
6063 selected_choices = subtract_second_from_first(selected_choices,
6064 ONLY_ERASURE_CHOICES);
6065 }
6066 } else /* var != "all" */ {
6067 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6068 osd_pool_get_choices selected = found->second;
6069
6070 if (!p->is_tier() &&
6071 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6072 ss << "pool '" << poolstr
6073 << "' is not a tier pool: variable not applicable";
6074 r = -EACCES;
6075 goto reply;
6076 }
6077
6078 if (!p->is_erasure() &&
6079 ONLY_ERASURE_CHOICES.find(selected)
6080 != ONLY_ERASURE_CHOICES.end()) {
6081 ss << "pool '" << poolstr
6082 << "' is not a erasure pool: variable not applicable";
6083 r = -EACCES;
6084 goto reply;
6085 }
6086
6087 if (pool_opts_t::is_opt_name(var) &&
6088 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6089 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6090 r = -ENOENT;
6091 goto reply;
6092 }
6093
6094 selected_choices.insert(selected);
6095 }
6096
6097 if (f) {
6098 f->open_object_section("pool");
6099 f->dump_string("pool", poolstr);
6100 f->dump_int("pool_id", pool);
6101 for(choices_set_t::const_iterator it = selected_choices.begin();
6102 it != selected_choices.end(); ++it) {
6103 choices_map_t::const_iterator i;
6104 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6105 if (i->second == *it) {
6106 break;
6107 }
6108 }
6109 ceph_assert(i != ALL_CHOICES.end());
6110 switch(*it) {
6111 case PG_NUM:
6112 f->dump_int("pg_num", p->get_pg_num());
6113 break;
6114 case PGP_NUM:
6115 f->dump_int("pgp_num", p->get_pgp_num());
6116 break;
6117 case SIZE:
6118 f->dump_int("size", p->get_size());
6119 break;
6120 case MIN_SIZE:
6121 f->dump_int("min_size", p->get_min_size());
6122 break;
6123 case CRUSH_RULE:
6124 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6125 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6126 p->get_crush_rule()));
6127 } else {
6128 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6129 }
6130 break;
6131 case EC_OVERWRITES:
6132 f->dump_bool("allow_ec_overwrites",
6133 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6134 break;
6135 case PG_AUTOSCALE_MODE:
6136 f->dump_string("pg_autoscale_mode",
6137 pg_pool_t::get_pg_autoscale_mode_name(
6138 p->pg_autoscale_mode));
6139 break;
6140 case HASHPSPOOL:
6141 case NODELETE:
6142 case NOPGCHANGE:
6143 case NOSIZECHANGE:
6144 case WRITE_FADVISE_DONTNEED:
6145 case NOSCRUB:
6146 case NODEEP_SCRUB:
6147 f->dump_bool(i->first.c_str(),
6148 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6149 break;
6150 case HIT_SET_PERIOD:
6151 f->dump_int("hit_set_period", p->hit_set_period);
6152 break;
6153 case HIT_SET_COUNT:
6154 f->dump_int("hit_set_count", p->hit_set_count);
6155 break;
6156 case HIT_SET_TYPE:
6157 f->dump_string("hit_set_type",
6158 HitSet::get_type_name(p->hit_set_params.get_type()));
6159 break;
6160 case HIT_SET_FPP:
6161 {
6162 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6163 BloomHitSet::Params *bloomp =
6164 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6165 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6166 } else if(var != "all") {
6167 f->close_section();
6168 ss << "hit set is not of type Bloom; " <<
6169 "invalid to get a false positive rate!";
6170 r = -EINVAL;
6171 goto reply;
6172 }
6173 }
6174 break;
6175 case USE_GMT_HITSET:
6176 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6177 break;
6178 case TARGET_MAX_OBJECTS:
6179 f->dump_unsigned("target_max_objects", p->target_max_objects);
6180 break;
6181 case TARGET_MAX_BYTES:
6182 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6183 break;
6184 case CACHE_TARGET_DIRTY_RATIO:
6185 f->dump_unsigned("cache_target_dirty_ratio_micro",
6186 p->cache_target_dirty_ratio_micro);
6187 f->dump_float("cache_target_dirty_ratio",
6188 ((float)p->cache_target_dirty_ratio_micro/1000000));
6189 break;
6190 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6191 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6192 p->cache_target_dirty_high_ratio_micro);
6193 f->dump_float("cache_target_dirty_high_ratio",
6194 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6195 break;
6196 case CACHE_TARGET_FULL_RATIO:
6197 f->dump_unsigned("cache_target_full_ratio_micro",
6198 p->cache_target_full_ratio_micro);
6199 f->dump_float("cache_target_full_ratio",
6200 ((float)p->cache_target_full_ratio_micro/1000000));
6201 break;
6202 case CACHE_MIN_FLUSH_AGE:
6203 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6204 break;
6205 case CACHE_MIN_EVICT_AGE:
6206 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6207 break;
6208 case ERASURE_CODE_PROFILE:
6209 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6210 break;
6211 case MIN_READ_RECENCY_FOR_PROMOTE:
6212 f->dump_int("min_read_recency_for_promote",
6213 p->min_read_recency_for_promote);
6214 break;
6215 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6216 f->dump_int("min_write_recency_for_promote",
6217 p->min_write_recency_for_promote);
6218 break;
6219 case FAST_READ:
6220 f->dump_int("fast_read", p->fast_read);
6221 break;
6222 case HIT_SET_GRADE_DECAY_RATE:
6223 f->dump_int("hit_set_grade_decay_rate",
6224 p->hit_set_grade_decay_rate);
6225 break;
6226 case HIT_SET_SEARCH_LAST_N:
6227 f->dump_int("hit_set_search_last_n",
6228 p->hit_set_search_last_n);
6229 break;
6230 case SCRUB_MIN_INTERVAL:
6231 case SCRUB_MAX_INTERVAL:
6232 case DEEP_SCRUB_INTERVAL:
6233 case RECOVERY_PRIORITY:
6234 case RECOVERY_OP_PRIORITY:
6235 case SCRUB_PRIORITY:
6236 case COMPRESSION_MODE:
6237 case COMPRESSION_ALGORITHM:
6238 case COMPRESSION_REQUIRED_RATIO:
6239 case COMPRESSION_MAX_BLOB_SIZE:
6240 case COMPRESSION_MIN_BLOB_SIZE:
6241 case CSUM_TYPE:
6242 case CSUM_MAX_BLOCK:
6243 case CSUM_MIN_BLOCK:
6244 case FINGERPRINT_ALGORITHM:
6245 case PG_NUM_MIN:
6246 case TARGET_SIZE_BYTES:
6247 case TARGET_SIZE_RATIO:
6248 case PG_AUTOSCALE_BIAS:
6249 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6250 if (p->opts.is_set(key)) {
6251 if(*it == CSUM_TYPE) {
6252 int64_t val;
6253 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6254 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6255 } else {
6256 p->opts.dump(i->first, f.get());
6257 }
6258 }
6259 break;
6260 }
6261 }
6262 f->close_section();
6263 f->flush(rdata);
6264 } else /* !f */ {
6265 for(choices_set_t::const_iterator it = selected_choices.begin();
6266 it != selected_choices.end(); ++it) {
6267 choices_map_t::const_iterator i;
6268 switch(*it) {
6269 case PG_NUM:
6270 ss << "pg_num: " << p->get_pg_num() << "\n";
6271 break;
6272 case PGP_NUM:
6273 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6274 break;
6275 case SIZE:
6276 ss << "size: " << p->get_size() << "\n";
6277 break;
6278 case MIN_SIZE:
6279 ss << "min_size: " << p->get_min_size() << "\n";
6280 break;
6281 case CRUSH_RULE:
6282 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6283 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6284 p->get_crush_rule()) << "\n";
6285 } else {
6286 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6287 }
6288 break;
6289 case PG_AUTOSCALE_MODE:
6290 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6291 p->pg_autoscale_mode) <<"\n";
6292 break;
6293 case HIT_SET_PERIOD:
6294 ss << "hit_set_period: " << p->hit_set_period << "\n";
6295 break;
6296 case HIT_SET_COUNT:
6297 ss << "hit_set_count: " << p->hit_set_count << "\n";
6298 break;
6299 case HIT_SET_TYPE:
6300 ss << "hit_set_type: " <<
6301 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6302 break;
6303 case HIT_SET_FPP:
6304 {
6305 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6306 BloomHitSet::Params *bloomp =
6307 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6308 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6309 } else if(var != "all") {
6310 ss << "hit set is not of type Bloom; " <<
6311 "invalid to get a false positive rate!";
6312 r = -EINVAL;
6313 goto reply;
6314 }
6315 }
6316 break;
6317 case USE_GMT_HITSET:
6318 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6319 break;
6320 case TARGET_MAX_OBJECTS:
6321 ss << "target_max_objects: " << p->target_max_objects << "\n";
6322 break;
6323 case TARGET_MAX_BYTES:
6324 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6325 break;
6326 case CACHE_TARGET_DIRTY_RATIO:
6327 ss << "cache_target_dirty_ratio: "
6328 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6329 break;
6330 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6331 ss << "cache_target_dirty_high_ratio: "
6332 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6333 break;
6334 case CACHE_TARGET_FULL_RATIO:
6335 ss << "cache_target_full_ratio: "
6336 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6337 break;
6338 case CACHE_MIN_FLUSH_AGE:
6339 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6340 break;
6341 case CACHE_MIN_EVICT_AGE:
6342 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6343 break;
6344 case ERASURE_CODE_PROFILE:
6345 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6346 break;
6347 case MIN_READ_RECENCY_FOR_PROMOTE:
6348 ss << "min_read_recency_for_promote: " <<
6349 p->min_read_recency_for_promote << "\n";
6350 break;
6351 case HIT_SET_GRADE_DECAY_RATE:
6352 ss << "hit_set_grade_decay_rate: " <<
6353 p->hit_set_grade_decay_rate << "\n";
6354 break;
6355 case HIT_SET_SEARCH_LAST_N:
6356 ss << "hit_set_search_last_n: " <<
6357 p->hit_set_search_last_n << "\n";
6358 break;
6359 case EC_OVERWRITES:
6360 ss << "allow_ec_overwrites: " <<
6361 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6362 "\n";
6363 break;
6364 case HASHPSPOOL:
6365 case NODELETE:
6366 case NOPGCHANGE:
6367 case NOSIZECHANGE:
6368 case WRITE_FADVISE_DONTNEED:
6369 case NOSCRUB:
6370 case NODEEP_SCRUB:
6371 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6372 if (i->second == *it)
6373 break;
6374 }
6375 ceph_assert(i != ALL_CHOICES.end());
6376 ss << i->first << ": " <<
6377 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6378 "true" : "false") << "\n";
6379 break;
6380 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6381 ss << "min_write_recency_for_promote: " <<
6382 p->min_write_recency_for_promote << "\n";
6383 break;
6384 case FAST_READ:
6385 ss << "fast_read: " << p->fast_read << "\n";
6386 break;
6387 case SCRUB_MIN_INTERVAL:
6388 case SCRUB_MAX_INTERVAL:
6389 case DEEP_SCRUB_INTERVAL:
6390 case RECOVERY_PRIORITY:
6391 case RECOVERY_OP_PRIORITY:
6392 case SCRUB_PRIORITY:
6393 case COMPRESSION_MODE:
6394 case COMPRESSION_ALGORITHM:
6395 case COMPRESSION_REQUIRED_RATIO:
6396 case COMPRESSION_MAX_BLOB_SIZE:
6397 case COMPRESSION_MIN_BLOB_SIZE:
6398 case CSUM_TYPE:
6399 case CSUM_MAX_BLOCK:
6400 case CSUM_MIN_BLOCK:
6401 case FINGERPRINT_ALGORITHM:
6402 case PG_NUM_MIN:
6403 case TARGET_SIZE_BYTES:
6404 case TARGET_SIZE_RATIO:
6405 case PG_AUTOSCALE_BIAS:
6406 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6407 if (i->second == *it)
6408 break;
6409 }
6410 ceph_assert(i != ALL_CHOICES.end());
6411 {
6412 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6413 if (p->opts.is_set(key)) {
6414 if(key == pool_opts_t::CSUM_TYPE) {
6415 int64_t val;
6416 p->opts.get(key, &val);
6417 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6418 } else {
6419 ss << i->first << ": " << p->opts.get(key) << "\n";
6420 }
6421 }
6422 }
6423 break;
6424 }
6425 rdata.append(ss.str());
6426 ss.str("");
6427 }
6428 }
6429 r = 0;
6430 } else if (prefix == "osd pool get-quota") {
6431 string pool_name;
6432 cmd_getval(cmdmap, "pool", pool_name);
6433
6434 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6435 if (poolid < 0) {
6436 ceph_assert(poolid == -ENOENT);
6437 ss << "unrecognized pool '" << pool_name << "'";
6438 r = -ENOENT;
6439 goto reply;
6440 }
6441 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6442 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6443 const object_stat_sum_t& sum = pstat->stats.sum;
6444 if (f) {
6445 f->open_object_section("pool_quotas");
6446 f->dump_string("pool_name", pool_name);
6447 f->dump_unsigned("pool_id", poolid);
6448 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6449 f->dump_int("current_num_objects", sum.num_objects);
6450 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6451 f->dump_int("current_num_bytes", sum.num_bytes);
6452 f->close_section();
6453 f->flush(rdata);
6454 } else {
6455 stringstream rs;
6456 rs << "quotas for pool '" << pool_name << "':\n"
6457 << " max objects: ";
6458 if (p->quota_max_objects == 0)
6459 rs << "N/A";
6460 else {
6461 rs << si_u_t(p->quota_max_objects) << " objects";
6462 rs << " (current num objects: " << sum.num_objects << " objects)";
6463 }
6464 rs << "\n"
6465 << " max bytes : ";
6466 if (p->quota_max_bytes == 0)
6467 rs << "N/A";
6468 else {
6469 rs << byte_u_t(p->quota_max_bytes);
6470 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6471 }
6472 rdata.append(rs.str());
6473 }
6474 rdata.append("\n");
6475 r = 0;
6476 } else if (prefix == "osd crush rule list" ||
6477 prefix == "osd crush rule ls") {
6478 if (f) {
6479 f->open_array_section("rules");
6480 osdmap.crush->list_rules(f.get());
6481 f->close_section();
6482 f->flush(rdata);
6483 } else {
6484 ostringstream ss;
6485 osdmap.crush->list_rules(&ss);
6486 rdata.append(ss.str());
6487 }
6488 } else if (prefix == "osd crush rule ls-by-class") {
6489 string class_name;
6490 cmd_getval(cmdmap, "class", class_name);
6491 if (class_name.empty()) {
6492 ss << "no class specified";
6493 r = -EINVAL;
6494 goto reply;
6495 }
6496 set<int> rules;
6497 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6498 if (r < 0) {
6499 ss << "failed to get rules by class '" << class_name << "'";
6500 goto reply;
6501 }
6502 if (f) {
6503 f->open_array_section("rules");
6504 for (auto &rule: rules) {
6505 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6506 }
6507 f->close_section();
6508 f->flush(rdata);
6509 } else {
6510 ostringstream rs;
6511 for (auto &rule: rules) {
6512 rs << osdmap.crush->get_rule_name(rule) << "\n";
6513 }
6514 rdata.append(rs.str());
6515 }
6516 } else if (prefix == "osd crush rule dump") {
6517 string name;
6518 cmd_getval(cmdmap, "name", name);
6519 string format;
6520 cmd_getval(cmdmap, "format", format);
6521 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6522 if (name == "") {
6523 f->open_array_section("rules");
6524 osdmap.crush->dump_rules(f.get());
6525 f->close_section();
6526 } else {
6527 int ruleno = osdmap.crush->get_rule_id(name);
6528 if (ruleno < 0) {
6529 ss << "unknown crush rule '" << name << "'";
6530 r = ruleno;
6531 goto reply;
6532 }
6533 osdmap.crush->dump_rule(ruleno, f.get());
6534 }
6535 ostringstream rs;
6536 f->flush(rs);
6537 rs << "\n";
6538 rdata.append(rs.str());
6539 } else if (prefix == "osd crush dump") {
6540 string format;
6541 cmd_getval(cmdmap, "format", format);
6542 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6543 f->open_object_section("crush_map");
6544 osdmap.crush->dump(f.get());
6545 f->close_section();
6546 ostringstream rs;
6547 f->flush(rs);
6548 rs << "\n";
6549 rdata.append(rs.str());
6550 } else if (prefix == "osd crush show-tunables") {
6551 string format;
6552 cmd_getval(cmdmap, "format", format);
6553 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6554 f->open_object_section("crush_map_tunables");
6555 osdmap.crush->dump_tunables(f.get());
6556 f->close_section();
6557 ostringstream rs;
6558 f->flush(rs);
6559 rs << "\n";
6560 rdata.append(rs.str());
6561 } else if (prefix == "osd crush tree") {
6562 string shadow;
6563 cmd_getval(cmdmap, "shadow", shadow);
6564 bool show_shadow = shadow == "--show-shadow";
6565 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6566 if (f) {
6567 f->open_object_section("crush_tree");
6568 osdmap.crush->dump_tree(nullptr,
6569 f.get(),
6570 osdmap.get_pool_names(),
6571 show_shadow);
6572 f->close_section();
6573 f->flush(rdata);
6574 } else {
6575 ostringstream ss;
6576 osdmap.crush->dump_tree(&ss,
6577 nullptr,
6578 osdmap.get_pool_names(),
6579 show_shadow);
6580 rdata.append(ss.str());
6581 }
6582 } else if (prefix == "osd crush ls") {
6583 string name;
6584 if (!cmd_getval(cmdmap, "node", name)) {
6585 ss << "no node specified";
6586 r = -EINVAL;
6587 goto reply;
6588 }
6589 if (!osdmap.crush->name_exists(name)) {
6590 ss << "node '" << name << "' does not exist";
6591 r = -ENOENT;
6592 goto reply;
6593 }
6594 int id = osdmap.crush->get_item_id(name);
6595 list<int> result;
6596 if (id >= 0) {
6597 result.push_back(id);
6598 } else {
6599 int num = osdmap.crush->get_bucket_size(id);
6600 for (int i = 0; i < num; ++i) {
6601 result.push_back(osdmap.crush->get_bucket_item(id, i));
6602 }
6603 }
6604 if (f) {
6605 f->open_array_section("items");
6606 for (auto i : result) {
6607 f->dump_string("item", osdmap.crush->get_item_name(i));
6608 }
6609 f->close_section();
6610 f->flush(rdata);
6611 } else {
6612 ostringstream ss;
6613 for (auto i : result) {
6614 ss << osdmap.crush->get_item_name(i) << "\n";
6615 }
6616 rdata.append(ss.str());
6617 }
6618 r = 0;
6619 } else if (prefix == "osd crush class ls") {
6620 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6621 f->open_array_section("crush_classes");
6622 for (auto i : osdmap.crush->class_name)
6623 f->dump_string("class", i.second);
6624 f->close_section();
6625 f->flush(rdata);
6626 } else if (prefix == "osd crush class ls-osd") {
6627 string name;
6628 cmd_getval(cmdmap, "class", name);
6629 set<int> osds;
6630 osdmap.crush->get_devices_by_class(name, &osds);
6631 if (f) {
6632 f->open_array_section("osds");
6633 for (auto &osd: osds)
6634 f->dump_int("osd", osd);
6635 f->close_section();
6636 f->flush(rdata);
6637 } else {
6638 bool first = true;
6639 for (auto &osd : osds) {
6640 if (!first)
6641 ds << "\n";
6642 first = false;
6643 ds << osd;
6644 }
6645 rdata.append(ds);
6646 }
6647 } else if (prefix == "osd crush get-device-class") {
6648 vector<string> idvec;
6649 cmd_getval(cmdmap, "ids", idvec);
6650 map<int, string> class_by_osd;
6651 for (auto& id : idvec) {
6652 ostringstream ts;
6653 long osd = parse_osd_id(id.c_str(), &ts);
6654 if (osd < 0) {
6655 ss << "unable to parse osd id:'" << id << "'";
6656 r = -EINVAL;
6657 goto reply;
6658 }
6659 auto device_class = osdmap.crush->get_item_class(osd);
6660 if (device_class)
6661 class_by_osd[osd] = device_class;
6662 else
6663 class_by_osd[osd] = ""; // no class
6664 }
6665 if (f) {
6666 f->open_array_section("osd_device_classes");
6667 for (auto& i : class_by_osd) {
6668 f->open_object_section("osd_device_class");
6669 f->dump_int("osd", i.first);
6670 f->dump_string("device_class", i.second);
6671 f->close_section();
6672 }
6673 f->close_section();
6674 f->flush(rdata);
6675 } else {
6676 if (class_by_osd.size() == 1) {
6677 // for single input, make a clean output
6678 ds << class_by_osd.begin()->second;
6679 } else {
6680 // note that we do not group osds by class here
6681 for (auto it = class_by_osd.begin();
6682 it != class_by_osd.end();
6683 it++) {
6684 ds << "osd." << it->first << ' ' << it->second;
6685 if (next(it) != class_by_osd.end())
6686 ds << '\n';
6687 }
6688 }
6689 rdata.append(ds);
6690 }
6691 } else if (prefix == "osd erasure-code-profile ls") {
6692 const auto &profiles = osdmap.get_erasure_code_profiles();
6693 if (f)
6694 f->open_array_section("erasure-code-profiles");
6695 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6696 if (f)
6697 f->dump_string("profile", i->first.c_str());
6698 else
6699 rdata.append(i->first + "\n");
6700 }
6701 if (f) {
6702 f->close_section();
6703 ostringstream rs;
6704 f->flush(rs);
6705 rs << "\n";
6706 rdata.append(rs.str());
6707 }
6708 } else if (prefix == "osd crush weight-set ls") {
6709 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6710 if (f) {
6711 f->open_array_section("weight_sets");
6712 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6713 f->dump_string("pool", "(compat)");
6714 }
6715 for (auto& i : osdmap.crush->choose_args) {
6716 if (i.first >= 0) {
6717 f->dump_string("pool", osdmap.get_pool_name(i.first));
6718 }
6719 }
6720 f->close_section();
6721 f->flush(rdata);
6722 } else {
6723 ostringstream rs;
6724 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6725 rs << "(compat)\n";
6726 }
6727 for (auto& i : osdmap.crush->choose_args) {
6728 if (i.first >= 0) {
6729 rs << osdmap.get_pool_name(i.first) << "\n";
6730 }
6731 }
6732 rdata.append(rs.str());
6733 }
6734 } else if (prefix == "osd crush weight-set dump") {
6735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6736 "json-pretty"));
6737 osdmap.crush->dump_choose_args(f.get());
6738 f->flush(rdata);
6739 } else if (prefix == "osd erasure-code-profile get") {
6740 string name;
6741 cmd_getval(cmdmap, "name", name);
6742 if (!osdmap.has_erasure_code_profile(name)) {
6743 ss << "unknown erasure code profile '" << name << "'";
6744 r = -ENOENT;
6745 goto reply;
6746 }
6747 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6748 if (f)
6749 f->open_object_section("profile");
6750 for (map<string,string>::const_iterator i = profile.begin();
6751 i != profile.end();
6752 ++i) {
6753 if (f)
6754 f->dump_string(i->first.c_str(), i->second.c_str());
6755 else
6756 rdata.append(i->first + "=" + i->second + "\n");
6757 }
6758 if (f) {
6759 f->close_section();
6760 ostringstream rs;
6761 f->flush(rs);
6762 rs << "\n";
6763 rdata.append(rs.str());
6764 }
6765 } else if (prefix == "osd pool application get") {
6766 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6767 "json-pretty"));
6768 string pool_name;
6769 cmd_getval(cmdmap, "pool", pool_name);
6770 string app;
6771 cmd_getval(cmdmap, "app", app);
6772 string key;
6773 cmd_getval(cmdmap, "key", key);
6774
6775 if (pool_name.empty()) {
6776 // all
6777 f->open_object_section("pools");
6778 for (const auto &pool : osdmap.pools) {
6779 std::string name("<unknown>");
6780 const auto &pni = osdmap.pool_name.find(pool.first);
6781 if (pni != osdmap.pool_name.end())
6782 name = pni->second;
6783 f->open_object_section(name.c_str());
6784 for (auto &app_pair : pool.second.application_metadata) {
6785 f->open_object_section(app_pair.first.c_str());
6786 for (auto &kv_pair : app_pair.second) {
6787 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6788 }
6789 f->close_section();
6790 }
6791 f->close_section(); // name
6792 }
6793 f->close_section(); // pools
6794 f->flush(rdata);
6795 } else {
6796 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6797 if (pool < 0) {
6798 ss << "unrecognized pool '" << pool_name << "'";
6799 r = -ENOENT;
6800 goto reply;
6801 }
6802 auto p = osdmap.get_pg_pool(pool);
6803 // filter by pool
6804 if (app.empty()) {
6805 f->open_object_section(pool_name.c_str());
6806 for (auto &app_pair : p->application_metadata) {
6807 f->open_object_section(app_pair.first.c_str());
6808 for (auto &kv_pair : app_pair.second) {
6809 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6810 }
6811 f->close_section(); // application
6812 }
6813 f->close_section(); // pool_name
6814 f->flush(rdata);
6815 goto reply;
6816 }
6817
6818 auto app_it = p->application_metadata.find(app);
6819 if (app_it == p->application_metadata.end()) {
6820 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6821 r = -ENOENT;
6822 goto reply;
6823 }
6824 // filter by pool + app
6825 if (key.empty()) {
6826 f->open_object_section(app_it->first.c_str());
6827 for (auto &kv_pair : app_it->second) {
6828 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6829 }
6830 f->close_section(); // application
6831 f->flush(rdata);
6832 goto reply;
6833 }
6834 // filter by pool + app + key
6835 auto key_it = app_it->second.find(key);
6836 if (key_it == app_it->second.end()) {
6837 ss << "application '" << app << "' on pool '" << pool_name
6838 << "' does not have key '" << key << "'";
6839 r = -ENOENT;
6840 goto reply;
6841 }
6842 ss << key_it->second << "\n";
6843 rdata.append(ss.str());
6844 ss.str("");
6845 }
6846 } else if (prefix == "osd get-require-min-compat-client") {
6847 ss << osdmap.require_min_compat_client << std::endl;
6848 rdata.append(ss.str());
6849 ss.str("");
6850 goto reply;
6851 } else if (prefix == "osd pool application enable" ||
6852 prefix == "osd pool application disable" ||
6853 prefix == "osd pool application set" ||
6854 prefix == "osd pool application rm") {
6855 bool changed = false;
6856 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6857 if (r != 0) {
6858 // Error, reply.
6859 goto reply;
6860 } else if (changed) {
6861 // Valid mutation, proceed to prepare phase
6862 return false;
6863 } else {
6864 // Idempotent case, reply
6865 goto reply;
6866 }
6867 } else {
6868 // try prepare update
6869 return false;
6870 }
6871
6872 reply:
6873 string rs;
6874 getline(ss, rs);
6875 mon->reply_command(op, r, rs, rdata, get_last_committed());
6876 return true;
6877 }
6878
6879 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6880 {
6881 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6882 osdmap.get_pg_pool(pool_id));
6883 ceph_assert(pool);
6884 pool->set_flag(flags);
6885 }
6886
6887 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6888 {
6889 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6890 osdmap.get_pg_pool(pool_id));
6891 ceph_assert(pool);
6892 pool->unset_flag(flags);
6893 }
6894
6895 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6896 {
6897 char k[80];
6898 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6899 return k;
6900 }
6901
6902 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6903 {
6904 char k[80];
6905 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6906 (unsigned long long)pool, (unsigned long long)snap);
6907 return k;
6908 }
6909
// Encode the value for a purged-snap record covering [snap, snap+num)
// into *v, and return the store key it should be written under.
//
// @param pool   pool id
// @param snap   first snap in the purged interval
// @param num    number of snaps in the interval
// @param epoch  epoch at which the interval was purged (encoded in *v)
// @param v      [out] encoded value: begin snap, end snap, epoch
// @return the key for this record (keyed on the *last* snap; see below)
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // key on the *last* snap of the interval so that we can use forward
  // iteration only (lower_bound) to find the interval containing a snap.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
6921
6922
// Look up whether a pool's snap has already been recorded as purged in
// the mon store.  Records are keyed on the *last* snap of each purged
// interval (see make_purged_snap_key_value), so a lower_bound on the key
// for 'snap' lands on the only record whose interval could contain it.
//
// @param pool   pool id
// @param snap   snap id to look for
// @param begin  [out] start of the purged interval containing 'snap'
// @param end    [out] end (exclusive) of that interval
// @return 0 if 'snap' lies inside a recorded purged interval (and
//         *begin/*end are filled in), -ENOENT otherwise
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  // lower_bound may have run off the end of the purged_snap_* records
  // into keys with a different prefix
  if (it->key().find("purged_snap_") != 0) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on; it may belong
  // to the next pool rather than the one we asked about
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value encodes [begin, end) of the purged interval
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
6972
// Record the interval [start, end) of a pool's snaps as purged in the
// given transaction, coalescing with any immediately adjacent purged
// intervals already in the store so records stay maximal.
//
// NOTE(review): the 'epoch' argument is never used below; the value
// encoded into the record is pending_inc.epoch — confirm intent.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // is there an existing purged interval ending right where ours starts?
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // ...and/or one beginning right where ours ends?
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // adjacent on both sides: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // adjacent only on the left: extend the earlier record forward
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // adjacent only on the right: extend the later record backward.
    // records are keyed on their last snap, so the new record lands on
    // the same key and simply overwrites the old one.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: write a fresh record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7028
// Move snap intervals that the OSDs (via the mgr stat digest) report as
// purged into pending_inc.new_purged_snaps, at most
// mon_max_snap_prune_per_epoch snaps per epoch.
//
// @return true if anything was queued for pruning in the pending map
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // a configured value of 0 falls back to a large fixed batch
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7107
// Compare each pool's usage (from the mgr stat digest) against its quota
// and set or clear the FULL_QUOTA/FULL flags in the pending map
// accordingly.
//
// @return true if any pool flags were changed in pending_inc
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is full when either configured quota (bytes or objects)
    // has been reached; a quota of 0 means "no quota"
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota; clear if it has drained
      if (pool_is_full)
	continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently marked; set flags if it has filled up
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7168
7169 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7170 {
7171 op->mark_osdmon_event(__func__);
7172 auto m = op->get_req<MPoolOp>();
7173 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7174 MonSession *session = op->get_session();
7175 if (!session)
7176 return -EPERM;
7177 string erasure_code_profile;
7178 stringstream ss;
7179 string rule_name;
7180 int ret = 0;
7181 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7182 0, 0, 0, 0, 0, 0.0,
7183 erasure_code_profile,
7184 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7185 &ss);
7186
7187 if (ret < 0) {
7188 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7189 }
7190 return ret;
7191 }
7192
7193 int OSDMonitor::crush_rename_bucket(const string& srcname,
7194 const string& dstname,
7195 ostream *ss)
7196 {
7197 int ret;
7198 //
7199 // Avoid creating a pending crush if it does not already exists and
7200 // the rename would fail.
7201 //
7202 if (!_have_pending_crush()) {
7203 ret = _get_stable_crush().can_rename_bucket(srcname,
7204 dstname,
7205 ss);
7206 if (ret)
7207 return ret;
7208 }
7209
7210 CrushWrapper newcrush;
7211 _get_pending_crush(newcrush);
7212
7213 ret = newcrush.rename_bucket(srcname,
7214 dstname,
7215 ss);
7216 if (ret)
7217 return ret;
7218
7219 pending_inc.crush.clear();
7220 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7221 *ss << "renamed bucket " << srcname << " into " << dstname;
7222 return 0;
7223 }
7224
7225 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7226 {
7227 string replacement = "";
7228
7229 if (plugin == "jerasure_generic" ||
7230 plugin == "jerasure_sse3" ||
7231 plugin == "jerasure_sse4" ||
7232 plugin == "jerasure_neon") {
7233 replacement = "jerasure";
7234 } else if (plugin == "shec_generic" ||
7235 plugin == "shec_sse3" ||
7236 plugin == "shec_sse4" ||
7237 plugin == "shec_neon") {
7238 replacement = "shec";
7239 }
7240
7241 if (replacement != "") {
7242 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7243 << plugin << " that has been deprecated. Please use "
7244 << replacement << " instead." << dendl;
7245 }
7246 }
7247
7248 int OSDMonitor::normalize_profile(const string& profilename,
7249 ErasureCodeProfile &profile,
7250 bool force,
7251 ostream *ss)
7252 {
7253 ErasureCodeInterfaceRef erasure_code;
7254 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7255 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7256 check_legacy_ec_plugin(plugin->second, profilename);
7257 int err = instance.factory(plugin->second,
7258 g_conf().get_val<std::string>("erasure_code_dir"),
7259 profile, &erasure_code, ss);
7260 if (err) {
7261 return err;
7262 }
7263
7264 err = erasure_code->init(profile, ss);
7265 if (err) {
7266 return err;
7267 }
7268
7269 auto it = profile.find("stripe_unit");
7270 if (it != profile.end()) {
7271 string err_str;
7272 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7273 if (!err_str.empty()) {
7274 *ss << "could not parse stripe_unit '" << it->second
7275 << "': " << err_str << std::endl;
7276 return -EINVAL;
7277 }
7278 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7279 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7280 if (chunk_size != stripe_unit) {
7281 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7282 << "alignment. Would be padded to " << chunk_size
7283 << std::endl;
7284 return -EINVAL;
7285 }
7286 if ((stripe_unit % 4096) != 0 && !force) {
7287 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7288 << "use --force to override this check" << std::endl;
7289 return -EINVAL;
7290 }
7291 }
7292 return 0;
7293 }
7294
// Create (or find) the crush rule for an erasure coded pool based on an
// erasure code profile.
//
// Return values reflect where the rule was found or created:
//  -EEXIST   rule already exists in the committed crush map; *rule set
//  -EALREADY rule exists in the pending crush map; *rule set, caller
//            must wait for the pending map to be proposed
//   0        rule created in the pending crush map; *rule set
//  <0        other error (e.g. the profile's plugin failed to load)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already committed
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    // pending, not yet proposed
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // let the plugin build a rule appropriate for its layout
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    return 0;
  }
}
7331
// Instantiate the erasure code implementation for a named profile from
// the committed osdmap.
//
// @param erasure_code_profile  profile name to look up
// @param erasure_code          [out] instantiated plugin interface
// @param ss                    human readable error message, if any
// @return 0 on success; -EAGAIN if the profile is still pending and the
//         caller should retry after it commits; negative errno otherwise
int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
				 ErasureCodeInterfaceRef *erasure_code,
				 ostream *ss) const
{
  // a profile staged in pending_inc is not usable until it commits
  if (pending_inc.has_erasure_code_profile(erasure_code_profile))
    return -EAGAIN;
  ErasureCodeProfile profile =
    osdmap.get_erasure_code_profile(erasure_code_profile);
  ErasureCodeProfile::const_iterator plugin =
    profile.find("plugin");
  if (plugin == profile.end()) {
    *ss << "cannot determine the erasure code plugin"
	<< " because there is no 'plugin' entry in the erasure_code_profile "
	<< profile << std::endl;
    return -EINVAL;
  }
  check_legacy_ec_plugin(plugin->second, erasure_code_profile);
  ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
  return instance.factory(plugin->second,
			  g_conf().get_val<std::string>("erasure_code_dir"),
			  profile, erasure_code, ss);
}
7354
7355 int OSDMonitor::check_cluster_features(uint64_t features,
7356 stringstream &ss)
7357 {
7358 stringstream unsupported_ss;
7359 int unsupported_count = 0;
7360 if ((mon->get_quorum_con_features() & features) != features) {
7361 unsupported_ss << "the monitor cluster";
7362 ++unsupported_count;
7363 }
7364
7365 set<int32_t> up_osds;
7366 osdmap.get_up_osds(up_osds);
7367 for (set<int32_t>::iterator it = up_osds.begin();
7368 it != up_osds.end(); ++it) {
7369 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7370 if ((xi.features & features) != features) {
7371 if (unsupported_count > 0)
7372 unsupported_ss << ", ";
7373 unsupported_ss << "osd." << *it;
7374 unsupported_count ++;
7375 }
7376 }
7377
7378 if (unsupported_count > 0) {
7379 ss << "features " << features << " unsupported by: "
7380 << unsupported_ss.str();
7381 return -ENOTSUP;
7382 }
7383
7384 // check pending osd state, too!
7385 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7386 pending_inc.new_xinfo.begin();
7387 p != pending_inc.new_xinfo.end(); ++p) {
7388 const osd_xinfo_t &xi = p->second;
7389 if ((xi.features & features) != features) {
7390 dout(10) << __func__ << " pending osd." << p->first
7391 << " features are insufficient; retry" << dendl;
7392 return -EAGAIN;
7393 }
7394 }
7395
7396 return 0;
7397 }
7398
// Check whether a proposed crush map can be applied without requiring
// features beyond what connected clients (require_min_compat_client)
// and the mon/osd cluster support.
//
// @param newcrush  the candidate crush map
// @param ss        human readable reason when rejected
// @return true if the crush map is acceptable, false otherwise
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a scratch osdmap with the candidate crush map applied
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7432
7433 bool OSDMonitor::erasure_code_profile_in_use(
7434 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7435 const string &profile,
7436 ostream *ss)
7437 {
7438 bool found = false;
7439 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7440 p != pools.end();
7441 ++p) {
7442 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7443 *ss << osdmap.pool_name[p->first] << " ";
7444 found = true;
7445 }
7446 }
7447 if (found) {
7448 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7449 }
7450 return found;
7451 }
7452
// Build an erasure code profile map from "key=value" tokens supplied on
// the command line, layered on top of the configured default profile.
//
// If the user selects a plugin different from the default profile's
// plugin, the default entries are discarded entirely and only the
// user-supplied pairs are kept (mixing defaults from one plugin with
// another plugin's settings would be meaningless).
//
// @param erasure_code_profile       "key=value" (or bare "key") tokens
// @param erasure_code_profile_map   [out] resulting profile map
// @param ss                         human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // start from the configured default profile
  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
				    get_json_str_map,
				    *ss,
				    erasure_code_profile_map,
				    true);
  if (r)
    return r;
  ceph_assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key with no '=': record it with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      // the old "ruleset-*" property names were renamed to "crush-*"
      if (key.find("ruleset-") == 0) {
	*ss << "property '" << key << "' is no longer supported; try "
	    << "'crush-" << key.substr(8) << "' instead";
	return -EINVAL;
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // different plugin than the default: drop the default entries
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
7493
7494 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7495 const string &erasure_code_profile,
7496 uint8_t repl_size,
7497 unsigned *size, unsigned *min_size,
7498 ostream *ss)
7499 {
7500 int err = 0;
7501 switch (pool_type) {
7502 case pg_pool_t::TYPE_REPLICATED:
7503 if (repl_size == 0) {
7504 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7505 }
7506 *size = repl_size;
7507 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7508 break;
7509 case pg_pool_t::TYPE_ERASURE:
7510 {
7511 ErasureCodeInterfaceRef erasure_code;
7512 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7513 if (err == 0) {
7514 *size = erasure_code->get_chunk_count();
7515 *min_size =
7516 erasure_code->get_data_chunk_count() +
7517 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7518 assert(*min_size <= *size);
7519 assert(*min_size >= erasure_code->get_data_chunk_count());
7520 }
7521 }
7522 break;
7523 default:
7524 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7525 err = -EINVAL;
7526 break;
7527 }
7528 return err;
7529 }
7530
// Compute the stripe width for a new pool.  Replicated pools ignore it;
// for erasure pools it is data_chunks * chunk_size, where the stripe
// unit comes from the profile (if set) or the configured default.
//
// @param pool_type             TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile consulted for erasure pools
// @param stripe_width          [out] computed stripe width
// @param ss                    human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      // profile overrides the config default; the value was validated
      // when the profile was stored, hence the assert on parse success
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7569
// Resolve the crush rule for a new pool.
//
// If *crush_rule is already >= 0, just verify it exists.  Otherwise,
// for replicated pools use the named rule (or the configured default
// when the name is empty); for erasure pools create or find a rule from
// the erasure code profile, which may require waiting (-EAGAIN) for a
// pending crush change to commit.
//
// @param pool_type             TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile used to create erasure rules
// @param rule_name             requested rule name, may be empty
// @param crush_rule            [in,out] rule id; <0 means "resolve it"
// @param ss                    human readable error message, if any
// @return 0 on success, negative errno on failure
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
                << "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// map crush_rule_create_erasure's three success-ish outcomes:
	// 0/-EALREADY mean the rule is only pending, so retry later;
	// -EEXIST means it is committed and usable now.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7631
7632 int OSDMonitor::get_crush_rule(const string &rule_name,
7633 int *crush_rule,
7634 ostream *ss)
7635 {
7636 int ret;
7637 ret = osdmap.crush->get_rule_id(rule_name);
7638 if (ret != -ENOENT) {
7639 // found it, use it
7640 *crush_rule = ret;
7641 } else {
7642 CrushWrapper newcrush;
7643 _get_pending_crush(newcrush);
7644
7645 ret = newcrush.get_rule_id(rule_name);
7646 if (ret != -ENOENT) {
7647 // found it, wait for it to be proposed
7648 dout(20) << __func__ << ": rule " << rule_name
7649 << " try again" << dendl;
7650 return -EAGAIN;
7651 } else {
7652 // Cannot find it , return error
7653 *ss << "specified rule " << rule_name << " doesn't exist";
7654 return ret;
7655 }
7656 }
7657 return 0;
7658 }
7659
7660 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7661 {
7662 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7663 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7664 auto max_pgs = max_pgs_per_osd * num_osds;
7665 uint64_t projected = 0;
7666 if (pool < 0) {
7667 projected += pg_num * size;
7668 }
7669 for (const auto& i : osdmap.get_pools()) {
7670 if (i.first == pool) {
7671 projected += pg_num * size;
7672 } else {
7673 projected += i.second.get_pg_num_target() * i.second.get_size();
7674 }
7675 }
7676 if (projected > max_pgs) {
7677 if (pool >= 0) {
7678 *ss << "pool id " << pool;
7679 }
7680 *ss << " pg_num " << pg_num << " size " << size
7681 << " would mean " << projected
7682 << " total pgs, which exceeds max " << max_pgs
7683 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7684 << " * num_in_osds " << num_osds << ")";
7685 return -ERANGE;
7686 }
7687 return 0;
7688 }
7689
7690 /**
7691 * @param name The name of the new pool
7692 * @param crush_rule The crush rule to use. If <0, will use the system default
7693 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7694 * @param pg_num The pg_num to use. If set to 0, will use the system default
7695 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7696 * @param repl_size Replication factor, or 0 for default
7697 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7698 * @param pool_type TYPE_ERASURE, or TYPE_REP
7699 * @param expected_num_objects expected number of objects on the pool
7700 * @param fast_read fast read type.
7701 * @param ss human readable error message, if any.
7702 *
7703 * @return 0 on success, negative errno on failure.
7704 */
7705 int OSDMonitor::prepare_new_pool(string& name,
7706 int crush_rule,
7707 const string &crush_rule_name,
7708 unsigned pg_num, unsigned pgp_num,
7709 unsigned pg_num_min,
7710 const uint64_t repl_size,
7711 const uint64_t target_size_bytes,
7712 const float target_size_ratio,
7713 const string &erasure_code_profile,
7714 const unsigned pool_type,
7715 const uint64_t expected_num_objects,
7716 FastReadType fast_read,
7717 const string& pg_autoscale_mode,
7718 ostream *ss)
7719 {
7720 if (name.length() == 0)
7721 return -EINVAL;
7722 if (pg_num == 0)
7723 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7724 if (pgp_num == 0)
7725 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7726 if (!pgp_num)
7727 pgp_num = pg_num;
7728 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7729 *ss << "'pg_num' must be greater than 0 and less than or equal to "
7730 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7731 << " (you may adjust 'mon max pool pg num' for higher values)";
7732 return -ERANGE;
7733 }
7734 if (pgp_num > pg_num) {
7735 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7736 << ", which in this case is " << pg_num;
7737 return -ERANGE;
7738 }
7739 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7740 *ss << "'fast_read' can only apply to erasure coding pool";
7741 return -EINVAL;
7742 }
7743 int r;
7744 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7745 crush_rule_name, &crush_rule, ss);
7746 if (r) {
7747 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7748 return r;
7749 }
7750 if (g_conf()->mon_osd_crush_smoke_test) {
7751 CrushWrapper newcrush;
7752 _get_pending_crush(newcrush);
7753 ostringstream err;
7754 CrushTester tester(newcrush, err);
7755 tester.set_min_x(0);
7756 tester.set_max_x(50);
7757 tester.set_rule(crush_rule);
7758 auto start = ceph::coarse_mono_clock::now();
7759 r = tester.test_with_fork(g_conf()->mon_lease);
7760 auto duration = ceph::coarse_mono_clock::now() - start;
7761 if (r < 0) {
7762 dout(10) << "tester.test_with_fork returns " << r
7763 << ": " << err.str() << dendl;
7764 *ss << "crush test failed with " << r << ": " << err.str();
7765 return r;
7766 }
7767 dout(10) << __func__ << " crush smoke test duration: "
7768 << duration << dendl;
7769 }
7770 unsigned size, min_size;
7771 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7772 &size, &min_size, ss);
7773 if (r) {
7774 dout(10) << "prepare_pool_size returns " << r << dendl;
7775 return r;
7776 }
7777 r = check_pg_num(-1, pg_num, size, ss);
7778 if (r) {
7779 dout(10) << "check_pg_num returns " << r << dendl;
7780 return r;
7781 }
7782
7783 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7784 return -EINVAL;
7785 }
7786
7787 uint32_t stripe_width = 0;
7788 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7789 if (r) {
7790 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7791 return r;
7792 }
7793
7794 bool fread = false;
7795 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7796 switch (fast_read) {
7797 case FAST_READ_OFF:
7798 fread = false;
7799 break;
7800 case FAST_READ_ON:
7801 fread = true;
7802 break;
7803 case FAST_READ_DEFAULT:
7804 fread = g_conf()->osd_pool_default_ec_fast_read;
7805 break;
7806 default:
7807 *ss << "invalid fast_read setting: " << fast_read;
7808 return -EINVAL;
7809 }
7810 }
7811
7812 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7813 p != pending_inc.new_pool_names.end();
7814 ++p) {
7815 if (p->second == name)
7816 return 0;
7817 }
7818
7819 if (-1 == pending_inc.new_pool_max)
7820 pending_inc.new_pool_max = osdmap.pool_max;
7821 int64_t pool = ++pending_inc.new_pool_max;
7822 pg_pool_t empty;
7823 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
7824 pi->create_time = ceph_clock_now();
7825 pi->type = pool_type;
7826 pi->fast_read = fread;
7827 pi->flags = g_conf()->osd_pool_default_flags;
7828 if (g_conf()->osd_pool_default_flag_hashpspool)
7829 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
7830 if (g_conf()->osd_pool_default_flag_nodelete)
7831 pi->set_flag(pg_pool_t::FLAG_NODELETE);
7832 if (g_conf()->osd_pool_default_flag_nopgchange)
7833 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
7834 if (g_conf()->osd_pool_default_flag_nosizechange)
7835 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7836 pi->set_flag(pg_pool_t::FLAG_CREATING);
7837 if (g_conf()->osd_pool_use_gmt_hitset)
7838 pi->use_gmt_hitset = true;
7839 else
7840 pi->use_gmt_hitset = false;
7841
7842 pi->size = size;
7843 pi->min_size = min_size;
7844 pi->crush_rule = crush_rule;
7845 pi->expected_num_objects = expected_num_objects;
7846 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7847
7848 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7849 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7850 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7851 pi->pg_autoscale_mode = m;
7852 } else {
7853 pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
7854 }
7855 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7856 pi->set_pg_num(
7857 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7858 : pg_num);
7859 pi->set_pg_num_pending(pi->get_pg_num());
7860 pi->set_pg_num_target(pg_num);
7861 pi->set_pgp_num(pi->get_pg_num());
7862 pi->set_pgp_num_target(pgp_num);
7863 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7864 pg_num_min) {
7865 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7866 }
7867 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7868 pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7869 pi->pg_autoscale_mode = m;
7870 }
7871
7872 pi->last_change = pending_inc.epoch;
7873 pi->auid = 0;
7874
7875 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7876 pi->erasure_code_profile = erasure_code_profile;
7877 } else {
7878 pi->erasure_code_profile = "";
7879 }
7880 pi->stripe_width = stripe_width;
7881
7882 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7883 target_size_bytes) {
7884 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7885 // larger than int32_t max.
7886 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7887 }
7888 if (target_size_ratio > 0.0 &&
7889 osdmap.require_osd_release >= ceph_release_t::nautilus) {
7890 // only store for nautilus+, just to be consistent and tidy.
7891 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7892 }
7893
7894 pi->cache_target_dirty_ratio_micro =
7895 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7896 pi->cache_target_dirty_high_ratio_micro =
7897 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7898 pi->cache_target_full_ratio_micro =
7899 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7900 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7901 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7902
7903 pending_inc.new_pool_names[pool] = name;
7904 return 0;
7905 }
7906
7907 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7908 {
7909 op->mark_osdmon_event(__func__);
7910 ostringstream ss;
7911 if (pending_inc.new_flags < 0)
7912 pending_inc.new_flags = osdmap.get_flags();
7913 pending_inc.new_flags |= flag;
7914 ss << OSDMap::get_flag_string(flag) << " is set";
7915 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7916 get_last_committed() + 1));
7917 return true;
7918 }
7919
7920 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7921 {
7922 op->mark_osdmon_event(__func__);
7923 ostringstream ss;
7924 if (pending_inc.new_flags < 0)
7925 pending_inc.new_flags = osdmap.get_flags();
7926 pending_inc.new_flags &= ~flag;
7927 ss << OSDMap::get_flag_string(flag) << " is unset";
7928 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7929 get_last_committed() + 1));
7930 return true;
7931 }
7932
/**
 * Handle "osd pool set <pool> <var> <val>": update a single field of an
 * existing pool and stage the result in the pending incremental map.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val")
 * @param ss human-readable success or error message for the client
 * @return 0 on success (pool staged in pending_inc.new_pools),
 *         negative errno on error
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // start from the committed pool, but fold in any change already staged
  // in this proposal so consecutive commands compose correctly.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor).  parse out int or float values from the
  // string as needed.  however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;  // parse errors; checked per-variable below
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cmdmap, "val", val);

  // variables parsed with SI suffixes (base 1000) vs IEC suffixes (base 1024)
  auto si_options = {
    "target_max_objects"
  };
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // cache-tier-only variables are rejected on pools that are not tiers
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    // verify the per-osd pg limit would not be exceeded at the new size
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // min_size is re-derived from the new size
    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools, min_size must cover at least the data chunks (k)
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // directly adjust the concrete pg_num (normally the mgr drives this
    // toward pg_num_target)
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // split
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // merge: staged via pg_num_pending, nautilus+ only
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    // set the *target* pg_num; on nautilus+ the mgr converges pg_num_actual
    // and pgp_num toward it gradually
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    // directly adjust the concrete pgp_num
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    // set the *target* pgp_num
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // like the flags above, but changing it remaps every PG, so require
    // explicit confirmation
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: overwrites cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // stored in micro units (uf = f * 1e6, computed above)
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic pool option, stored in p.opts; a value of "unset" (or 0 for
    // int/double options) removes the option.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
	ss << "error parsing unit value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "must set require_osd_release to nautilus or "
	   << "later before setting target_size_bytes";
	return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // after per-option validation, store/unset according to the option's
    // declared type
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending incremental
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8578
8579 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8580 const cmdmap_t& cmdmap,
8581 stringstream& ss)
8582 {
8583 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8584 }
8585
8586 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8587 const cmdmap_t& cmdmap,
8588 stringstream& ss,
8589 bool *modified)
8590 {
8591 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8592 }
8593
8594
8595 /**
8596 * Common logic for preprocess and prepare phases of pool application
8597 * tag commands. In preprocess mode we're only detecting invalid
8598 * commands, and determining whether it was a modification or a no-op.
8599 * In prepare mode we're actually updating the pending state.
8600 */
8601 int OSDMonitor::_command_pool_application(const string &prefix,
8602 const cmdmap_t& cmdmap,
8603 stringstream& ss,
8604 bool *modified,
8605 bool preparing)
8606 {
8607 string pool_name;
8608 cmd_getval(cmdmap, "pool", pool_name);
8609 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8610 if (pool < 0) {
8611 ss << "unrecognized pool '" << pool_name << "'";
8612 return -ENOENT;
8613 }
8614
8615 pg_pool_t p = *osdmap.get_pg_pool(pool);
8616 if (preparing) {
8617 if (pending_inc.new_pools.count(pool)) {
8618 p = pending_inc.new_pools[pool];
8619 }
8620 }
8621
8622 string app;
8623 cmd_getval(cmdmap, "app", app);
8624 bool app_exists = (p.application_metadata.count(app) > 0);
8625
8626 string key;
8627 cmd_getval(cmdmap, "key", key);
8628 if (key == "all") {
8629 ss << "key cannot be 'all'";
8630 return -EINVAL;
8631 }
8632
8633 string value;
8634 cmd_getval(cmdmap, "value", value);
8635 if (value == "all") {
8636 ss << "value cannot be 'all'";
8637 return -EINVAL;
8638 }
8639
8640 if (boost::algorithm::ends_with(prefix, "enable")) {
8641 if (app.empty()) {
8642 ss << "application name must be provided";
8643 return -EINVAL;
8644 }
8645
8646 if (p.is_tier()) {
8647 ss << "application must be enabled on base tier";
8648 return -EINVAL;
8649 }
8650
8651 bool force = false;
8652 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8653
8654 if (!app_exists && !p.application_metadata.empty() && !force) {
8655 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8656 << "application; pass --yes-i-really-mean-it to proceed anyway";
8657 return -EPERM;
8658 }
8659
8660 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8661 ss << "too many enabled applications on pool '" << pool_name << "'; "
8662 << "max " << MAX_POOL_APPLICATIONS;
8663 return -EINVAL;
8664 }
8665
8666 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8667 ss << "application name '" << app << "' too long; max length "
8668 << MAX_POOL_APPLICATION_LENGTH;
8669 return -EINVAL;
8670 }
8671
8672 if (!app_exists) {
8673 p.application_metadata[app] = {};
8674 }
8675 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8676
8677 } else if (boost::algorithm::ends_with(prefix, "disable")) {
8678 bool force = false;
8679 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8680
8681 if (!force) {
8682 ss << "Are you SURE? Disabling an application within a pool might result "
8683 << "in loss of application functionality; pass "
8684 << "--yes-i-really-mean-it to proceed anyway";
8685 return -EPERM;
8686 }
8687
8688 if (!app_exists) {
8689 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8690 << "'";
8691 return 0; // idempotent
8692 }
8693
8694 p.application_metadata.erase(app);
8695 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8696
8697 } else if (boost::algorithm::ends_with(prefix, "set")) {
8698 if (p.is_tier()) {
8699 ss << "application metadata must be set on base tier";
8700 return -EINVAL;
8701 }
8702
8703 if (!app_exists) {
8704 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8705 << "'";
8706 return -ENOENT;
8707 }
8708
8709 string key;
8710 cmd_getval(cmdmap, "key", key);
8711
8712 if (key.empty()) {
8713 ss << "key must be provided";
8714 return -EINVAL;
8715 }
8716
8717 auto &app_keys = p.application_metadata[app];
8718 if (app_keys.count(key) == 0 &&
8719 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8720 ss << "too many keys set for application '" << app << "' on pool '"
8721 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8722 return -EINVAL;
8723 }
8724
8725 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8726 ss << "key '" << app << "' too long; max length "
8727 << MAX_POOL_APPLICATION_LENGTH;
8728 return -EINVAL;
8729 }
8730
8731 string value;
8732 cmd_getval(cmdmap, "value", value);
8733 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8734 ss << "value '" << value << "' too long; max length "
8735 << MAX_POOL_APPLICATION_LENGTH;
8736 return -EINVAL;
8737 }
8738
8739 p.application_metadata[app][key] = value;
8740 ss << "set application '" << app << "' key '" << key << "' to '"
8741 << value << "' on pool '" << pool_name << "'";
8742 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8743 if (!app_exists) {
8744 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8745 << "'";
8746 return -ENOENT;
8747 }
8748
8749 string key;
8750 cmd_getval(cmdmap, "key", key);
8751 auto it = p.application_metadata[app].find(key);
8752 if (it == p.application_metadata[app].end()) {
8753 ss << "application '" << app << "' on pool '" << pool_name
8754 << "' does not have key '" << key << "'";
8755 return 0; // idempotent
8756 }
8757
8758 p.application_metadata[app].erase(it);
8759 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8760 << pool_name << "'";
8761 } else {
8762 ceph_abort();
8763 }
8764
8765 if (preparing) {
8766 p.last_change = pending_inc.epoch;
8767 pending_inc.new_pools[pool] = p;
8768 }
8769
8770 // Because we fell through this far, we didn't hit no-op cases,
8771 // so pool was definitely modified
8772 if (modified != nullptr) {
8773 *modified = true;
8774 }
8775
8776 return 0;
8777 }
8778
8779 int OSDMonitor::_prepare_command_osd_crush_remove(
8780 CrushWrapper &newcrush,
8781 int32_t id,
8782 int32_t ancestor,
8783 bool has_ancestor,
8784 bool unlink_only)
8785 {
8786 int err = 0;
8787
8788 if (has_ancestor) {
8789 err = newcrush.remove_item_under(cct, id, ancestor,
8790 unlink_only);
8791 } else {
8792 err = newcrush.remove_item(cct, id, unlink_only);
8793 }
8794 return err;
8795 }
8796
// Stage the (already-modified) crush map into the pending incremental,
// replacing any crush payload previously staged in this proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  // encode with the quorum's feature set so all monitors can decode it
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8802
8803 int OSDMonitor::prepare_command_osd_crush_remove(
8804 CrushWrapper &newcrush,
8805 int32_t id,
8806 int32_t ancestor,
8807 bool has_ancestor,
8808 bool unlink_only)
8809 {
8810 int err = _prepare_command_osd_crush_remove(
8811 newcrush, id, ancestor,
8812 has_ancestor, unlink_only);
8813
8814 if (err < 0)
8815 return err;
8816
8817 ceph_assert(err == 0);
8818 do_osd_crush_remove(newcrush);
8819
8820 return 0;
8821 }
8822
8823 int OSDMonitor::prepare_command_osd_remove(int32_t id)
8824 {
8825 if (osdmap.is_up(id)) {
8826 return -EBUSY;
8827 }
8828
8829 pending_inc.new_state[id] = osdmap.get_state(id);
8830 pending_inc.new_uuid[id] = uuid_d();
8831 pending_metadata_rm.insert(id);
8832 pending_metadata.erase(id);
8833
8834 return 0;
8835 }
8836
8837 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8838 {
8839 ceph_assert(existing_id);
8840 *existing_id = -1;
8841
8842 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8843 if (!osdmap.exists(i) &&
8844 pending_inc.new_up_client.count(i) == 0 &&
8845 (pending_inc.new_state.count(i) == 0 ||
8846 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8847 *existing_id = i;
8848 return -1;
8849 }
8850 }
8851
8852 if (pending_inc.new_max_osd < 0) {
8853 return osdmap.get_max_osd();
8854 }
8855 return pending_inc.new_max_osd;
8856 }
8857
// Stage the creation of an osd in the pending incremental, either reusing
// the id bound to `uuid`, honoring a caller-supplied `id`, or allocating a
// fresh one. Validation is presumed to have happened already (see
// validate_osd_create()); invariants are enforced with asserts.
// On return *new_id holds the id used. A non-empty device_class is also
// staged into the pending crush map (failures there are non-fatal, for
// replay idempotency).
8858 void OSDMonitor::do_osd_create(
8859 const int32_t id,
8860 const uuid_d& uuid,
8861 const string& device_class,
8862 int32_t* new_id)
8863 {
8864 dout(10) << __func__ << " uuid " << uuid << dendl;
8865 ceph_assert(new_id);
8866
8867 // We presume validation has been performed prior to calling this
8868 // function. We assert with prejudice.
8869
8870 int32_t allocated_id = -1; // declare here so we can jump
8871 int32_t existing_id = -1;
8872 if (!uuid.is_zero()) {
// if the uuid is already bound to an id, reuse it (idempotent replay);
// otherwise honor an explicitly requested id.
8873 existing_id = osdmap.identify_osd(uuid);
8874 if (existing_id >= 0) {
8875 ceph_assert(id < 0 || id == existing_id);
8876 *new_id = existing_id;
8877 goto out;
8878 } else if (id >= 0) {
8879 // uuid does not exist, and id has been provided, so just create
8880 // the new osd.id
8881 *new_id = id;
8882 goto out;
8883 }
8884 }
8885
8886 // allocate a new id
8887 allocated_id = _allocate_osd_id(&existing_id);
8888 dout(10) << __func__ << " allocated id " << allocated_id
8889 << " existing id " << existing_id << dendl;
8890 if (existing_id >= 0) {
// reusing a free slot below max_osd; mark the weight OUT so the new
// osd does not receive data before it boots and marks itself in.
8891 ceph_assert(existing_id < osdmap.get_max_osd());
8892 ceph_assert(allocated_id < 0);
8893 pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
8894 *new_id = existing_id;
8895 } else if (allocated_id >= 0) {
8896 ceph_assert(existing_id < 0);
8897 // raise max_osd
8898 if (pending_inc.new_max_osd < 0) {
8899 pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
8900 } else {
8901 ++pending_inc.new_max_osd;
8902 }
8903 *new_id = pending_inc.new_max_osd - 1;
8904 ceph_assert(*new_id == allocated_id);
8905 } else {
8906 ceph_abort_msg("unexpected condition");
8907 }
8908
8909 out:
8910 if (device_class.size()) {
// stage the device class in a working copy of the pending crush map
8911 CrushWrapper newcrush;
8912 _get_pending_crush(newcrush);
8913 if (newcrush.get_max_devices() < *new_id + 1) {
8914 newcrush.set_max_devices(*new_id + 1);
8915 }
8916 string name = string("osd.") + stringify(*new_id);
8917 if (!newcrush.item_exists(*new_id)) {
8918 newcrush.set_item_name(*new_id, name);
8919 }
8920 ostringstream ss;
8921 int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
8922 if (r < 0) {
8923 derr << __func__ << " failed to set " << name << " device_class "
8924 << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
8925 << dendl;
8926 // non-fatal... this might be a replay and we want to be idempotent.
8927 } else {
8928 dout(20) << __func__ << " set " << name << " device_class " << device_class
8929 << dendl;
8930 pending_inc.crush.clear();
8931 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8932 }
8933 } else {
8934 dout(20) << __func__ << " no device_class" << dendl;
8935 }
8936
8937 dout(10) << __func__ << " using id " << *new_id << dendl;
// make sure max_osd covers the chosen id (it may have come from the
// uuid/explicit-id paths above, which skip the allocation logic)
8938 if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
8939 pending_inc.new_max_osd = *new_id + 1;
8940 }
8941
8942 pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
8943 if (!uuid.is_zero())
8944 pending_inc.new_uuid[*new_id] = uuid;
8945 }
8946
// Check whether an osd with the given id/uuid may be created.
//
// Return values:
//   0        - creation can proceed (or there was nothing to validate)
//   EEXIST   - (positive!) osd already exists with matching id/uuid; the
//              operation would be idempotent and *existing_id is set
//   -EAGAIN  - an osd with this uuid or id is still pending in the
//              uncommitted incremental; caller should retry later
//   -EEXIST  - uuid is already bound to a different id
//   -EINVAL  - id exists (when check_osd_exists) but with another uuid
8947 int OSDMonitor::validate_osd_create(
8948 const int32_t id,
8949 const uuid_d& uuid,
8950 const bool check_osd_exists,
8951 int32_t* existing_id,
8952 stringstream& ss)
8953 {
8954
8955 dout(10) << __func__ << " id " << id << " uuid " << uuid
8956 << " check_osd_exists " << check_osd_exists << dendl;
8957
8958 ceph_assert(existing_id);
8959
8960 if (id < 0 && uuid.is_zero()) {
8961 // we have nothing to validate
8962 *existing_id = -1;
8963 return 0;
8964 } else if (uuid.is_zero()) {
8965 // we have an id but we will ignore it - because that's what
8966 // `osd create` does.
8967 return 0;
8968 }
8969
8970 /*
8971 * This function will be used to validate whether we are able to
8972 * create a new osd when the `uuid` is specified.
8973 *
8974 * It will be used by both `osd create` and `osd new`, as the checks
8975 * are basically the same when it pertains to osd id and uuid validation.
8976 * However, `osd create` presumes an `uuid` is optional, for legacy
8977 * reasons, while `osd new` requires the `uuid` to be provided. This
8978 * means that `osd create` will not be idempotent if an `uuid` is not
8979 * provided, but we will always guarantee the idempotency of `osd new`.
8980 */
8981
8982 ceph_assert(!uuid.is_zero());
8983 if (pending_inc.identify_osd(uuid) >= 0) {
8984 // osd is about to exist
8985 return -EAGAIN;
8986 }
8987
8988 int32_t i = osdmap.identify_osd(uuid);
8989 if (i >= 0) {
8990 // osd already exists
8991 if (id >= 0 && i != id) {
8992 ss << "uuid " << uuid << " already in use for different id " << i;
8993 return -EEXIST;
8994 }
8995 // return a positive errno to distinguish between a blocking error
8996 // and an error we consider to not be a problem (i.e., this would be
8997 // an idempotent operation).
8998 *existing_id = i;
8999 return EEXIST;
9000 }
9001 // i < 0
9002 if (id >= 0) {
9003 if (pending_inc.new_state.count(id)) {
9004 // osd is about to exist
9005 return -EAGAIN;
9006 }
9007 // we may not care if an osd exists if we are recreating a previously
9008 // destroyed osd.
9009 if (check_osd_exists && osdmap.exists(id)) {
9010 ss << "id " << id << " already in use and does not match uuid "
9011 << uuid;
9012 return -EINVAL;
9013 }
9014 }
9015 return 0;
9016 }
9017
9018 int OSDMonitor::prepare_command_osd_create(
9019 const int32_t id,
9020 const uuid_d& uuid,
9021 int32_t* existing_id,
9022 stringstream& ss)
9023 {
9024 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9025 ceph_assert(existing_id);
9026 if (osdmap.is_destroyed(id)) {
9027 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9028 "instead.";
9029 return -EINVAL;
9030 }
9031
9032 if (uuid.is_zero()) {
9033 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9034 }
9035
9036 return validate_osd_create(id, uuid, true, existing_id, ss);
9037 }
9038
// Handle `osd new`: create a brand new osd, or recreate a previously
// destroyed one, together with its cephx secret and (optionally) the
// dm-crypt lockbox key. `uuid` is mandatory; `id` is optional. The
// command is idempotent when the osd already exists and all supplied
// secrets match.
//
// Returns 0 on success, positive EEXIST when the request was a no-op,
// or a negative errno on error. The caller is responsible for proposing
// the pending maps (paxos must be plugged).
9039 int OSDMonitor::prepare_command_osd_new(
9040 MonOpRequestRef op,
9041 const cmdmap_t& cmdmap,
9042 const map<string,string>& params,
9043 stringstream &ss,
9044 Formatter *f)
9045 {
9046 uuid_d uuid;
9047 string uuidstr;
9048 int64_t id = -1;
9049
9050 ceph_assert(paxos->is_plugged());
9051
9052 dout(10) << __func__ << " " << op << dendl;
9053
9054 /* validate command. abort now if something's wrong. */
9055
9056 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9057 *
9058 * If `id` is not specified, we will identify any existing osd based
9059 * on `uuid`. Operation will be idempotent iff secrets match.
9060 *
9061 * If `id` is specified, we will identify any existing osd based on
9062 * `uuid` and match against `id`. If they match, operation will be
9063 * idempotent iff secrets match.
9064 *
9065 * `-i secrets.json` will be optional. If supplied, will be used
9066 * to check for idempotency when `id` and `uuid` match.
9067 *
9068 * If `id` is not specified, and `uuid` does not exist, an id will
9069 * be found or allocated for the osd.
9070 *
9071 * If `id` is specified, and the osd has been previously marked
9072 * as destroyed, then the `id` will be reused.
9073 */
9074 if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
9075 ss << "requires the OSD's UUID to be specified.";
9076 return -EINVAL;
9077 } else if (!uuid.parse(uuidstr.c_str())) {
9078 ss << "invalid UUID value '" << uuidstr << "'.";
9079 return -EINVAL;
9080 }
9081
9082 if (cmd_getval(cmdmap, "id", id) &&
9083 (id < 0)) {
9084 ss << "invalid OSD id; must be greater or equal than zero.";
9085 return -EINVAL;
9086 }
9087
9088 // are we running an `osd create`-like command, or recreating
9089 // a previously destroyed osd?
9090
9091 bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));
9092
9093 // we will care about `id` to assess whether osd is `destroyed`, or
9094 // to create a new osd.
9095 // we will need an `id` by the time we reach auth.
9096
9097 int32_t existing_id = -1;
9098 int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
9099 &existing_id, ss);
9100
9101 bool may_be_idempotent = false;
9102 if (err == EEXIST) {
9103 // this is idempotent from the osdmon's point-of-view
9104 may_be_idempotent = true;
9105 ceph_assert(existing_id >= 0);
9106 id = existing_id;
9107 } else if (err < 0) {
9108 return err;
9109 }
9110
9111 if (!may_be_idempotent) {
9112 // idempotency is out of the window. We are either creating a new
9113 // osd or recreating a destroyed osd.
9114 //
9115 // We now need to figure out if we have an `id` (and if it's valid),
9116 // of find an `id` if we don't have one.
9117
9118 // NOTE: we need to consider the case where the `id` is specified for
9119 // `osd create`, and we must honor it. So this means checking if
9120 // the `id` is destroyed, and if so assume the destroy; otherwise,
9121 // check if it `exists` - in which case we complain about not being
9122 // `destroyed`. In the end, if nothing fails, we must allow the
9123 // creation, so that we are compatible with `create`.
9124 if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
9125 dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
9126 ss << "OSD " << id << " has not yet been destroyed";
9127 return -EINVAL;
9128 } else if (id < 0) {
9129 // find an `id`
9130 id = _allocate_osd_id(&existing_id);
9131 if (id < 0) {
// _allocate_osd_id() reported a reusable slot via existing_id
9132 ceph_assert(existing_id >= 0);
9133 id = existing_id;
9134 }
9135 dout(10) << __func__ << " found id " << id << " to use" << dendl;
9136 } else if (id >= 0 && osdmap.is_destroyed(id)) {
9137 dout(10) << __func__ << " recreating osd." << id << dendl;
9138 } else {
9139 dout(10) << __func__ << " creating new osd." << id << dendl;
9140 }
9141 } else {
9142 ceph_assert(id >= 0);
9143 ceph_assert(osdmap.exists(id));
9144 }
9145
9146 // we are now able to either create a brand new osd or reuse an existing
9147 // osd that has been previously destroyed.
9148
9149 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9150
9151 if (may_be_idempotent && params.empty()) {
9152 // nothing to do, really.
9153 dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
9154 ceph_assert(id >= 0);
9155 if (f) {
9156 f->open_object_section("created_osd");
9157 f->dump_int("osdid", id);
9158 f->close_section();
9159 } else {
9160 ss << id;
9161 }
9162 return EEXIST;
9163 }
9164
9165 string device_class;
9166 auto p = params.find("crush_device_class");
9167 if (p != params.end()) {
9168 device_class = p->second;
9169 dout(20) << __func__ << " device_class will be " << device_class << dendl;
9170 }
9171 string cephx_secret, lockbox_secret, dmcrypt_key;
9172 bool has_lockbox = false;
9173 bool has_secrets = params.count("cephx_secret")
9174 || params.count("cephx_lockbox_secret")
9175 || params.count("dmcrypt_key");
9176
9177 ConfigKeyService *svc = nullptr;
9178 AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;
9179
// validate the supplied secrets against authmon / config-key store
// before we stage any change, so a mismatch aborts with no side effects.
9180 if (has_secrets) {
9181 if (params.count("cephx_secret") == 0) {
9182 ss << "requires a cephx secret.";
9183 return -EINVAL;
9184 }
9185 cephx_secret = params.at("cephx_secret");
9186
9187 bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
9188 bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);
9189
9190 dout(10) << __func__ << " has lockbox " << has_lockbox_secret
9191 << " dmcrypt " << has_dmcrypt_key << dendl;
9192
// lockbox secret and dm-crypt key must be supplied together or not at all
9193 if (has_lockbox_secret && has_dmcrypt_key) {
9194 has_lockbox = true;
9195 lockbox_secret = params.at("cephx_lockbox_secret");
9196 dmcrypt_key = params.at("dmcrypt_key");
9197 } else if (!has_lockbox_secret != !has_dmcrypt_key) {
9198 ss << "requires both a cephx lockbox secret and a dm-crypt key.";
9199 return -EINVAL;
9200 }
9201
9202 dout(10) << __func__ << " validate secrets using osd id " << id << dendl;
9203
9204 err = mon->authmon()->validate_osd_new(id, uuid,
9205 cephx_secret,
9206 lockbox_secret,
9207 cephx_entity,
9208 lockbox_entity,
9209 ss);
9210 if (err < 0) {
9211 return err;
9212 } else if (may_be_idempotent && err != EEXIST) {
9213 // for this to be idempotent, `id` should already be >= 0; no need
9214 // to use validate_id.
9215 ceph_assert(id >= 0);
9216 ss << "osd." << id << " exists but secrets do not match";
9217 return -EEXIST;
9218 }
9219
9220 if (has_lockbox) {
9221 svc = (ConfigKeyService*)mon->config_key_service;
9222 err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
9223 if (err < 0) {
9224 return err;
9225 } else if (may_be_idempotent && err != EEXIST) {
9226 ceph_assert(id >= 0);
9227 ss << "osd." << id << " exists but dm-crypt key does not match.";
9228 return -EEXIST;
9229 }
9230 }
9231 }
9232 ceph_assert(!has_secrets || !cephx_secret.empty());
9233 ceph_assert(!has_lockbox || !lockbox_secret.empty());
9234
9235 if (may_be_idempotent) {
9236 // we have nothing to do for either the osdmon or the authmon,
9237 // and we have no lockbox - so the config key service will not be
9238 // touched. This is therefore an idempotent operation, and we can
9239 // just return right away.
9240 dout(10) << __func__ << " idempotent -- no op." << dendl;
9241 ceph_assert(id >= 0);
9242 if (f) {
9243 f->open_object_section("created_osd");
9244 f->dump_int("osdid", id);
9245 f->close_section();
9246 } else {
9247 ss << id;
9248 }
9249 return EEXIST;
9250 }
9251 ceph_assert(!may_be_idempotent);
9252
9253 // perform updates.
9254 if (has_secrets) {
9255 ceph_assert(!cephx_secret.empty());
9256 ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
9257 (!lockbox_secret.empty() && !dmcrypt_key.empty()));
9258
9259 err = mon->authmon()->do_osd_new(cephx_entity,
9260 lockbox_entity,
9261 has_lockbox);
9262 ceph_assert(0 == err);
9263
9264 if (has_lockbox) {
9265 ceph_assert(nullptr != svc);
9266 svc->do_osd_new(uuid, dmcrypt_key);
9267 }
9268 }
9269
9270 if (is_recreate_destroyed) {
// reuse the destroyed id directly rather than going through
// do_osd_create(); stage state flags by hand.
9271 ceph_assert(id >= 0);
9272 ceph_assert(osdmap.is_destroyed(id));
9273 pending_inc.new_weight[id] = CEPH_OSD_OUT;
9274 pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
9275 if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
9276 pending_inc.new_state[id] |= CEPH_OSD_NEW;
9277 }
9278 if (osdmap.get_state(id) & CEPH_OSD_UP) {
9279 // due to http://tracker.ceph.com/issues/20751 some clusters may
9280 // have UP set for non-existent OSDs; make sure it is cleared
9281 // for a newly created osd.
9282 pending_inc.new_state[id] |= CEPH_OSD_UP;
9283 }
9284 pending_inc.new_uuid[id] = uuid;
9285 } else {
9286 ceph_assert(id >= 0);
9287 int32_t new_id = -1;
9288 do_osd_create(id, uuid, device_class, &new_id);
9289 ceph_assert(new_id >= 0);
9290 ceph_assert(id == new_id);
9291 }
9292
9293 if (f) {
9294 f->open_object_section("created_osd");
9295 f->dump_int("osdid", id);
9296 f->close_section();
9297 } else {
9298 ss << id;
9299 }
9300
9301 return 0;
9302 }
9303
9304 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9305 {
9306 op->mark_osdmon_event(__func__);
9307 auto m = op->get_req<MMonCommand>();
9308 stringstream ss;
9309 cmdmap_t cmdmap;
9310 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9311 string rs = ss.str();
9312 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9313 return true;
9314 }
9315
9316 MonSession *session = op->get_session();
9317 if (!session) {
9318 derr << __func__ << " no session" << dendl;
9319 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9320 return true;
9321 }
9322
9323 return prepare_command_impl(op, cmdmap);
9324 }
9325
9326 static int parse_reweights(CephContext *cct,
9327 const cmdmap_t& cmdmap,
9328 const OSDMap& osdmap,
9329 map<int32_t, uint32_t>* weights)
9330 {
9331 string weights_str;
9332 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9333 return -EINVAL;
9334 }
9335 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9336 json_spirit::mValue json_value;
9337 if (!json_spirit::read(weights_str, json_value)) {
9338 return -EINVAL;
9339 }
9340 if (json_value.type() != json_spirit::obj_type) {
9341 return -EINVAL;
9342 }
9343 const auto obj = json_value.get_obj();
9344 try {
9345 for (auto& osd_weight : obj) {
9346 auto osd_id = std::stoi(osd_weight.first);
9347 if (!osdmap.exists(osd_id)) {
9348 return -ENOENT;
9349 }
9350 if (osd_weight.second.type() != json_spirit::str_type) {
9351 return -EINVAL;
9352 }
9353 auto weight = std::stoul(osd_weight.second.get_str());
9354 weights->insert({osd_id, weight});
9355 }
9356 } catch (const std::logic_error& e) {
9357 return -EINVAL;
9358 }
9359 return 0;
9360 }
9361
// Stage the destruction of an osd: remove its auth entities and its
// config-key (dm-crypt lockbox) data, then mark it DESTROYED in the
// pending incremental while keeping the id reserved for reuse.
//
// Returns 0 on success (idempotent if already destroyed), -ENOENT if the
// osd does not exist, or a negative errno from auth validation. The
// caller is responsible for proposing (paxos must be plugged).
9362 int OSDMonitor::prepare_command_osd_destroy(
9363 int32_t id,
9364 stringstream& ss)
9365 {
9366 ceph_assert(paxos->is_plugged());
9367
9368 // we check if the osd exists for the benefit of `osd purge`, which may
9369 // have previously removed the osd. If the osd does not exist, return
9370 // -ENOENT to convey this, and let the caller deal with it.
9371 //
9372 // we presume that all auth secrets and config keys were removed prior
9373 // to this command being called. if they exist by now, we also assume
9374 // they must have been created by some other command and do not pertain
9375 // to this non-existent osd.
9376 if (!osdmap.exists(id)) {
9377 dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
9378 return -ENOENT;
9379 }
9380
9381 uuid_d uuid = osdmap.get_uuid(id);
9382 dout(10) << __func__ << " destroying osd." << id
9383 << " uuid " << uuid << dendl;
9384
9385 // if it has been destroyed, we assume our work here is done.
9386 if (osdmap.is_destroyed(id)) {
9387 ss << "destroyed osd." << id;
9388 return 0;
9389 }
9390
9391 EntityName cephx_entity, lockbox_entity;
9392 bool idempotent_auth = false, idempotent_cks = false;
9393
// validate both side-effecting services first; -ENOENT means the data
// is already gone and the corresponding removal can be skipped.
9394 int err = mon->authmon()->validate_osd_destroy(id, uuid,
9395 cephx_entity,
9396 lockbox_entity,
9397 ss);
9398 if (err < 0) {
9399 if (err == -ENOENT) {
9400 idempotent_auth = true;
9401 } else {
9402 return err;
9403 }
9404 }
9405
9406 ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
9407 err = svc->validate_osd_destroy(id, uuid);
9408 if (err < 0) {
9409 ceph_assert(err == -ENOENT);
9410 err = 0;
9411 idempotent_cks = true;
9412 }
9413
9414 if (!idempotent_auth) {
9415 err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
9416 ceph_assert(0 == err);
9417 }
9418
9419 if (!idempotent_cks) {
9420 svc->do_osd_destroy(id, uuid);
9421 }
9422
// mark DESTROYED (id stays reserved) and wipe the uuid binding
9423 pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
9424 pending_inc.new_uuid[id] = uuid_d();
9425
9426 // we can only propose_pending() once per service, otherwise we'll be
9427 // defying PaxosService and all laws of nature. Therefore, as we may
9428 // be used during 'osd purge', let's keep the caller responsible for
9429 // proposing.
9430 ceph_assert(err == 0);
9431 return 0;
9432 }
9433
// Stage a full purge of an osd: destroy it (auth + config-key + DESTROYED
// flag), remove it from the osdmap, and remove it from crush.
//
// Returns 0 on success, -ENOENT if the osd is already fully gone
// (idempotent case), or a negative errno from the crush/destroy steps.
// The caller is responsible for proposing (paxos must be plugged).
9434 int OSDMonitor::prepare_command_osd_purge(
9435 int32_t id,
9436 stringstream& ss)
9437 {
9438 ceph_assert(paxos->is_plugged());
9439 dout(10) << __func__ << " purging osd." << id << dendl;
9440
9441 ceph_assert(!osdmap.is_up(id));
9442
9443 /*
9444 * This may look a bit weird, but this is what's going to happen:
9445 *
9446 * 1. we make sure that removing from crush works
9447 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9448 * error, then we abort the whole operation, as no updates
9449 * have been made. However, this function will have
9450 * side-effects, thus we need to make sure that all operations
9451 * performed henceforth will *always* succeed.
9452 * 3. we call `prepare_command_osd_remove()`. Although this
9453 * function can return an error, it currently only checks if the
9454 * osd is up - and we have made sure that it is not so, so there
9455 * is no conflict, and it is effectively an update.
9456 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9457 * the crush update we delayed from before.
9458 */
9459
9460 CrushWrapper newcrush;
9461 _get_pending_crush(newcrush);
9462
9463 bool may_be_idempotent = false;
9464
// step 1: dry-run the crush removal on a working copy; -ENOENT means the
// item is already gone, which is fine (possible replay).
9465 int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
9466 if (err == -ENOENT) {
9467 err = 0;
9468 may_be_idempotent = true;
9469 } else if (err < 0) {
9470 ss << "error removing osd." << id << " from crush";
9471 return err;
9472 }
9473
9474 // no point destroying the osd again if it has already been marked destroyed
9475 if (!osdmap.is_destroyed(id)) {
9476 err = prepare_command_osd_destroy(id, ss);
9477 if (err < 0) {
9478 if (err == -ENOENT) {
9479 err = 0;
9480 } else {
9481 return err;
9482 }
9483 } else {
9484 may_be_idempotent = false;
9485 }
9486 }
9487 ceph_assert(0 == err);
9488
// every step so far was a no-op and the osd is gone: report idempotency
9489 if (may_be_idempotent && !osdmap.exists(id)) {
9490 dout(10) << __func__ << " osd." << id << " does not exist and "
9491 << "we are idempotent." << dendl;
9492 return -ENOENT;
9493 }
9494
9495 err = prepare_command_osd_remove(id);
9496 // we should not be busy, as we should have made sure this id is not up.
9497 ceph_assert(0 == err);
9498
// step 4: stage the crush removal we validated in step 1
9499 do_osd_crush_remove(newcrush);
9500 return 0;
9501 }
9502
9503 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9504 const cmdmap_t& cmdmap)
9505 {
9506 op->mark_osdmon_event(__func__);
9507 auto m = op->get_req<MMonCommand>();
9508 bool ret = false;
9509 stringstream ss;
9510 string rs;
9511 bufferlist rdata;
9512 int err = 0;
9513
9514 string format;
9515 cmd_getval(cmdmap, "format", format, string("plain"));
9516 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9517
9518 string prefix;
9519 cmd_getval(cmdmap, "prefix", prefix);
9520
9521 int64_t osdid;
9522 string osd_name;
9523 bool osdid_present = false;
9524 if (prefix != "osd pg-temp" &&
9525 prefix != "osd pg-upmap" &&
9526 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9527 osdid_present = cmd_getval(cmdmap, "id", osdid);
9528 }
9529 if (osdid_present) {
9530 ostringstream oss;
9531 oss << "osd." << osdid;
9532 osd_name = oss.str();
9533 }
9534
9535 // Even if there's a pending state with changes that could affect
9536 // a command, considering that said state isn't yet committed, we
9537 // just don't care about those changes if the command currently being
9538 // handled acts as a no-op against the current committed state.
9539 // In a nutshell, we assume this command happens *before*.
9540 //
9541 // Let me make this clearer:
9542 //
9543 // - If we have only one client, and that client issues some
9544 // operation that would conflict with this operation but is
9545 // still on the pending state, then we would be sure that said
9546 // operation wouldn't have returned yet, so the client wouldn't
9547 // issue this operation (unless the client didn't wait for the
9548 // operation to finish, and that would be the client's own fault).
9549 //
9550 // - If we have more than one client, each client will observe
9551 // whatever is the state at the moment of the commit. So, if we
9552 // have two clients, one issuing an unlink and another issuing a
9553 // link, and if the link happens while the unlink is still on the
9554 // pending state, from the link's point-of-view this is a no-op.
9555 // If different clients are issuing conflicting operations and
9556 // they care about that, then the clients should make sure they
9557 // enforce some kind of concurrency mechanism -- from our
9558 // perspective that's what Douglas Adams would call an SEP.
9559 //
9560 // This should be used as a general guideline for most commands handled
9561 // in this function. Adapt as you see fit, but please bear in mind that
9562 // this is the expected behavior.
9563
9564
9565 if (prefix == "osd setcrushmap" ||
9566 (prefix == "osd crush set" && !osdid_present)) {
9567 if (pending_inc.crush.length()) {
9568 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9569 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9570 return true;
9571 }
9572 dout(10) << "prepare_command setting new crush map" << dendl;
9573 bufferlist data(m->get_data());
9574 CrushWrapper crush;
9575 try {
9576 auto bl = data.cbegin();
9577 crush.decode(bl);
9578 }
9579 catch (const std::exception &e) {
9580 err = -EINVAL;
9581 ss << "Failed to parse crushmap: " << e.what();
9582 goto reply;
9583 }
9584
9585 int64_t prior_version = 0;
9586 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9587 if (prior_version == osdmap.get_crush_version() - 1) {
9588 // see if we are a resend of the last update. this is imperfect
9589 // (multiple racing updaters may not both get reliable success)
9590 // but we expect crush updaters (via this interface) to be rare-ish.
9591 bufferlist current, proposed;
9592 osdmap.crush->encode(current, mon->get_quorum_con_features());
9593 crush.encode(proposed, mon->get_quorum_con_features());
9594 if (current.contents_equal(proposed)) {
9595 dout(10) << __func__
9596 << " proposed matches current and version equals previous"
9597 << dendl;
9598 err = 0;
9599 ss << osdmap.get_crush_version();
9600 goto reply;
9601 }
9602 }
9603 if (prior_version != osdmap.get_crush_version()) {
9604 err = -EPERM;
9605 ss << "prior_version " << prior_version << " != crush version "
9606 << osdmap.get_crush_version();
9607 goto reply;
9608 }
9609 }
9610
9611 if (crush.has_legacy_rule_ids()) {
9612 err = -EINVAL;
9613 ss << "crush maps with ruleset != ruleid are no longer allowed";
9614 goto reply;
9615 }
9616 if (!validate_crush_against_features(&crush, ss)) {
9617 err = -EINVAL;
9618 goto reply;
9619 }
9620
9621 err = osdmap.validate_crush_rules(&crush, &ss);
9622 if (err < 0) {
9623 goto reply;
9624 }
9625
9626 if (g_conf()->mon_osd_crush_smoke_test) {
9627 // sanity check: test some inputs to make sure this map isn't
9628 // totally broken
9629 dout(10) << " testing map" << dendl;
9630 stringstream ess;
9631 CrushTester tester(crush, ess);
9632 tester.set_min_x(0);
9633 tester.set_max_x(50);
9634 auto start = ceph::coarse_mono_clock::now();
9635 int r = tester.test_with_fork(g_conf()->mon_lease);
9636 auto duration = ceph::coarse_mono_clock::now() - start;
9637 if (r < 0) {
9638 dout(10) << " tester.test_with_fork returns " << r
9639 << ": " << ess.str() << dendl;
9640 ss << "crush smoke test failed with " << r << ": " << ess.str();
9641 err = r;
9642 goto reply;
9643 }
9644 dout(10) << __func__ << " crush somke test duration: "
9645 << duration << ", result: " << ess.str() << dendl;
9646 }
9647
9648 pending_inc.crush = data;
9649 ss << osdmap.get_crush_version() + 1;
9650 goto update;
9651
9652 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9653 CrushWrapper newcrush;
9654 _get_pending_crush(newcrush);
9655 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9656 int bid = -1 - b;
9657 if (newcrush.bucket_exists(bid) &&
9658 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9659 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9660 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9661 }
9662 }
9663 if (!validate_crush_against_features(&newcrush, ss)) {
9664 err = -EINVAL;
9665 goto reply;
9666 }
9667 pending_inc.crush.clear();
9668 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9669 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9670 get_last_committed() + 1));
9671 return true;
9672 } else if (prefix == "osd crush set-device-class") {
9673 string device_class;
9674 if (!cmd_getval(cmdmap, "class", device_class)) {
9675 err = -EINVAL; // no value!
9676 goto reply;
9677 }
9678
9679 bool stop = false;
9680 vector<string> idvec;
9681 cmd_getval(cmdmap, "ids", idvec);
9682 CrushWrapper newcrush;
9683 _get_pending_crush(newcrush);
9684 set<int> updated;
9685 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9686 set<int> osds;
9687 // wildcard?
9688 if (j == 0 &&
9689 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9690 osdmap.get_all_osds(osds);
9691 stop = true;
9692 } else {
9693 // try traditional single osd way
9694 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9695 if (osd < 0) {
9696 // ss has reason for failure
9697 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9698 err = -EINVAL;
9699 continue;
9700 }
9701 osds.insert(osd);
9702 }
9703
9704 for (auto &osd : osds) {
9705 if (!osdmap.exists(osd)) {
9706 ss << "osd." << osd << " does not exist. ";
9707 continue;
9708 }
9709
9710 ostringstream oss;
9711 oss << "osd." << osd;
9712 string name = oss.str();
9713
9714 if (newcrush.get_max_devices() < osd + 1) {
9715 newcrush.set_max_devices(osd + 1);
9716 }
9717 string action;
9718 if (newcrush.item_exists(osd)) {
9719 action = "updating";
9720 } else {
9721 action = "creating";
9722 newcrush.set_item_name(osd, name);
9723 }
9724
9725 dout(5) << action << " crush item id " << osd << " name '" << name
9726 << "' device_class '" << device_class << "'"
9727 << dendl;
9728 err = newcrush.update_device_class(osd, device_class, name, &ss);
9729 if (err < 0) {
9730 goto reply;
9731 }
9732 if (err == 0 && !_have_pending_crush()) {
9733 if (!stop) {
9734 // for single osd only, wildcard makes too much noise
9735 ss << "set-device-class item id " << osd << " name '" << name
9736 << "' device_class '" << device_class << "': no change. ";
9737 }
9738 } else {
9739 updated.insert(osd);
9740 }
9741 }
9742 }
9743
9744 if (!updated.empty()) {
9745 pending_inc.crush.clear();
9746 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9747 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9748 getline(ss, rs);
9749 wait_for_finished_proposal(op,
9750 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9751 return true;
9752 }
9753
9754 } else if (prefix == "osd crush rm-device-class") {
9755 bool stop = false;
9756 vector<string> idvec;
9757 cmd_getval(cmdmap, "ids", idvec);
9758 CrushWrapper newcrush;
9759 _get_pending_crush(newcrush);
9760 set<int> updated;
9761
9762 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9763 set<int> osds;
9764
9765 // wildcard?
9766 if (j == 0 &&
9767 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9768 osdmap.get_all_osds(osds);
9769 stop = true;
9770 } else {
9771 // try traditional single osd way
9772 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9773 if (osd < 0) {
9774 // ss has reason for failure
9775 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9776 err = -EINVAL;
9777 goto reply;
9778 }
9779 osds.insert(osd);
9780 }
9781
9782 for (auto &osd : osds) {
9783 if (!osdmap.exists(osd)) {
9784 ss << "osd." << osd << " does not exist. ";
9785 continue;
9786 }
9787
9788 auto class_name = newcrush.get_item_class(osd);
9789 if (!class_name) {
9790 ss << "osd." << osd << " belongs to no class, ";
9791 continue;
9792 }
9793 // note that we do not verify if class_is_in_use here
9794 // in case the device is misclassified and user wants
9795 // to overridely reset...
9796
9797 err = newcrush.remove_device_class(cct, osd, &ss);
9798 if (err < 0) {
9799 // ss has reason for failure
9800 goto reply;
9801 }
9802 updated.insert(osd);
9803 }
9804 }
9805
9806 if (!updated.empty()) {
9807 pending_inc.crush.clear();
9808 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9809 ss << "done removing class of osd(s): " << updated;
9810 getline(ss, rs);
9811 wait_for_finished_proposal(op,
9812 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9813 return true;
9814 }
// handler: "osd crush class create <class>" — register a new device class id
// in the pending crush map (idempotent if the class already exists).
9815 } else if (prefix == "osd crush class create") {
9816 string device_class;
9817 if (!cmd_getval(cmdmap, "class", device_class)) {
9818 err = -EINVAL; // no value!
9819 goto reply;
9820 }
// Device classes require the luminous on-disk/protocol features.
9821 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9822 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9823 << "luminous' before using crush device classes";
9824 err = -EPERM;
9825 goto reply;
9826 }
// Fast path: nothing pending and class already committed — plain reply.
9827 if (!_have_pending_crush() &&
9828 _get_stable_crush().class_exists(device_class)) {
9829 ss << "class '" << device_class << "' already exists";
9830 goto reply;
9831 }
9832 CrushWrapper newcrush;
9833 _get_pending_crush(newcrush);
// Already created in the pending map — wait for that proposal instead.
9834 if (newcrush.class_exists(device_class)) {
9835 ss << "class '" << device_class << "' already exists";
9836 goto update;
9837 }
9838 int class_id = newcrush.get_or_create_class_id(device_class);
9839 pending_inc.crush.clear();
9840 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9841 ss << "created class " << device_class << " with id " << class_id
9842 << " to crush map";
9843 goto update;
// handler: "osd crush class rm <class>" — remove a device class after
// verifying nothing (rules, erasure-code profiles) still references it.
9844 } else if (prefix == "osd crush class rm") {
9845 string device_class;
9846 if (!cmd_getval(cmdmap, "class", device_class)) {
9847 err = -EINVAL; // no value!
9848 goto reply;
9849 }
9850 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9851 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9852 << "luminous' before using crush device classes";
9853 err = -EPERM;
9854 goto reply;
9855 }
9856
// Idempotent: removing a class that never existed succeeds with err = 0.
9857 if (!osdmap.crush->class_exists(device_class)) {
9858 err = 0;
9859 goto reply;
9860 }
9861
9862 CrushWrapper newcrush;
9863 _get_pending_crush(newcrush);
9864 if (!newcrush.class_exists(device_class)) {
9865 err = 0; // make command idempotent
9866 goto wait;
9867 }
9868 int class_id = newcrush.get_class_id(device_class);
9869 stringstream ts;
9870 if (newcrush.class_is_in_use(class_id, &ts)) {
9871 err = -EBUSY;
9872 ss << "class '" << device_class << "' " << ts.str();
9873 goto reply;
9874 }
9875
9876 // check if class is used by any erasure-code-profiles
// Merge committed and pending profiles so a profile created in the same
// epoch still blocks the removal.
9877 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9878 osdmap.get_erasure_code_profiles();
9879 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9880 #ifdef HAVE_STDLIB_MAP_SPLICING
9881 ec_profiles.merge(old_ec_profiles);
9882 #else
9883 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9884 make_move_iterator(end(old_ec_profiles)));
9885 #endif
9886 list<string> referenced_by;
9887 for (auto &i: ec_profiles) {
9888 for (auto &j: i.second) {
9889 if ("crush-device-class" == j.first && device_class == j.second) {
9890 referenced_by.push_back(i.first);
9891 }
9892 }
9893 }
9894 if (!referenced_by.empty()) {
9895 err = -EBUSY;
9896 ss << "class '" << device_class
9897 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9898 goto reply;
9899 }
9900
// Detach the class from every device that carries it; the class name itself
// is removed directly only when no devices belong to it.
9901 set<int> osds;
9902 newcrush.get_devices_by_class(device_class, &osds);
9903 for (auto& p: osds) {
9904 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9905 if (err < 0) {
9906 // ss has reason for failure
9907 goto reply;
9908 }
9909 }
9910
9911 if (osds.empty()) {
9912 // empty class, remove directly
9913 err = newcrush.remove_class_name(device_class);
9914 if (err < 0) {
9915 ss << "class '" << device_class << "' cannot be removed '"
9916 << cpp_strerror(err) << "'";
9917 goto reply;
9918 }
9919 }
9920
9921 pending_inc.crush.clear();
9922 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9923 ss << "removed class " << device_class << " with id " << class_id
9924 << " from crush map";
9925 goto update;
// handler: "osd crush class rename <src> <dst>" — rename a device class,
// treating an already-applied rename as success for replay idempotency.
9926 } else if (prefix == "osd crush class rename") {
9927 string srcname, dstname;
9928 if (!cmd_getval(cmdmap, "srcname", srcname)) {
9929 err = -EINVAL;
9930 goto reply;
9931 }
9932 if (!cmd_getval(cmdmap, "dstname", dstname)) {
9933 err = -EINVAL;
9934 goto reply;
9935 }
9936
9937 CrushWrapper newcrush;
9938 _get_pending_crush(newcrush);
9939 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9940 // suppose this is a replay and return success
9941 // so command is idempotent
9942 ss << "already renamed to '" << dstname << "'";
9943 err = 0;
9944 goto reply;
9945 }
9946
9947 err = newcrush.rename_class(srcname, dstname);
9948 if (err < 0) {
9949 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9950 << cpp_strerror(err);
9951 goto reply;
9952 }
9953
9954 pending_inc.crush.clear();
9955 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9956 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9957 goto update;
// handler: "osd crush add-bucket <name> <type> [loc...]" — create a new
// (empty) bucket and optionally move it to the given crush location.
9958 } else if (prefix == "osd crush add-bucket") {
9959 // os crush add-bucket <name> <type>
9960 string name, typestr;
9961 vector<string> argvec;
9962 cmd_getval(cmdmap, "name", name);
9963 cmd_getval(cmdmap, "type", typestr);
9964 cmd_getval(cmdmap, "args", argvec);
9965 map<string,string> loc;
9966 if (!argvec.empty()) {
9967 CrushWrapper::parse_loc_map(argvec, &loc);
9968 dout(0) << "will create and move bucket '" << name
9969 << "' to location " << loc << dendl;
9970 }
9971
9972 if (!_have_pending_crush() &&
9973 _get_stable_crush().name_exists(name)) {
9974 ss << "bucket '" << name << "' already exists";
9975 goto reply;
9976 }
9977
9978 CrushWrapper newcrush;
9979 _get_pending_crush(newcrush);
9980
9981 if (newcrush.name_exists(name)) {
9982 ss << "bucket '" << name << "' already exists";
9983 goto update;
9984 }
// Type must exist and must not be 0 (type 0 is the device/leaf type).
9985 int type = newcrush.get_type_id(typestr);
9986 if (type < 0) {
9987 ss << "type '" << typestr << "' does not exist";
9988 err = -EINVAL;
9989 goto reply;
9990 }
9991 if (type == 0) {
9992 ss << "type '" << typestr << "' is for devices, not buckets";
9993 err = -EINVAL;
9994 goto reply;
9995 }
9996 int bucketno;
9997 err = newcrush.add_bucket(0, 0,
9998 CRUSH_HASH_DEFAULT, type, 0, NULL,
9999 NULL, &bucketno);
10000 if (err < 0) {
10001 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10002 goto reply;
10003 }
10004 err = newcrush.set_item_name(bucketno, name);
10005 if (err < 0) {
10006 ss << "error setting bucket name to '" << name << "'";
10007 goto reply;
10008 }
10009
// If a location was given and the bucket is not already there, move it.
10010 if (!loc.empty()) {
10011 if (!newcrush.check_item_loc(cct, bucketno, loc,
10012 (int *)NULL)) {
10013 err = newcrush.move_bucket(cct, bucketno, loc);
10014 if (err < 0) {
10015 ss << "error moving bucket '" << name << "' to location " << loc;
10016 goto reply;
10017 }
10018 } else {
10019 ss << "no need to move item id " << bucketno << " name '" << name
10020 << "' to location " << loc << " in crush map";
10021 }
10022 }
10023
10024 pending_inc.crush.clear();
10025 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10026 if (loc.empty()) {
10027 ss << "added bucket " << name << " type " << typestr
10028 << " to crush map";
10029 } else {
10030 ss << "added bucket " << name << " type " << typestr
10031 << " to location " << loc;
10032 }
10033 goto update;
// handler: "osd crush rename-bucket <src> <dst>" — delegate to
// crush_rename_bucket(); -EALREADY is mapped to success for idempotency.
10034 } else if (prefix == "osd crush rename-bucket") {
10035 string srcname, dstname;
10036 cmd_getval(cmdmap, "srcname", srcname);
10037 cmd_getval(cmdmap, "dstname", dstname);
10038
10039 err = crush_rename_bucket(srcname, dstname, &ss);
10040 if (err == -EALREADY) // equivalent to success for idempotency
10041 err = 0;
10042 if (err)
10043 goto reply;
10044 else
10045 goto update;
// handlers: "osd crush weight-set create[/-compat]" — create a per-pool (or
// backward-compatible default) choose_args weight-set in the crush map.
10046 } else if (prefix == "osd crush weight-set create" ||
10047 prefix == "osd crush weight-set create-compat") {
10048 CrushWrapper newcrush;
10049 _get_pending_crush(newcrush);
10050 int64_t pool;
10051 int positions;
// Weight-sets only make sense for straw2 buckets.
10052 if (newcrush.has_non_straw2_buckets()) {
10053 ss << "crush map contains one or more bucket(s) that are not straw2";
10054 err = -EPERM;
10055 goto reply;
10056 }
10057 if (prefix == "osd crush weight-set create") {
// Per-pool weight-sets need luminous-capable clients.
10058 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10059 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10060 ss << "require_min_compat_client "
10061 << osdmap.require_min_compat_client
10062 << " < luminous, which is required for per-pool weight-sets. "
10063 << "Try 'ceph osd set-require-min-compat-client luminous' "
10064 << "before using the new interface";
10065 err = -EPERM;
10066 goto reply;
10067 }
10068 string poolname, mode;
10069 cmd_getval(cmdmap, "pool", poolname);
10070 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10071 if (pool < 0) {
10072 ss << "pool '" << poolname << "' not found";
10073 err = -ENOENT;
10074 goto reply;
10075 }
10076 cmd_getval(cmdmap, "mode", mode);
10077 if (mode != "flat" && mode != "positional") {
10078 ss << "unrecognized weight-set mode '" << mode << "'";
10079 err = -EINVAL;
10080 goto reply;
10081 }
// "flat" = one weight per item; "positional" = one weight per replica slot.
10082 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10083 } else {
10084 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10085 positions = 1;
10086 }
10087 if (!newcrush.create_choose_args(pool, positions)) {
10088 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10089 ss << "compat weight-set already created";
10090 } else {
10091 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10092 << "' already created";
10093 }
10094 goto reply;
10095 }
10096 pending_inc.crush.clear();
10097 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10098 goto update;
10099
// handlers: "osd crush weight-set rm[/-compat]" — drop the choose_args
// weight-set for a pool (or the compat default set).
10100 } else if (prefix == "osd crush weight-set rm" ||
10101 prefix == "osd crush weight-set rm-compat") {
10102 CrushWrapper newcrush;
10103 _get_pending_crush(newcrush);
10104 int64_t pool;
10105 if (prefix == "osd crush weight-set rm") {
10106 string poolname;
10107 cmd_getval(cmdmap, "pool", poolname);
10108 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10109 if (pool < 0) {
10110 ss << "pool '" << poolname << "' not found";
10111 err = -ENOENT;
10112 goto reply;
10113 }
10114 } else {
10115 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10116 }
10117 newcrush.rm_choose_args(pool);
10118 pending_inc.crush.clear();
10119 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10120 goto update;
10121
// handlers: "osd crush weight-set reweight[/-compat] <pool> <item> <w...>" —
// set the weight-set values for one item; the number of weights supplied
// must match the weight-set's position count.
10122 } else if (prefix == "osd crush weight-set reweight" ||
10123 prefix == "osd crush weight-set reweight-compat") {
10124 string poolname, item;
10125 vector<double> weight;
10126 cmd_getval(cmdmap, "pool", poolname);
10127 cmd_getval(cmdmap, "item", item);
10128 cmd_getval(cmdmap, "weight", weight);
10129 CrushWrapper newcrush;
10130 _get_pending_crush(newcrush);
10131 int64_t pool;
10132 if (prefix == "osd crush weight-set reweight") {
10133 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10134 if (pool < 0) {
10135 ss << "pool '" << poolname << "' not found";
10136 err = -ENOENT;
10137 goto reply;
10138 }
10139 if (!newcrush.have_choose_args(pool)) {
10140 ss << "no weight-set for pool '" << poolname << "'";
10141 err = -ENOENT;
10142 goto reply;
10143 }
10144 auto arg_map = newcrush.choose_args_get(pool);
10145 int positions = newcrush.get_choose_args_positions(arg_map);
10146 if (weight.size() != (size_t)positions) {
10147 ss << "must specify exact " << positions << " weight values";
10148 err = -EINVAL;
10149 goto reply;
10150 }
10151 } else {
10152 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10153 if (!newcrush.have_choose_args(pool)) {
10154 ss << "no backward-compatible weight-set";
10155 err = -ENOENT;
10156 goto reply;
10157 }
10158 }
10159 if (!newcrush.name_exists(item)) {
10160 ss << "item '" << item << "' does not exist";
10161 err = -ENOENT;
10162 goto reply;
10163 }
10164 err = newcrush.choose_args_adjust_item_weightf(
10165 cct,
10166 newcrush.choose_args_get(pool),
10167 newcrush.get_item_id(item),
10168 weight,
10169 &ss);
10170 if (err < 0) {
10171 goto reply;
10172 }
10173 err = 0;
10174 pending_inc.crush.clear();
10175 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10176 goto update;
// handlers: "osd crush set|add <OsdName> <weight> <loc...>" — place or
// update an existing osd in the crush hierarchy at the given location.
10177 } else if (osdid_present &&
10178 (prefix == "osd crush set" || prefix == "osd crush add")) {
10179 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10180 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10181 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10182
10183 if (!osdmap.exists(osdid)) {
10184 err = -ENOENT;
10185 ss << osd_name
10186 << " does not exist. Create it before updating the crush map";
10187 goto reply;
10188 }
10189
10190 double weight;
10191 if (!cmd_getval(cmdmap, "weight", weight)) {
10192 ss << "unable to parse weight value '"
10193 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10194 err = -EINVAL;
10195 goto reply;
10196 }
10197
10198 string args;
10199 vector<string> argvec;
10200 cmd_getval(cmdmap, "args", argvec);
10201 map<string,string> loc;
10202 CrushWrapper::parse_loc_map(argvec, &loc);
10203
// "set" requires the item to already exist in the committed crush map.
10204 if (prefix == "osd crush set"
10205 && !_get_stable_crush().item_exists(osdid)) {
10206 err = -ENOENT;
10207 ss << "unable to set item id " << osdid << " name '" << osd_name
10208 << "' weight " << weight << " at location " << loc
10209 << ": does not exist";
10210 goto reply;
10211 }
10212
10213 dout(5) << "adding/updating crush item id " << osdid << " name '"
10214 << osd_name << "' weight " << weight << " at location "
10215 << loc << dendl;
10216 CrushWrapper newcrush;
10217 _get_pending_crush(newcrush);
10218
// "add" of an item already at this location degrades to "set";
// insert_item success is remapped to err=1 so the no-change test below
// (err == 0) only fires for update_item.
10219 string action;
10220 if (prefix == "osd crush set" ||
10221 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10222 action = "set";
10223 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10224 } else {
10225 action = "add";
10226 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10227 if (err == 0)
10228 err = 1;
10229 }
10230
10231 if (err < 0)
10232 goto reply;
10233
10234 if (err == 0 && !_have_pending_crush()) {
10235 ss << action << " item id " << osdid << " name '" << osd_name
10236 << "' weight " << weight << " at location " << loc << ": no change";
10237 goto reply;
10238 }
10239
10240 pending_inc.crush.clear();
10241 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10242 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10243 << weight << " at location " << loc << " to crush map";
10244 getline(ss, rs);
10245 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10246 get_last_committed() + 1));
10247 return true;
10248
// handler: "osd crush create-or-move <OsdName> <initial_weight> <loc...>" —
// insert the osd if missing, or move it; existing weight is kept on move
// (create_or_move_item return 0 means no map change was needed).
10249 } else if (prefix == "osd crush create-or-move") {
10250 do {
10251 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10252 if (!osdmap.exists(osdid)) {
10253 err = -ENOENT;
10254 ss << osd_name
10255 << " does not exist. create it before updating the crush map";
10256 goto reply;
10257 }
10258
10259 double weight;
10260 if (!cmd_getval(cmdmap, "weight", weight)) {
10261 ss << "unable to parse weight value '"
10262 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10263 err = -EINVAL;
10264 goto reply;
10265 }
10266
10267 string args;
10268 vector<string> argvec;
10269 cmd_getval(cmdmap, "args", argvec);
10270 map<string,string> loc;
10271 CrushWrapper::parse_loc_map(argvec, &loc);
10272
10273 dout(0) << "create-or-move crush item name '" << osd_name
10274 << "' initial_weight " << weight << " at location " << loc
10275 << dendl;
10276
10277 CrushWrapper newcrush;
10278 _get_pending_crush(newcrush);
10279
10280 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10281 g_conf()->osd_crush_update_weight_set);
// err == 0: already in place, fall out of the do/while and reply;
// err > 0: map changed, propose the new crush map.
10282 if (err == 0) {
10283 ss << "create-or-move updated item name '" << osd_name
10284 << "' weight " << weight
10285 << " at location " << loc << " to crush map";
10286 break;
10287 }
10288 if (err > 0) {
10289 pending_inc.crush.clear();
10290 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10291 ss << "create-or-move updating item name '" << osd_name
10292 << "' weight " << weight
10293 << " at location " << loc << " to crush map";
10294 getline(ss, rs);
10295 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10296 get_last_committed() + 1));
10297 return true;
10298 }
10299 } while (false);
10300
// handler: "osd crush move <name> <loc...>" — relocate a device (id >= 0)
// or a bucket (id < 0) to the given crush location.
10301 } else if (prefix == "osd crush move") {
10302 do {
10303 // osd crush move <name> <loc1> [<loc2> ...]
10304 string name;
10305 vector<string> argvec;
10306 cmd_getval(cmdmap, "name", name);
10307 cmd_getval(cmdmap, "args", argvec);
10308 map<string,string> loc;
10309 CrushWrapper::parse_loc_map(argvec, &loc);
10310
10311 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10312 CrushWrapper newcrush;
10313 _get_pending_crush(newcrush);
10314
10315 if (!newcrush.name_exists(name)) {
10316 err = -ENOENT;
10317 ss << "item " << name << " does not exist";
10318 break;
10319 }
10320 int id = newcrush.get_item_id(name);
10321
10322 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
// devices move via create_or_move_item (weight 0 = keep existing);
// buckets (negative ids) move via move_bucket.
10323 if (id >= 0) {
10324 err = newcrush.create_or_move_item(
10325 cct, id, 0, name, loc,
10326 g_conf()->osd_crush_update_weight_set);
10327 } else {
10328 err = newcrush.move_bucket(cct, id, loc);
10329 }
10330 if (err >= 0) {
10331 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10332 pending_inc.crush.clear();
10333 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10334 getline(ss, rs);
10335 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10336 get_last_committed() + 1));
10337 return true;
10338 }
10339 } else {
10340 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10341 err = 0;
10342 }
10343 } while (false);
// handler: "osd crush swap-bucket <source> <dest>" — swap the contents of
// two buckets; guarded by --yes-i-really-mean-it unless the source is an
// orphan and both buckets use the same bucket algorithm.
10344 } else if (prefix == "osd crush swap-bucket") {
10345 string source, dest;
10346 cmd_getval(cmdmap, "source", source);
10347 cmd_getval(cmdmap, "dest", dest);
10348
10349 bool force = false;
10350 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10351
10352 CrushWrapper newcrush;
10353 _get_pending_crush(newcrush);
10354 if (!newcrush.name_exists(source)) {
10355 ss << "source item " << source << " does not exist";
10356 err = -ENOENT;
10357 goto reply;
10358 }
10359 if (!newcrush.name_exists(dest)) {
10360 ss << "dest item " << dest << " does not exist";
10361 err = -ENOENT;
10362 goto reply;
10363 }
10364 int sid = newcrush.get_item_id(source);
10365 int did = newcrush.get_item_id(dest);
10366 int sparent;
10367 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10368 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10369 err = -EPERM;
10370 goto reply;
10371 }
10372 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10373 !force) {
10374 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10375 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10376 << "; pass --yes-i-really-mean-it to proceed anyway";
10377 err = -EPERM;
10378 goto reply;
10379 }
10380 int r = newcrush.swap_bucket(cct, sid, did);
10381 if (r < 0) {
10382 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10383 err = r;
10384 goto reply;
10385 }
10386 ss << "swapped bucket of " << source << " to " << dest;
10387 pending_inc.crush.clear();
10388 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10389 wait_for_finished_proposal(op,
10390 new Monitor::C_Command(mon, op, err, ss.str(),
10391 get_last_committed() + 1));
10392 return true;
// handler: "osd crush link <name> <loc...>" — link an existing item at an
// additional crush location (unlike move, the original placement remains).
10393 } else if (prefix == "osd crush link") {
10394 // osd crush link <name> <loc1> [<loc2> ...]
10395 string name;
10396 cmd_getval(cmdmap, "name", name);
10397 vector<string> argvec;
10398 cmd_getval(cmdmap, "args", argvec);
10399 map<string,string> loc;
10400 CrushWrapper::parse_loc_map(argvec, &loc);
10401
10402 // Need an explicit check for name_exists because get_item_id returns
10403 // 0 on unfound.
10404 int id = osdmap.crush->get_item_id(name);
10405 if (!osdmap.crush->name_exists(name)) {
10406 err = -ENOENT;
10407 ss << "item " << name << " does not exist";
10408 goto reply;
10409 } else {
10410 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10411 }
10412 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10413 ss << "no need to move item id " << id << " name '" << name
10414 << "' to location " << loc << " in crush map";
10415 err = 0;
10416 goto reply;
10417 }
10418
10419 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10420 CrushWrapper newcrush;
10421 _get_pending_crush(newcrush);
10422
// Re-check against the pending map: it can differ from the committed one.
10423 if (!newcrush.name_exists(name)) {
10424 err = -ENOENT;
10425 ss << "item " << name << " does not exist";
10426 goto reply;
10427 } else {
10428 int id = newcrush.get_item_id(name);
10429 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10430 err = newcrush.link_bucket(cct, id, loc);
10431 if (err >= 0) {
10432 ss << "linked item id " << id << " name '" << name
10433 << "' to location " << loc << " in crush map";
10434 pending_inc.crush.clear();
10435 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10436 } else {
10437 ss << "cannot link item id " << id << " name '" << name
10438 << "' to location " << loc;
10439 goto reply;
10440 }
10441 } else {
10442 ss << "no need to move item id " << id << " name '" << name
10443 << "' to location " << loc << " in crush map";
10444 err = 0;
10445 }
10446 }
10447 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10448 get_last_committed() + 1));
10449 return true;
// handlers: "osd crush rm|remove|unlink <name> [ancestor]" — remove an item
// from the crush map, or (unlink) only detach it from one/all locations.
10450 } else if (prefix == "osd crush rm" ||
10451 prefix == "osd crush remove" ||
10452 prefix == "osd crush unlink") {
10453 do {
10454 // osd crush rm <id> [ancestor]
10455 CrushWrapper newcrush;
10456 _get_pending_crush(newcrush);
10457
10458 string name;
10459 cmd_getval(cmdmap, "name", name);
10460
// Idempotent: absent in the committed map -> plain success reply;
// absent only in the pending map -> wait for the in-flight proposal.
10461 if (!osdmap.crush->name_exists(name)) {
10462 err = 0;
10463 ss << "device '" << name << "' does not appear in the crush map";
10464 break;
10465 }
10466 if (!newcrush.name_exists(name)) {
10467 err = 0;
10468 ss << "device '" << name << "' does not appear in the crush map";
10469 getline(ss, rs);
10470 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10471 get_last_committed() + 1));
10472 return true;
10473 }
10474 int id = newcrush.get_item_id(name);
10475 int ancestor = 0;
10476
10477 bool unlink_only = prefix == "osd crush unlink";
10478 string ancestor_str;
10479 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10480 if (!newcrush.name_exists(ancestor_str)) {
10481 err = -ENOENT;
10482 ss << "ancestor item '" << ancestor_str
10483 << "' does not appear in the crush map";
10484 break;
10485 }
10486 ancestor = newcrush.get_item_id(ancestor_str);
10487 }
10488
10489 err = prepare_command_osd_crush_remove(
10490 newcrush,
10491 id, ancestor,
10492 (ancestor < 0), unlink_only);
10493
10494 if (err == -ENOENT) {
10495 ss << "item " << id << " does not appear in that position";
10496 err = 0;
10497 break;
10498 }
10499 if (err == 0) {
// Full removal also clears any per-node flags recorded for the item.
10500 if (!unlink_only)
10501 pending_inc.new_crush_node_flags[id] = 0;
10502 ss << "removed item id " << id << " name '" << name << "' from crush map";
10503 getline(ss, rs);
10504 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10505 get_last_committed() + 1));
10506 return true;
10507 }
10508 } while (false);
10509
// handler: "osd crush reweight-all" — recompute bucket weights across the
// whole hierarchy and propose the result.
10510 } else if (prefix == "osd crush reweight-all") {
10511 CrushWrapper newcrush;
10512 _get_pending_crush(newcrush);
10513
10514 newcrush.reweight(cct);
10515 pending_inc.crush.clear();
10516 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10517 ss << "reweighted crush hierarchy";
10518 getline(ss, rs);
10519 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10520 get_last_committed() + 1));
10521 return true;
// handler: "osd crush reweight <name> <weight>" — set the crush weight of a
// single leaf item (id must be >= 0, i.e. a device, not a bucket).
10522 } else if (prefix == "osd crush reweight") {
10523 // osd crush reweight <name> <weight>
10524 CrushWrapper newcrush;
10525 _get_pending_crush(newcrush);
10526
10527 string name;
10528 cmd_getval(cmdmap, "name", name);
10529 if (!newcrush.name_exists(name)) {
10530 err = -ENOENT;
10531 ss << "device '" << name << "' does not appear in the crush map";
10532 goto reply;
10533 }
10534
10535 int id = newcrush.get_item_id(name);
10536 if (id < 0) {
10537 ss << "device '" << name << "' is not a leaf in the crush map";
10538 err = -EINVAL;
10539 goto reply;
10540 }
10541 double w;
10542 if (!cmd_getval(cmdmap, "weight", w)) {
10543 ss << "unable to parse weight value '"
10544 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10545 err = -EINVAL;
10546 goto reply;
10547 }
10548
10549 err = newcrush.adjust_item_weightf(cct, id, w,
10550 g_conf()->osd_crush_update_weight_set);
10551 if (err < 0)
10552 goto reply;
10553 pending_inc.crush.clear();
10554 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10555 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10556 << " in crush map";
10557 getline(ss, rs);
10558 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10559 get_last_committed() + 1));
10560 return true;
// handler: "osd crush reweight-subtree <name> <weight>" — like reweight but
// for a bucket subtree (id must be < 0); applies the weight recursively.
10561 } else if (prefix == "osd crush reweight-subtree") {
10562 // osd crush reweight <name> <weight>
10563 CrushWrapper newcrush;
10564 _get_pending_crush(newcrush);
10565
10566 string name;
10567 cmd_getval(cmdmap, "name", name);
10568 if (!newcrush.name_exists(name)) {
10569 err = -ENOENT;
10570 ss << "device '" << name << "' does not appear in the crush map";
10571 goto reply;
10572 }
10573
10574 int id = newcrush.get_item_id(name);
10575 if (id >= 0) {
10576 ss << "device '" << name << "' is not a subtree in the crush map";
10577 err = -EINVAL;
10578 goto reply;
10579 }
10580 double w;
10581 if (!cmd_getval(cmdmap, "weight", w)) {
10582 ss << "unable to parse weight value '"
10583 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10584 err = -EINVAL;
10585 goto reply;
10586 }
10587
10588 err = newcrush.adjust_subtree_weightf(cct, id, w,
10589 g_conf()->osd_crush_update_weight_set);
10590 if (err < 0)
10591 goto reply;
10592 pending_inc.crush.clear();
10593 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10594 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10595 << " in crush map";
10596 getline(ss, rs);
10597 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10598 get_last_committed() + 1));
10599 return true;
// handler: "osd crush tunables <profile>" — apply a named tunables profile
// to the crush map after validating it against the quorum's features.
10600 } else if (prefix == "osd crush tunables") {
10601 CrushWrapper newcrush;
10602 _get_pending_crush(newcrush);
10603
10604 err = 0;
10605 string profile;
10606 cmd_getval(cmdmap, "profile", profile);
10607 if (profile == "legacy" || profile == "argonaut") {
10608 newcrush.set_tunables_legacy();
10609 } else if (profile == "bobtail") {
10610 newcrush.set_tunables_bobtail();
10611 } else if (profile == "firefly") {
10612 newcrush.set_tunables_firefly();
10613 } else if (profile == "hammer") {
10614 newcrush.set_tunables_hammer();
10615 } else if (profile == "jewel") {
10616 newcrush.set_tunables_jewel();
10617 } else if (profile == "optimal") {
10618 newcrush.set_tunables_optimal();
10619 } else if (profile == "default") {
10620 newcrush.set_tunables_default();
10621 } else {
10622 ss << "unrecognized profile '" << profile << "'";
10623 err = -EINVAL;
10624 goto reply;
10625 }
10626
// Reject tunables that connected daemons/clients could not decode.
10627 if (!validate_crush_against_features(&newcrush, ss)) {
10628 err = -EINVAL;
10629 goto reply;
10630 }
10631
10632 pending_inc.crush.clear();
10633 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10634 ss << "adjusted tunables profile to " << profile;
10635 getline(ss, rs);
10636 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10637 get_last_committed() + 1));
10638 return true;
// handler: "osd crush set-tunable <tunable> <value>" — set one individual
// tunable; currently only straw_calc_version (0 or 1) is recognized.
10639 } else if (prefix == "osd crush set-tunable") {
10640 CrushWrapper newcrush;
10641 _get_pending_crush(newcrush);
10642
10643 err = 0;
10644 string tunable;
10645 cmd_getval(cmdmap, "tunable", tunable);
10646
10647 int64_t value = -1;
10648 if (!cmd_getval(cmdmap, "value", value)) {
10649 err = -EINVAL;
10650 ss << "failed to parse integer value "
10651 << cmd_vartype_stringify(cmdmap.at("value"));
10652 goto reply;
10653 }
10654
10655 if (tunable == "straw_calc_version") {
10656 if (value != 0 && value != 1) {
10657 ss << "value must be 0 or 1; got " << value;
10658 err = -EINVAL;
10659 goto reply;
10660 }
10661 newcrush.set_straw_calc_version(value);
10662 } else {
10663 ss << "unrecognized tunable '" << tunable << "'";
10664 err = -EINVAL;
10665 goto reply;
10666 }
10667
10668 if (!validate_crush_against_features(&newcrush, ss)) {
10669 err = -EINVAL;
10670 goto reply;
10671 }
10672
10673 pending_inc.crush.clear();
10674 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10675 ss << "adjusted tunable " << tunable << " to " << value;
10676 getline(ss, rs);
10677 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10678 get_last_committed() + 1));
10679 return true;
10680
// handler: "osd crush rule create-simple <name> <root> <type> [mode]" —
// create a replicated crush rule; mode defaults to "firstn".
10681 } else if (prefix == "osd crush rule create-simple") {
10682 string name, root, type, mode;
10683 cmd_getval(cmdmap, "name", name);
10684 cmd_getval(cmdmap, "root", root);
10685 cmd_getval(cmdmap, "type", type);
10686 cmd_getval(cmdmap, "mode", mode);
10687 if (mode == "")
10688 mode = "firstn";
10689
10690 if (osdmap.crush->rule_exists(name)) {
10691 // The name is uniquely associated to a ruleid and the rule it contains
10692 // From the user point of view, the rule is more meaningfull.
10693 ss << "rule " << name << " already exists";
10694 err = 0;
10695 goto reply;
10696 }
10697
10698 CrushWrapper newcrush;
10699 _get_pending_crush(newcrush);
10700
10701 if (newcrush.rule_exists(name)) {
10702 // The name is uniquely associated to a ruleid and the rule it contains
10703 // From the user point of view, the rule is more meaningfull.
10704 ss << "rule " << name << " already exists";
10705 err = 0;
10706 } else {
// No device-class filter ("") for the simple variant.
10707 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
10708 pg_pool_t::TYPE_REPLICATED, &ss);
10709 if (ruleno < 0) {
10710 err = ruleno;
10711 goto reply;
10712 }
10713
10714 pending_inc.crush.clear();
10715 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10716 }
10717 getline(ss, rs);
10718 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10719 get_last_committed() + 1));
10720 return true;
10721
// handler: "osd crush rule create-replicated <name> <root> <type> [class]" —
// like create-simple but always "firstn" and with an optional device-class
// filter applied to the rule.
10722 } else if (prefix == "osd crush rule create-replicated") {
10723 string name, root, type, device_class;
10724 cmd_getval(cmdmap, "name", name);
10725 cmd_getval(cmdmap, "root", root);
10726 cmd_getval(cmdmap, "type", type);
10727 cmd_getval(cmdmap, "class", device_class);
10728
10729 if (osdmap.crush->rule_exists(name)) {
10730 // The name is uniquely associated to a ruleid and the rule it contains
10731 // From the user point of view, the rule is more meaningfull.
10732 ss << "rule " << name << " already exists";
10733 err = 0;
10734 goto reply;
10735 }
10736
10737 CrushWrapper newcrush;
10738 _get_pending_crush(newcrush);
10739
10740 if (newcrush.rule_exists(name)) {
10741 // The name is uniquely associated to a ruleid and the rule it contains
10742 // From the user point of view, the rule is more meaningfull.
10743 ss << "rule " << name << " already exists";
10744 err = 0;
10745 } else {
10746 int ruleno = newcrush.add_simple_rule(
10747 name, root, type, device_class,
10748 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10749 if (ruleno < 0) {
10750 err = ruleno;
10751 goto reply;
10752 }
10753
10754 pending_inc.crush.clear();
10755 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10756 }
10757 getline(ss, rs);
10758 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10759 get_last_committed() + 1));
10760 return true;
10761
// handler: "osd erasure-code-profile rm <name>" — delete an EC profile
// unless any committed (or pending) pool still uses it.
10762 } else if (prefix == "osd erasure-code-profile rm") {
10763 string name;
10764 cmd_getval(cmdmap, "name", name);
10765
// In use by a pool still being created this epoch: retry after commit.
10766 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10767 goto wait;
10768
10769 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10770 err = -EBUSY;
10771 goto reply;
10772 }
10773
10774 if (osdmap.has_erasure_code_profile(name) ||
10775 pending_inc.new_erasure_code_profiles.count(name)) {
10776 if (osdmap.has_erasure_code_profile(name)) {
10777 pending_inc.old_erasure_code_profiles.push_back(name);
10778 } else {
// Profile only exists in the pending increment: cancel its creation.
10779 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10780 pending_inc.new_erasure_code_profiles.erase(name);
10781 }
10782
10783 getline(ss, rs);
10784 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10785 get_last_committed() + 1));
10786 return true;
10787 } else {
10788 ss << "erasure-code-profile " << name << " does not exist";
10789 err = 0;
10790 goto reply;
10791 }
10792
// handler: "osd erasure-code-profile set <name> [k=v...] [--force]" —
// create or (with --force) overwrite an EC profile; a "plugin" entry is
// mandatory, and profiles are normalized before comparison/storage.
10793 } else if (prefix == "osd erasure-code-profile set") {
10794 string name;
10795 cmd_getval(cmdmap, "name", name);
10796 vector<string> profile;
10797 cmd_getval(cmdmap, "profile", profile);
10798
10799 bool force = false;
10800 cmd_getval(cmdmap, "force", force);
10801
10802 map<string,string> profile_map;
10803 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10804 if (err)
10805 goto reply;
10806 if (profile_map.find("plugin") == profile_map.end()) {
10807 ss << "erasure-code-profile " << profile_map
10808 << " must contain a plugin entry" << std::endl;
10809 err = -EINVAL;
10810 goto reply;
10811 }
10812 string plugin = profile_map["plugin"];
10813
// A same-named profile already queued this epoch: retry after it commits.
10814 if (pending_inc.has_erasure_code_profile(name)) {
10815 dout(20) << "erasure code profile " << name << " try again" << dendl;
10816 goto wait;
10817 } else {
10818 err = normalize_profile(name, profile_map, force, &ss);
10819 if (err)
10820 goto reply;
10821
10822 if (osdmap.has_erasure_code_profile(name)) {
10823 ErasureCodeProfile existing_profile_map =
10824 osdmap.get_erasure_code_profile(name);
10825 err = normalize_profile(name, existing_profile_map, force, &ss);
10826 if (err)
10827 goto reply;
10828
// Identical after normalization: nothing to do (idempotent success).
10829 if (existing_profile_map == profile_map) {
10830 err = 0;
10831 goto reply;
10832 }
10833 if (!force) {
10834 err = -EPERM;
10835 ss << "will not override erasure code profile " << name
10836 << " because the existing profile "
10837 << existing_profile_map
10838 << " is different from the proposed profile "
10839 << profile_map;
10840 goto reply;
10841 }
10842 }
10843
10844 dout(20) << "erasure code profile set " << name << "="
10845 << profile_map << dendl;
10846 pending_inc.set_erasure_code_profile(name, profile_map);
10847 }
10848
10849 getline(ss, rs);
10850 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10851 get_last_committed() + 1));
10852 return true;
10853
10854 } else if (prefix == "osd crush rule create-erasure") {
10855 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10856 if (err == -EAGAIN)
10857 goto wait;
10858 if (err)
10859 goto reply;
10860 string name, poolstr;
10861 cmd_getval(cmdmap, "name", name);
10862 string profile;
10863 cmd_getval(cmdmap, "profile", profile);
10864 if (profile == "")
10865 profile = "default";
10866 if (profile == "default") {
10867 if (!osdmap.has_erasure_code_profile(profile)) {
10868 if (pending_inc.has_erasure_code_profile(profile)) {
10869 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10870 goto wait;
10871 }
10872
10873 map<string,string> profile_map;
10874 err = osdmap.get_erasure_code_profile_default(cct,
10875 profile_map,
10876 &ss);
10877 if (err)
10878 goto reply;
10879 err = normalize_profile(name, profile_map, true, &ss);
10880 if (err)
10881 goto reply;
10882 dout(20) << "erasure code profile set " << profile << "="
10883 << profile_map << dendl;
10884 pending_inc.set_erasure_code_profile(profile, profile_map);
10885 goto wait;
10886 }
10887 }
10888
10889 int rule;
10890 err = crush_rule_create_erasure(name, profile, &rule, &ss);
10891 if (err < 0) {
10892 switch(err) {
10893 case -EEXIST: // return immediately
10894 ss << "rule " << name << " already exists";
10895 err = 0;
10896 goto reply;
10897 break;
10898 case -EALREADY: // wait for pending to be proposed
10899 ss << "rule " << name << " already exists";
10900 err = 0;
10901 break;
10902 default: // non recoverable error
10903 goto reply;
10904 break;
10905 }
10906 } else {
10907 ss << "created rule " << name << " at " << rule;
10908 }
10909
10910 getline(ss, rs);
10911 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10912 get_last_committed() + 1));
10913 return true;
10914
  } else if (prefix == "osd crush rule rm") {
    // Remove a CRUSH rule by name, refusing if any pool still uses it.
    string name;
    cmd_getval(cmdmap, "name", name);

    // Absent from the committed map: idempotent success.
    if (!osdmap.crush->rule_exists(name)) {
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // Already removed in the pending crush map; nothing more to do.
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      ceph_assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_rule_in_use(ruleset)) {
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
        goto reply;
      }

      // Stage the full updated crush map in the pending increment.
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rename") {
    // Rename a CRUSH rule.  Designed to be idempotent across replays.
    string srcname;
    string dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);
    if (srcname.empty() || dstname.empty()) {
      ss << "must specify both source rule name and destination rule name";
      err = -EINVAL;
      goto reply;
    }
    if (srcname == dstname) {
      ss << "destination rule name is equal to source rule name";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
      // srcname does not exist and dstname already exists
      // suppose this is a replay and return success
      // (so this command is idempotent)
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_rule(srcname, dstname, &ss);
    if (err < 0) {
      // ss has reason for failure
      goto reply;
    }
    // Stage the full updated crush map in the pending increment.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd setmaxosd") {
    // Set max_osd (the highest allowed OSD id + 1, not the OSD count).
    int64_t newmax;
    if (!cmd_getval(cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
      err = -EINVAL;
      goto reply;
    }

    // Upper bound comes from the mon_max_osd config option.
    if (newmax > g_conf()->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
         << g_conf()->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
        if (osdmap.exists(i)) {
          err = -EBUSY;
          ss << "cannot shrink max_osd to " << newmax
             << " because osd." << i << " (and possibly others) still in use";
          goto reply;
        }
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd set-full-ratio" ||
             prefix == "osd set-backfillfull-ratio" ||
             prefix == "osd set-nearfull-ratio") {
    // Update one of the cluster-wide fullness thresholds.
    // NOTE(review): no range validation here — values outside [0,1] are
    // accepted as-is; confirm whether that is intentional.
    double n;
    if (!cmd_getval(cmdmap, "ratio", n)) {
      ss << "unable to parse 'ratio' value '"
         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (prefix == "osd set-full-ratio")
      pending_inc.new_full_ratio = n;
    else if (prefix == "osd set-backfillfull-ratio")
      pending_inc.new_backfillfull_ratio = n;
    else if (prefix == "osd set-nearfull-ratio")
      pending_inc.new_nearfull_ratio = n;
    ss << prefix << " " << n;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise (or set) the minimum client release allowed to connect.
    string v;
    cmd_getval(cmdmap, "version", v);
    ceph_release_t vno = ceph_release_from_name(v);
    if (!vno) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Apply the pending increment to a scratch map so the check below
    // reflects what the map will look like after this proposal.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    // Features already in use in the map impose a floor on this setting.
    if (vno < mvno) {
      ss << "osdmap current utilizes features that require " << mvno
         << "; cannot set require_min_compat_client below that to " << vno;
      err = -EPERM;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      // Without --yes-i-really-mean-it, refuse if any currently connected
      // client/mds/mgr daemon appears to lack the required features.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          // Feature bits required by the release but absent in this
          // connected entity's feature set.
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << vno;
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // Pause both reads and writes cluster-wide.
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    // Resume both reads and writes cluster-wide.
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a cluster-wide OSDMap flag by name.
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);

    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else if (key == "pglog_hardlimit") {
      // Irreversible-feature flag: requires up OSDs (or --yes-i-really-mean-it)
      // so we can verify feature support before committing.
      if (!osdmap.get_num_up_osds() && !sure) {
        ss << "Not advisable to continue since no OSDs are up. Pass "
           << "--yes-i-really-mean-it if you really wish to continue.";
        err = -EPERM;
        goto reply;
      }
      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
      // we are reusing a jewel feature bit that was retired in luminous.
      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
          (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
           || sure)) {
        return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
      } else {
        ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd unset") {
    // Clear a cluster-wide OSDMap flag by name.  Note: pglog_hardlimit is
    // not listed here; once set it is not cleared via this command.
    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd require-osd-release") {
    // Raise require_osd_release to the named release after verifying that
    // all mons (and, unless --yes-i-really-mean-it, all up OSDs) support it.
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
    if (!osdmap.get_num_up_osds() && !sure) {
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply;
    }
    // Per-release gates: all mons must advertise the release's mon feature,
    // and all up OSDs must advertise the matching server feature bit.
    if (rel == ceph_release_t::mimic) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_MIMIC)) {
        ss << "not all mons are mimic";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::nautilus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_NAUTILUS)) {
        ss << "not all mons are nautilus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::octopus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_OCTOPUS)) {
        ss << "not all mons are octopus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    // This setting is a ratchet; it can only move forward.
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
11288 } else if (prefix == "osd down" ||
11289 prefix == "osd out" ||
11290 prefix == "osd in" ||
11291 prefix == "osd rm" ||
11292 prefix == "osd stop") {
11293
11294 bool any = false;
11295 bool stop = false;
11296 bool verbose = true;
11297 bool definitely_dead = false;
11298
11299 vector<string> idvec;
11300 cmd_getval(cmdmap, "ids", idvec);
11301 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11302 derr << "definitely_dead " << (int)definitely_dead << dendl;
11303 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11304 set<int> osds;
11305
11306 // wildcard?
11307 if (j == 0 &&
11308 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11309 if (prefix == "osd in") {
11310 // touch out osds only
11311 osdmap.get_out_existing_osds(osds);
11312 } else {
11313 osdmap.get_all_osds(osds);
11314 }
11315 stop = true;
11316 verbose = false; // so the output is less noisy.
11317 } else {
11318 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11319 if (osd < 0) {
11320 ss << "invalid osd id" << osd;
11321 err = -EINVAL;
11322 continue;
11323 } else if (!osdmap.exists(osd)) {
11324 ss << "osd." << osd << " does not exist. ";
11325 continue;
11326 }
11327
11328 osds.insert(osd);
11329 }
11330
11331 for (auto &osd : osds) {
11332 if (prefix == "osd down") {
11333 if (osdmap.is_down(osd)) {
11334 if (verbose)
11335 ss << "osd." << osd << " is already down. ";
11336 } else {
11337 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11338 ss << "marked down osd." << osd << ". ";
11339 any = true;
11340 }
11341 if (definitely_dead) {
11342 if (!pending_inc.new_xinfo.count(osd)) {
11343 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11344 }
11345 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11346 any = true;
11347 }
11348 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11349 }
11350 } else if (prefix == "osd out") {
11351 if (osdmap.is_out(osd)) {
11352 if (verbose)
11353 ss << "osd." << osd << " is already out. ";
11354 } else {
11355 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11356 if (osdmap.osd_weight[osd]) {
11357 if (pending_inc.new_xinfo.count(osd) == 0) {
11358 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11359 }
11360 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11361 }
11362 ss << "marked out osd." << osd << ". ";
11363 std::ostringstream msg;
11364 msg << "Client " << op->get_session()->entity_name
11365 << " marked osd." << osd << " out";
11366 if (osdmap.is_up(osd)) {
11367 msg << ", while it was still marked up";
11368 } else {
11369 auto period = ceph_clock_now() - down_pending_out[osd];
11370 msg << ", after it was down for " << int(period.sec())
11371 << " seconds";
11372 }
11373
11374 mon->clog->info() << msg.str();
11375 any = true;
11376 }
11377 } else if (prefix == "osd in") {
11378 if (osdmap.is_in(osd)) {
11379 if (verbose)
11380 ss << "osd." << osd << " is already in. ";
11381 } else {
11382 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11383 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11384 if (pending_inc.new_xinfo.count(osd) == 0) {
11385 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11386 }
11387 pending_inc.new_xinfo[osd].old_weight = 0;
11388 } else {
11389 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11390 }
11391 ss << "marked in osd." << osd << ". ";
11392 any = true;
11393 }
11394 } else if (prefix == "osd rm") {
11395 err = prepare_command_osd_remove(osd);
11396
11397 if (err == -EBUSY) {
11398 if (any)
11399 ss << ", ";
11400 ss << "osd." << osd << " is still up; must be down before removal. ";
11401 } else {
11402 ceph_assert(err == 0);
11403 if (any) {
11404 ss << ", osd." << osd;
11405 } else {
11406 ss << "removed osd." << osd;
11407 }
11408 any = true;
11409 }
11410 } else if (prefix == "osd stop") {
11411 if (osdmap.is_stop(osd)) {
11412 if (verbose)
11413 ss << "osd." << osd << " is already stopped. ";
11414 } else if (osdmap.is_down(osd)) {
11415 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11416 ss << "stop down osd." << osd << ". ";
11417 any = true;
11418 } else {
11419 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11420 ss << "stop osd." << osd << ". ";
11421 any = true;
11422 }
11423 }
11424 }
11425 }
11426 if (any) {
11427 getline(ss, rs);
11428 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11429 get_last_committed() + 1));
11430 return true;
11431 }
11432 } else if (prefix == "osd set-group" ||
11433 prefix == "osd unset-group" ||
11434 prefix == "osd add-noup" ||
11435 prefix == "osd add-nodown" ||
11436 prefix == "osd add-noin" ||
11437 prefix == "osd add-noout" ||
11438 prefix == "osd rm-noup" ||
11439 prefix == "osd rm-nodown" ||
11440 prefix == "osd rm-noin" ||
11441 prefix == "osd rm-noout") {
11442 bool do_set = prefix == "osd set-group" ||
11443 prefix.find("add") != string::npos;
11444 string flag_str;
11445 unsigned flags = 0;
11446 vector<string> who;
11447 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11448 cmd_getval(cmdmap, "flags", flag_str);
11449 cmd_getval(cmdmap, "who", who);
11450 vector<string> raw_flags;
11451 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11452 for (auto& f : raw_flags) {
11453 if (f == "noup")
11454 flags |= CEPH_OSD_NOUP;
11455 else if (f == "nodown")
11456 flags |= CEPH_OSD_NODOWN;
11457 else if (f == "noin")
11458 flags |= CEPH_OSD_NOIN;
11459 else if (f == "noout")
11460 flags |= CEPH_OSD_NOOUT;
11461 else {
11462 ss << "unrecognized flag '" << f << "', must be one of "
11463 << "{noup,nodown,noin,noout}";
11464 err = -EINVAL;
11465 goto reply;
11466 }
11467 }
11468 } else {
11469 cmd_getval(cmdmap, "ids", who);
11470 if (prefix.find("noup") != string::npos)
11471 flags = CEPH_OSD_NOUP;
11472 else if (prefix.find("nodown") != string::npos)
11473 flags = CEPH_OSD_NODOWN;
11474 else if (prefix.find("noin") != string::npos)
11475 flags = CEPH_OSD_NOIN;
11476 else if (prefix.find("noout") != string::npos)
11477 flags = CEPH_OSD_NOOUT;
11478 else
11479 ceph_assert(0 == "Unreachable!");
11480 }
11481 if (flags == 0) {
11482 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11483 err = -EINVAL;
11484 goto reply;
11485 }
11486 if (who.empty()) {
11487 ss << "must specify at least one or more targets to set/unset";
11488 err = -EINVAL;
11489 goto reply;
11490 }
11491 set<int> osds;
11492 set<int> crush_nodes;
11493 set<int> device_classes;
11494 for (auto& w : who) {
11495 if (w == "any" || w == "all" || w == "*") {
11496 osdmap.get_all_osds(osds);
11497 break;
11498 }
11499 std::stringstream ts;
11500 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11501 osds.insert(osd);
11502 } else if (osdmap.crush->name_exists(w)) {
11503 crush_nodes.insert(osdmap.crush->get_item_id(w));
11504 } else if (osdmap.crush->class_exists(w)) {
11505 device_classes.insert(osdmap.crush->get_class_id(w));
11506 } else {
11507 ss << "unable to parse osd id or crush node or device class: "
11508 << "\"" << w << "\". ";
11509 }
11510 }
11511 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11512 // ss has reason for failure
11513 err = -EINVAL;
11514 goto reply;
11515 }
11516 bool any = false;
11517 for (auto osd : osds) {
11518 if (!osdmap.exists(osd)) {
11519 ss << "osd." << osd << " does not exist. ";
11520 continue;
11521 }
11522 if (do_set) {
11523 if (flags & CEPH_OSD_NOUP) {
11524 any |= osdmap.is_noup_by_osd(osd) ?
11525 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11526 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11527 }
11528 if (flags & CEPH_OSD_NODOWN) {
11529 any |= osdmap.is_nodown_by_osd(osd) ?
11530 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11531 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11532 }
11533 if (flags & CEPH_OSD_NOIN) {
11534 any |= osdmap.is_noin_by_osd(osd) ?
11535 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11536 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11537 }
11538 if (flags & CEPH_OSD_NOOUT) {
11539 any |= osdmap.is_noout_by_osd(osd) ?
11540 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11541 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11542 }
11543 } else {
11544 if (flags & CEPH_OSD_NOUP) {
11545 any |= osdmap.is_noup_by_osd(osd) ?
11546 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11547 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11548 }
11549 if (flags & CEPH_OSD_NODOWN) {
11550 any |= osdmap.is_nodown_by_osd(osd) ?
11551 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11552 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11553 }
11554 if (flags & CEPH_OSD_NOIN) {
11555 any |= osdmap.is_noin_by_osd(osd) ?
11556 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11557 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11558 }
11559 if (flags & CEPH_OSD_NOOUT) {
11560 any |= osdmap.is_noout_by_osd(osd) ?
11561 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11562 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11563 }
11564 }
11565 }
11566 for (auto& id : crush_nodes) {
11567 auto old_flags = osdmap.get_crush_node_flags(id);
11568 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11569 pending_flags |= old_flags; // adopt existing flags first!
11570 if (do_set) {
11571 pending_flags |= flags;
11572 } else {
11573 pending_flags &= ~flags;
11574 }
11575 any = true;
11576 }
11577 for (auto& id : device_classes) {
11578 auto old_flags = osdmap.get_device_class_flags(id);
11579 auto& pending_flags = pending_inc.new_device_class_flags[id];
11580 pending_flags |= old_flags;
11581 if (do_set) {
11582 pending_flags |= flags;
11583 } else {
11584 pending_flags &= ~flags;
11585 }
11586 any = true;
11587 }
11588 if (any) {
11589 getline(ss, rs);
11590 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11591 get_last_committed() + 1));
11592 return true;
11593 }
  } else if (prefix == "osd pg-temp") {
    // Set (or clear, when no ids are given) the pg_temp mapping for a PG.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // A pg_temp change for this PG is already staged; retry after it lands.
    if (pending_inc.new_pg_temp.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty()) {
      // Empty id list means "clear the pg_temp mapping".
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // The temp acting set must still satisfy the pool's size bounds.
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Set a temporary primary for a PG (id == -1 clears the override).
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary_temp requires clients that understand it (firefly or later).
    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to repeer by perturbing its pg_temp mapping.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change. Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        // Skip the current primary and any OSD that is not up/existing.
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
11746 } else if (prefix == "osd pg-upmap" ||
11747 prefix == "osd rm-pg-upmap" ||
11748 prefix == "osd pg-upmap-items" ||
11749 prefix == "osd rm-pg-upmap-items") {
11750 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11751 ss << "min_compat_client "
11752 << osdmap.require_min_compat_client
11753 << " < luminous, which is required for pg-upmap. "
11754 << "Try 'ceph osd set-require-min-compat-client luminous' "
11755 << "before using the new interface";
11756 err = -EPERM;
11757 goto reply;
11758 }
11759 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11760 if (err == -EAGAIN)
11761 goto wait;
11762 if (err < 0)
11763 goto reply;
11764 string pgidstr;
11765 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11766 ss << "unable to parse 'pgid' value '"
11767 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11768 err = -EINVAL;
11769 goto reply;
11770 }
11771 pg_t pgid;
11772 if (!pgid.parse(pgidstr.c_str())) {
11773 ss << "invalid pgid '" << pgidstr << "'";
11774 err = -EINVAL;
11775 goto reply;
11776 }
11777 if (!osdmap.pg_exists(pgid)) {
11778 ss << "pg " << pgid << " does not exist";
11779 err = -ENOENT;
11780 goto reply;
11781 }
11782 if (pending_inc.old_pools.count(pgid.pool())) {
11783 ss << "pool of " << pgid << " is pending removal";
11784 err = -ENOENT;
11785 getline(ss, rs);
11786 wait_for_finished_proposal(op,
11787 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11788 return true;
11789 }
11790
11791 enum {
11792 OP_PG_UPMAP,
11793 OP_RM_PG_UPMAP,
11794 OP_PG_UPMAP_ITEMS,
11795 OP_RM_PG_UPMAP_ITEMS,
11796 } option;
11797
11798 if (prefix == "osd pg-upmap") {
11799 option = OP_PG_UPMAP;
11800 } else if (prefix == "osd rm-pg-upmap") {
11801 option = OP_RM_PG_UPMAP;
11802 } else if (prefix == "osd pg-upmap-items") {
11803 option = OP_PG_UPMAP_ITEMS;
11804 } else {
11805 option = OP_RM_PG_UPMAP_ITEMS;
11806 }
11807
11808 // check pending upmap changes
11809 switch (option) {
11810 case OP_PG_UPMAP: // fall through
11811 case OP_RM_PG_UPMAP:
11812 if (pending_inc.new_pg_upmap.count(pgid) ||
11813 pending_inc.old_pg_upmap.count(pgid)) {
11814 dout(10) << __func__ << " waiting for pending update on "
11815 << pgid << dendl;
11816 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11817 return true;
11818 }
11819 break;
11820
11821 case OP_PG_UPMAP_ITEMS: // fall through
11822 case OP_RM_PG_UPMAP_ITEMS:
11823 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11824 pending_inc.old_pg_upmap_items.count(pgid)) {
11825 dout(10) << __func__ << " waiting for pending update on "
11826 << pgid << dendl;
11827 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11828 return true;
11829 }
11830 break;
11831
11832 default:
11833 ceph_abort_msg("invalid option");
11834 }
11835
11836 switch (option) {
11837 case OP_PG_UPMAP:
11838 {
11839 vector<int64_t> id_vec;
11840 if (!cmd_getval(cmdmap, "id", id_vec)) {
11841 ss << "unable to parse 'id' value(s) '"
11842 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11843 err = -EINVAL;
11844 goto reply;
11845 }
11846
11847 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11848 if ((int)id_vec.size() < pool_min_size) {
11849 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11850 << pool_min_size << ")";
11851 err = -EINVAL;
11852 goto reply;
11853 }
11854
11855 int pool_size = osdmap.get_pg_pool_size(pgid);
11856 if ((int)id_vec.size() > pool_size) {
11857 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11858 << pool_size << ")";
11859 err = -EINVAL;
11860 goto reply;
11861 }
11862
11863 vector<int32_t> new_pg_upmap;
11864 for (auto osd : id_vec) {
11865 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11866 ss << "osd." << osd << " does not exist";
11867 err = -ENOENT;
11868 goto reply;
11869 }
11870 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11871 if (it != new_pg_upmap.end()) {
11872 ss << "osd." << osd << " already exists, ";
11873 continue;
11874 }
11875 new_pg_upmap.push_back(osd);
11876 }
11877
11878 if (new_pg_upmap.empty()) {
11879 ss << "no valid upmap items(pairs) is specified";
11880 err = -EINVAL;
11881 goto reply;
11882 }
11883
11884 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11885 new_pg_upmap.begin(), new_pg_upmap.end());
11886 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
11887 }
11888 break;
11889
11890 case OP_RM_PG_UPMAP:
11891 {
11892 pending_inc.old_pg_upmap.insert(pgid);
11893 ss << "clear " << pgid << " pg_upmap mapping";
11894 }
11895 break;
11896
    case OP_PG_UPMAP_ITEMS:
      {
        // Parse a flat list of osd ids as (from, to) remap pairs for this
        // PG's pg_upmap_items entry in the pending increment.
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // Ids must come in (from, to) pairs.
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        // More remap pairs than the pool has replicas cannot all apply.
        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        // 'items' accumulates a human-readable "[from->to,...]" summary
        // for the reply message.
        ostringstream items;
        items << "[";
        // Two-at-a-time walk: '*p++' consumes the "from" id and leaves p
        // on the "to" id; the loop's own ++p then advances to the next
        // pair.  Safe because id_vec.size() was verified even above.
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;
          int to = *p;
          if (from == to) {
            // No-op mapping; skip it but mention it in the reply.
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // CRUSH_ITEM_NONE is explicitly permitted as a "to" target.
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          // Drop exact duplicate pairs, keeping the first occurrence.
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        out.resize(out.size() - 1); // drop last ','
        out += "]";
        // NOTE(review): if every pair was filtered out, 'out' is derived
        // from just "[" here, but it is never printed in that case -- the
        // empty check below replies with -EINVAL first.

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;
11967
11968 case OP_RM_PG_UPMAP_ITEMS:
11969 {
11970 pending_inc.old_pg_upmap_items.insert(pgid);
11971 ss << "clear " << pgid << " pg_upmap_items mapping";
11972 }
11973 break;
11974
11975 default:
11976 ceph_abort_msg("invalid option");
11977 }
11978
11979 goto update;
11980 } else if (prefix == "osd primary-affinity") {
11981 int64_t id;
11982 if (!cmd_getval(cmdmap, "id", id)) {
11983 ss << "invalid osd id value '"
11984 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11985 err = -EINVAL;
11986 goto reply;
11987 }
11988 double w;
11989 if (!cmd_getval(cmdmap, "weight", w)) {
11990 ss << "unable to parse 'weight' value '"
11991 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11992 err = -EINVAL;
11993 goto reply;
11994 }
11995 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11996 if (ww < 0L) {
11997 ss << "weight must be >= 0";
11998 err = -EINVAL;
11999 goto reply;
12000 }
12001 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12002 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12003 ss << "require_min_compat_client "
12004 << osdmap.require_min_compat_client
12005 << " < firefly, which is required for primary-affinity";
12006 err = -EPERM;
12007 goto reply;
12008 }
12009 if (osdmap.exists(id)) {
12010 pending_inc.new_primary_affinity[id] = ww;
12011 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
12012 getline(ss, rs);
12013 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12014 get_last_committed() + 1));
12015 return true;
12016 } else {
12017 ss << "osd." << id << " does not exist";
12018 err = -ENOENT;
12019 goto reply;
12020 }
12021 } else if (prefix == "osd reweight") {
12022 int64_t id;
12023 if (!cmd_getval(cmdmap, "id", id)) {
12024 ss << "unable to parse osd id value '"
12025 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12026 err = -EINVAL;
12027 goto reply;
12028 }
12029 double w;
12030 if (!cmd_getval(cmdmap, "weight", w)) {
12031 ss << "unable to parse weight value '"
12032 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12033 err = -EINVAL;
12034 goto reply;
12035 }
12036 long ww = (int)((double)CEPH_OSD_IN*w);
12037 if (ww < 0L) {
12038 ss << "weight must be >= 0";
12039 err = -EINVAL;
12040 goto reply;
12041 }
12042 if (osdmap.exists(id)) {
12043 pending_inc.new_weight[id] = ww;
12044 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12045 getline(ss, rs);
12046 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12047 get_last_committed() + 1));
12048 return true;
12049 } else {
12050 ss << "osd." << id << " does not exist";
12051 err = -ENOENT;
12052 goto reply;
12053 }
12054 } else if (prefix == "osd reweightn") {
12055 map<int32_t, uint32_t> weights;
12056 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12057 if (err) {
12058 ss << "unable to parse 'weights' value '"
12059 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12060 goto reply;
12061 }
12062 pending_inc.new_weight.insert(weights.begin(), weights.end());
12063 wait_for_finished_proposal(
12064 op,
12065 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12066 return true;
12067 } else if (prefix == "osd lost") {
12068 int64_t id;
12069 if (!cmd_getval(cmdmap, "id", id)) {
12070 ss << "unable to parse osd id value '"
12071 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12072 err = -EINVAL;
12073 goto reply;
12074 }
12075 bool sure = false;
12076 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12077 if (!sure) {
12078 ss << "are you SURE? this might mean real, permanent data loss. pass "
12079 "--yes-i-really-mean-it if you really do.";
12080 err = -EPERM;
12081 goto reply;
12082 } else if (!osdmap.exists(id)) {
12083 ss << "osd." << id << " does not exist";
12084 err = -ENOENT;
12085 goto reply;
12086 } else if (!osdmap.is_down(id)) {
12087 ss << "osd." << id << " is not down";
12088 err = -EBUSY;
12089 goto reply;
12090 } else {
12091 epoch_t e = osdmap.get_info(id).down_at;
12092 pending_inc.new_lost[id] = e;
12093 ss << "marked osd lost in epoch " << e;
12094 getline(ss, rs);
12095 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12096 get_last_committed() + 1));
12097 return true;
12098 }
12099
12100 } else if (prefix == "osd destroy-actual" ||
12101 prefix == "osd purge-actual" ||
12102 prefix == "osd purge-new") {
12103 /* Destroying an OSD means that we don't expect to further make use of
12104 * the OSDs data (which may even become unreadable after this operation),
12105 * and that we are okay with scrubbing all its cephx keys and config-key
12106 * data (which may include lockbox keys, thus rendering the osd's data
12107 * unreadable).
12108 *
12109 * The OSD will not be removed. Instead, we will mark it as destroyed,
12110 * such that a subsequent call to `create` will not reuse the osd id.
12111 * This will play into being able to recreate the OSD, at the same
12112 * crush location, with minimal data movement.
12113 */
12114
12115 // make sure authmon is writeable.
12116 if (!mon->authmon()->is_writeable()) {
12117 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12118 << "osd destroy" << dendl;
12119 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12120 return false;
12121 }
12122
12123 int64_t id;
12124 if (!cmd_getval(cmdmap, "id", id)) {
12125 auto p = cmdmap.find("id");
12126 if (p == cmdmap.end()) {
12127 ss << "no osd id specified";
12128 } else {
12129 ss << "unable to parse osd id value '"
12130 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12131 }
12132 err = -EINVAL;
12133 goto reply;
12134 }
12135
12136 bool is_destroy = (prefix == "osd destroy-actual");
12137 if (!is_destroy) {
12138 ceph_assert("osd purge-actual" == prefix ||
12139 "osd purge-new" == prefix);
12140 }
12141
12142 bool sure = false;
12143 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12144 if (!sure) {
12145 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12146 << "This will mean real, permanent data loss, as well "
12147 << "as deletion of cephx and lockbox keys. "
12148 << "Pass --yes-i-really-mean-it if you really do.";
12149 err = -EPERM;
12150 goto reply;
12151 } else if (!osdmap.exists(id)) {
12152 ss << "osd." << id << " does not exist";
12153 err = 0; // idempotent
12154 goto reply;
12155 } else if (osdmap.is_up(id)) {
12156 ss << "osd." << id << " is not `down`.";
12157 err = -EBUSY;
12158 goto reply;
12159 } else if (is_destroy && osdmap.is_destroyed(id)) {
12160 ss << "destroyed osd." << id;
12161 err = 0;
12162 goto reply;
12163 }
12164
12165 if (prefix == "osd purge-new" &&
12166 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12167 ss << "osd." << id << " is not new";
12168 err = -EPERM;
12169 goto reply;
12170 }
12171
12172 bool goto_reply = false;
12173
12174 paxos->plug();
12175 if (is_destroy) {
12176 err = prepare_command_osd_destroy(id, ss);
12177 // we checked above that it should exist.
12178 ceph_assert(err != -ENOENT);
12179 } else {
12180 err = prepare_command_osd_purge(id, ss);
12181 if (err == -ENOENT) {
12182 err = 0;
12183 ss << "osd." << id << " does not exist.";
12184 goto_reply = true;
12185 }
12186 }
12187 paxos->unplug();
12188
12189 if (err < 0 || goto_reply) {
12190 goto reply;
12191 }
12192
12193 if (is_destroy) {
12194 ss << "destroyed osd." << id;
12195 } else {
12196 ss << "purged osd." << id;
12197 }
12198
12199 getline(ss, rs);
12200 wait_for_finished_proposal(op,
12201 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12202 force_immediate_propose();
12203 return true;
12204
12205 } else if (prefix == "osd new") {
12206
12207 // make sure authmon is writeable.
12208 if (!mon->authmon()->is_writeable()) {
12209 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12210 << "osd new" << dendl;
12211 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12212 return false;
12213 }
12214
12215 map<string,string> param_map;
12216
12217 bufferlist bl = m->get_data();
12218 string param_json = bl.to_str();
12219 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12220
12221 err = get_json_str_map(param_json, ss, &param_map);
12222 if (err < 0)
12223 goto reply;
12224
12225 dout(20) << __func__ << " osd new params " << param_map << dendl;
12226
12227 paxos->plug();
12228 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12229 paxos->unplug();
12230
12231 if (err < 0) {
12232 goto reply;
12233 }
12234
12235 if (f) {
12236 f->flush(rdata);
12237 } else {
12238 rdata.append(ss);
12239 }
12240
12241 if (err == EEXIST) {
12242 // idempotent operation
12243 err = 0;
12244 goto reply;
12245 }
12246
12247 wait_for_finished_proposal(op,
12248 new Monitor::C_Command(mon, op, 0, rs, rdata,
12249 get_last_committed() + 1));
12250 force_immediate_propose();
12251 return true;
12252
12253 } else if (prefix == "osd create") {
12254
12255 // optional id provided?
12256 int64_t id = -1, cmd_id = -1;
12257 if (cmd_getval(cmdmap, "id", cmd_id)) {
12258 if (cmd_id < 0) {
12259 ss << "invalid osd id value '" << cmd_id << "'";
12260 err = -EINVAL;
12261 goto reply;
12262 }
12263 dout(10) << " osd create got id " << cmd_id << dendl;
12264 }
12265
12266 uuid_d uuid;
12267 string uuidstr;
12268 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12269 if (!uuid.parse(uuidstr.c_str())) {
12270 ss << "invalid uuid value '" << uuidstr << "'";
12271 err = -EINVAL;
12272 goto reply;
12273 }
12274 // we only care about the id if we also have the uuid, to
12275 // ensure the operation's idempotency.
12276 id = cmd_id;
12277 }
12278
12279 int32_t new_id = -1;
12280 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12281 if (err < 0) {
12282 if (err == -EAGAIN) {
12283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12284 return true;
12285 }
12286 // a check has failed; reply to the user.
12287 goto reply;
12288
12289 } else if (err == EEXIST) {
12290 // this is an idempotent operation; we can go ahead and reply.
12291 if (f) {
12292 f->open_object_section("created_osd");
12293 f->dump_int("osdid", new_id);
12294 f->close_section();
12295 f->flush(rdata);
12296 } else {
12297 ss << new_id;
12298 rdata.append(ss);
12299 }
12300 err = 0;
12301 goto reply;
12302 }
12303
12304 string empty_device_class;
12305 do_osd_create(id, uuid, empty_device_class, &new_id);
12306
12307 if (f) {
12308 f->open_object_section("created_osd");
12309 f->dump_int("osdid", new_id);
12310 f->close_section();
12311 f->flush(rdata);
12312 } else {
12313 ss << new_id;
12314 rdata.append(ss);
12315 }
12316 wait_for_finished_proposal(op,
12317 new Monitor::C_Command(mon, op, 0, rs, rdata,
12318 get_last_committed() + 1));
12319 return true;
12320
12321 } else if (prefix == "osd blacklist clear") {
12322 pending_inc.new_blacklist.clear();
12323 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12324 osdmap.get_blacklist(&blacklist);
12325 for (const auto &entry : blacklist) {
12326 pending_inc.old_blacklist.push_back(entry.first);
12327 }
12328 ss << " removed all blacklist entries";
12329 getline(ss, rs);
12330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12331 get_last_committed() + 1));
12332 return true;
  } else if (prefix == "osd blacklist") {
    // Add or remove an entry in the OSDMap blacklist for a client address.
    string addrstr;
    cmd_getval(cmdmap, "addr", addrstr);
    entity_addr_t addr;
    if (!addr.parse(addrstr.c_str(), 0)) {
      ss << "unable to parse address " << addrstr;
      err = -EINVAL;
      goto reply;
    }
    else {
      if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
        // always blacklist type ANY
        addr.set_type(entity_addr_t::TYPE_ANY);
      } else {
        // Pre-nautilus clusters only understand legacy address types.
        addr.set_type(entity_addr_t::TYPE_LEGACY);
      }

      string blacklistop;
      cmd_getval(cmdmap, "blacklistop", blacklistop);
      if (blacklistop == "add") {
        utime_t expires = ceph_clock_now();
        double d;
        // default one hour
        cmd_getval(cmdmap, "expire", d,
          g_conf()->mon_osd_blacklist_default_expire);
        expires += d;

        pending_inc.new_blacklist[addr] = expires;

        {
          // cancel any pending un-blacklisting request too
          auto it = std::find(pending_inc.old_blacklist.begin(),
            pending_inc.old_blacklist.end(), addr);
          if (it != pending_inc.old_blacklist.end()) {
            pending_inc.old_blacklist.erase(it);
          }
        }

        ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      } else if (blacklistop == "rm") {
        // Removal must undo either a committed entry (old_blacklist) or a
        // not-yet-committed pending addition (new_blacklist).
        if (osdmap.is_blacklisted(addr) ||
            pending_inc.new_blacklist.count(addr)) {
          if (osdmap.is_blacklisted(addr))
            pending_inc.old_blacklist.push_back(addr);
          else
            pending_inc.new_blacklist.erase(addr);
          ss << "un-blacklisting " << addr;
          getline(ss, rs);
          wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                    get_last_committed() + 1));
          return true;
        }
        ss << addr << " isn't blacklisted";
        err = 0;
        goto reply;
      }
      // NOTE(review): a 'blacklistop' that is neither "add" nor "rm" falls
      // through here without setting err or replying -- presumably the
      // command schema restricts the value, or a generic handler after the
      // dispatch chain replies; confirm against the command table.
    }
12394 } else if (prefix == "osd pool mksnap") {
12395 string poolstr;
12396 cmd_getval(cmdmap, "pool", poolstr);
12397 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12398 if (pool < 0) {
12399 ss << "unrecognized pool '" << poolstr << "'";
12400 err = -ENOENT;
12401 goto reply;
12402 }
12403 string snapname;
12404 cmd_getval(cmdmap, "snap", snapname);
12405 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12406 if (p->is_unmanaged_snaps_mode()) {
12407 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12408 err = -EINVAL;
12409 goto reply;
12410 } else if (p->snap_exists(snapname.c_str())) {
12411 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12412 err = 0;
12413 goto reply;
12414 } else if (p->is_tier()) {
12415 ss << "pool " << poolstr << " is a cache tier";
12416 err = -EINVAL;
12417 goto reply;
12418 }
12419 pg_pool_t *pp = 0;
12420 if (pending_inc.new_pools.count(pool))
12421 pp = &pending_inc.new_pools[pool];
12422 if (!pp) {
12423 pp = &pending_inc.new_pools[pool];
12424 *pp = *p;
12425 }
12426 if (pp->snap_exists(snapname.c_str())) {
12427 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12428 } else {
12429 pp->add_snap(snapname.c_str(), ceph_clock_now());
12430 pp->set_snap_epoch(pending_inc.epoch);
12431 ss << "created pool " << poolstr << " snap " << snapname;
12432 }
12433 getline(ss, rs);
12434 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12435 get_last_committed() + 1));
12436 return true;
12437 } else if (prefix == "osd pool rmsnap") {
12438 string poolstr;
12439 cmd_getval(cmdmap, "pool", poolstr);
12440 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12441 if (pool < 0) {
12442 ss << "unrecognized pool '" << poolstr << "'";
12443 err = -ENOENT;
12444 goto reply;
12445 }
12446 string snapname;
12447 cmd_getval(cmdmap, "snap", snapname);
12448 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12449 if (p->is_unmanaged_snaps_mode()) {
12450 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12451 err = -EINVAL;
12452 goto reply;
12453 } else if (!p->snap_exists(snapname.c_str())) {
12454 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12455 err = 0;
12456 goto reply;
12457 }
12458 pg_pool_t *pp = 0;
12459 if (pending_inc.new_pools.count(pool))
12460 pp = &pending_inc.new_pools[pool];
12461 if (!pp) {
12462 pp = &pending_inc.new_pools[pool];
12463 *pp = *p;
12464 }
12465 snapid_t sn = pp->snap_exists(snapname.c_str());
12466 if (sn) {
12467 pp->remove_snap(sn);
12468 pp->set_snap_epoch(pending_inc.epoch);
12469 ss << "removed pool " << poolstr << " snap " << snapname;
12470 } else {
12471 ss << "already removed pool " << poolstr << " snap " << snapname;
12472 }
12473 getline(ss, rs);
12474 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12475 get_last_committed() + 1));
12476 return true;
12477 } else if (prefix == "osd pool create") {
12478 int64_t pg_num, pg_num_min;
12479 int64_t pgp_num;
12480 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12481 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12482 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12483
12484 string pool_type_str;
12485 cmd_getval(cmdmap, "pool_type", pool_type_str);
12486 if (pool_type_str.empty())
12487 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12488
12489 string poolstr;
12490 cmd_getval(cmdmap, "pool", poolstr);
12491 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12492 if (pool_id >= 0) {
12493 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12494 if (pool_type_str != p->get_type_name()) {
12495 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12496 err = -EINVAL;
12497 } else {
12498 ss << "pool '" << poolstr << "' already exists";
12499 err = 0;
12500 }
12501 goto reply;
12502 }
12503
12504 int pool_type;
12505 if (pool_type_str == "replicated") {
12506 pool_type = pg_pool_t::TYPE_REPLICATED;
12507 } else if (pool_type_str == "erasure") {
12508 pool_type = pg_pool_t::TYPE_ERASURE;
12509 } else {
12510 ss << "unknown pool type '" << pool_type_str << "'";
12511 err = -EINVAL;
12512 goto reply;
12513 }
12514
12515 bool implicit_rule_creation = false;
12516 int64_t expected_num_objects = 0;
12517 string rule_name;
12518 cmd_getval(cmdmap, "rule", rule_name);
12519 string erasure_code_profile;
12520 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12521
12522 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12523 if (erasure_code_profile == "")
12524 erasure_code_profile = "default";
12525 //handle the erasure code profile
12526 if (erasure_code_profile == "default") {
12527 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12528 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12529 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12530 goto wait;
12531 }
12532
12533 map<string,string> profile_map;
12534 err = osdmap.get_erasure_code_profile_default(cct,
12535 profile_map,
12536 &ss);
12537 if (err)
12538 goto reply;
12539 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12540 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12541 goto wait;
12542 }
12543 }
12544 if (rule_name == "") {
12545 implicit_rule_creation = true;
12546 if (erasure_code_profile == "default") {
12547 rule_name = "erasure-code";
12548 } else {
12549 dout(1) << "implicitly use rule named after the pool: "
12550 << poolstr << dendl;
12551 rule_name = poolstr;
12552 }
12553 }
12554 cmd_getval(cmdmap, "expected_num_objects",
12555 expected_num_objects, int64_t(0));
12556 } else {
12557 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12558 // and put expected_num_objects to rule field
12559 if (erasure_code_profile != "") { // cmd is from CLI
12560 if (rule_name != "") {
12561 string interr;
12562 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12563 if (interr.length()) {
12564 ss << "error parsing integer value '" << rule_name << "': " << interr;
12565 err = -EINVAL;
12566 goto reply;
12567 }
12568 }
12569 rule_name = erasure_code_profile;
12570 } else { // cmd is well-formed
12571 cmd_getval(cmdmap, "expected_num_objects",
12572 expected_num_objects, int64_t(0));
12573 }
12574 }
12575
12576 if (!implicit_rule_creation && rule_name != "") {
12577 int rule;
12578 err = get_crush_rule(rule_name, &rule, &ss);
12579 if (err == -EAGAIN) {
12580 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12581 return true;
12582 }
12583 if (err)
12584 goto reply;
12585 }
12586
12587 if (expected_num_objects < 0) {
12588 ss << "'expected_num_objects' must be non-negative";
12589 err = -EINVAL;
12590 goto reply;
12591 }
12592
12593 set<int32_t> osds;
12594 osdmap.get_all_osds(osds);
12595 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12596 string type;
12597 if (!get_osd_objectstore_type(osd, &type)) {
12598 return type == "filestore";
12599 } else {
12600 return false;
12601 }
12602 });
12603
12604 if (has_filestore_osd &&
12605 expected_num_objects > 0 &&
12606 cct->_conf->filestore_merge_threshold > 0) {
12607 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12608 err = -EINVAL;
12609 goto reply;
12610 }
12611
12612 if (has_filestore_osd &&
12613 expected_num_objects == 0 &&
12614 cct->_conf->filestore_merge_threshold < 0) {
12615 int osds = osdmap.get_num_osds();
12616 bool sure = false;
12617 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12618 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12619 ss << "For better initial performance on pools expected to store a "
12620 << "large number of objects, consider supplying the "
12621 << "expected_num_objects parameter when creating the pool."
12622 << " Pass --yes-i-really-mean-it to ignore it";
12623 err = -EPERM;
12624 goto reply;
12625 }
12626 }
12627
12628 int64_t fast_read_param;
12629 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12630 FastReadType fast_read = FAST_READ_DEFAULT;
12631 if (fast_read_param == 0)
12632 fast_read = FAST_READ_OFF;
12633 else if (fast_read_param > 0)
12634 fast_read = FAST_READ_ON;
12635
12636 int64_t repl_size = 0;
12637 cmd_getval(cmdmap, "size", repl_size);
12638 int64_t target_size_bytes = 0;
12639 double target_size_ratio = 0.0;
12640 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12641 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12642
12643 string pg_autoscale_mode;
12644 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12645
12646 err = prepare_new_pool(poolstr,
12647 -1, // default crush rule
12648 rule_name,
12649 pg_num, pgp_num, pg_num_min,
12650 repl_size, target_size_bytes, target_size_ratio,
12651 erasure_code_profile, pool_type,
12652 (uint64_t)expected_num_objects,
12653 fast_read,
12654 pg_autoscale_mode,
12655 &ss);
12656 if (err < 0) {
12657 switch(err) {
12658 case -EEXIST:
12659 ss << "pool '" << poolstr << "' already exists";
12660 break;
12661 case -EAGAIN:
12662 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12663 return true;
12664 case -ERANGE:
12665 goto reply;
12666 default:
12667 goto reply;
12668 break;
12669 }
12670 } else {
12671 ss << "pool '" << poolstr << "' created";
12672 }
12673 getline(ss, rs);
12674 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12675 get_last_committed() + 1));
12676 return true;
12677
12678 } else if (prefix == "osd pool delete" ||
12679 prefix == "osd pool rm") {
12680 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12681 string poolstr, poolstr2, sure;
12682 cmd_getval(cmdmap, "pool", poolstr);
12683 cmd_getval(cmdmap, "pool2", poolstr2);
12684 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12685 if (pool < 0) {
12686 ss << "pool '" << poolstr << "' does not exist";
12687 err = 0;
12688 goto reply;
12689 }
12690
12691 bool force_no_fake = false;
12692 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12693 bool force = false;
12694 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12695 if (poolstr2 != poolstr ||
12696 (!force && !force_no_fake)) {
12697 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12698 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12699 << "followed by --yes-i-really-really-mean-it.";
12700 err = -EPERM;
12701 goto reply;
12702 }
12703 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12704 if (err == -EAGAIN) {
12705 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12706 return true;
12707 }
12708 if (err < 0)
12709 goto reply;
12710 goto update;
12711 } else if (prefix == "osd pool rename") {
12712 string srcpoolstr, destpoolstr;
12713 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12714 cmd_getval(cmdmap, "destpool", destpoolstr);
12715 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12716 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12717
12718 if (pool_src < 0) {
12719 if (pool_dst >= 0) {
12720 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12721 // of operations, assume this rename succeeded, as it is not changing
12722 // the current state. Make sure we output something understandable
12723 // for whoever is issuing the command, if they are paying attention,
12724 // in case it was not intentional; or to avoid a "wtf?" and a bug
12725 // report in case it was intentional, while expecting a failure.
12726 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12727 << destpoolstr << "' does -- assuming successful rename";
12728 err = 0;
12729 } else {
12730 ss << "unrecognized pool '" << srcpoolstr << "'";
12731 err = -ENOENT;
12732 }
12733 goto reply;
12734 } else if (pool_dst >= 0) {
12735 // source pool exists and so does the destination pool
12736 ss << "pool '" << destpoolstr << "' already exists";
12737 err = -EEXIST;
12738 goto reply;
12739 }
12740
12741 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12742 if (ret == 0) {
12743 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12744 } else {
12745 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12746 << cpp_strerror(ret);
12747 }
12748 getline(ss, rs);
12749 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12750 get_last_committed() + 1));
12751 return true;
12752
12753 } else if (prefix == "osd pool set") {
12754 err = prepare_command_pool_set(cmdmap, ss);
12755 if (err == -EAGAIN)
12756 goto wait;
12757 if (err < 0)
12758 goto reply;
12759
12760 getline(ss, rs);
12761 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12762 get_last_committed() + 1));
12763 return true;
12764 } else if (prefix == "osd tier add") {
12765 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12766 if (err == -EAGAIN)
12767 goto wait;
12768 if (err)
12769 goto reply;
12770 string poolstr;
12771 cmd_getval(cmdmap, "pool", poolstr);
12772 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12773 if (pool_id < 0) {
12774 ss << "unrecognized pool '" << poolstr << "'";
12775 err = -ENOENT;
12776 goto reply;
12777 }
12778 string tierpoolstr;
12779 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12780 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12781 if (tierpool_id < 0) {
12782 ss << "unrecognized pool '" << tierpoolstr << "'";
12783 err = -ENOENT;
12784 goto reply;
12785 }
12786 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12787 ceph_assert(p);
12788 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12789 ceph_assert(tp);
12790
12791 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12792 goto reply;
12793 }
12794
12795 // make sure new tier is empty
12796 string force_nonempty;
12797 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12798 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12799 if (pstats && pstats->stats.sum.num_objects != 0 &&
12800 force_nonempty != "--force-nonempty") {
12801 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12802 err = -ENOTEMPTY;
12803 goto reply;
12804 }
12805 if (tp->is_erasure()) {
12806 ss << "tier pool '" << tierpoolstr
12807 << "' is an ec pool, which cannot be a tier";
12808 err = -ENOTSUP;
12809 goto reply;
12810 }
12811 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12812 ((force_nonempty != "--force-nonempty") ||
12813 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12814 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12815 err = -ENOTEMPTY;
12816 goto reply;
12817 }
12818 // go
12819 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12820 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12821 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12822 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12823 return true;
12824 }
12825 np->tiers.insert(tierpool_id);
12826 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12827 ntp->tier_of = pool_id;
12828 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12829 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12830 get_last_committed() + 1));
12831 return true;
12832 } else if (prefix == "osd tier remove" ||
12833 prefix == "osd tier rm") {
12834 string poolstr;
12835 cmd_getval(cmdmap, "pool", poolstr);
12836 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12837 if (pool_id < 0) {
12838 ss << "unrecognized pool '" << poolstr << "'";
12839 err = -ENOENT;
12840 goto reply;
12841 }
12842 string tierpoolstr;
12843 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12844 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12845 if (tierpool_id < 0) {
12846 ss << "unrecognized pool '" << tierpoolstr << "'";
12847 err = -ENOENT;
12848 goto reply;
12849 }
12850 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12851 ceph_assert(p);
12852 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12853 ceph_assert(tp);
12854
12855 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12856 goto reply;
12857 }
12858
12859 if (p->tiers.count(tierpool_id) == 0) {
12860 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12861 err = 0;
12862 goto reply;
12863 }
12864 if (tp->tier_of != pool_id) {
12865 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12866 << osdmap.get_pool_name(tp->tier_of) << "': "
12867 // be scary about it; this is an inconsistency and bells must go off
12868 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12869 err = -EINVAL;
12870 goto reply;
12871 }
12872 if (p->read_tier == tierpool_id) {
12873 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12874 err = -EBUSY;
12875 goto reply;
12876 }
12877 // go
12878 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12879 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12880 if (np->tiers.count(tierpool_id) == 0 ||
12881 ntp->tier_of != pool_id ||
12882 np->read_tier == tierpool_id) {
12883 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12884 return true;
12885 }
12886 np->tiers.erase(tierpool_id);
12887 ntp->clear_tier();
12888 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12889 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12890 get_last_committed() + 1));
12891 return true;
12892 } else if (prefix == "osd tier set-overlay") {
12893 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12894 if (err == -EAGAIN)
12895 goto wait;
12896 if (err)
12897 goto reply;
12898 string poolstr;
12899 cmd_getval(cmdmap, "pool", poolstr);
12900 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12901 if (pool_id < 0) {
12902 ss << "unrecognized pool '" << poolstr << "'";
12903 err = -ENOENT;
12904 goto reply;
12905 }
12906 string overlaypoolstr;
12907 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12908 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12909 if (overlaypool_id < 0) {
12910 ss << "unrecognized pool '" << overlaypoolstr << "'";
12911 err = -ENOENT;
12912 goto reply;
12913 }
12914 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12915 ceph_assert(p);
12916 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12917 ceph_assert(overlay_p);
12918 if (p->tiers.count(overlaypool_id) == 0) {
12919 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12920 err = -EINVAL;
12921 goto reply;
12922 }
12923 if (p->read_tier == overlaypool_id) {
12924 err = 0;
12925 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12926 goto reply;
12927 }
12928 if (p->has_read_tier()) {
12929 ss << "pool '" << poolstr << "' has overlay '"
12930 << osdmap.get_pool_name(p->read_tier)
12931 << "'; please remove-overlay first";
12932 err = -EINVAL;
12933 goto reply;
12934 }
12935
12936 // go
12937 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12938 np->read_tier = overlaypool_id;
12939 np->write_tier = overlaypool_id;
12940 np->set_last_force_op_resend(pending_inc.epoch);
12941 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12942 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12943 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12944 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12945 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12946 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12947 get_last_committed() + 1));
12948 return true;
12949 } else if (prefix == "osd tier remove-overlay" ||
12950 prefix == "osd tier rm-overlay") {
12951 string poolstr;
12952 cmd_getval(cmdmap, "pool", poolstr);
12953 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12954 if (pool_id < 0) {
12955 ss << "unrecognized pool '" << poolstr << "'";
12956 err = -ENOENT;
12957 goto reply;
12958 }
12959 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12960 ceph_assert(p);
12961 if (!p->has_read_tier()) {
12962 err = 0;
12963 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12964 goto reply;
12965 }
12966
12967 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12968 goto reply;
12969 }
12970
12971 // go
12972 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12973 if (np->has_read_tier()) {
12974 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12975 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12976 nop->set_last_force_op_resend(pending_inc.epoch);
12977 }
12978 if (np->has_write_tier()) {
12979 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12980 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12981 nop->set_last_force_op_resend(pending_inc.epoch);
12982 }
12983 np->clear_read_tier();
12984 np->clear_write_tier();
12985 np->set_last_force_op_resend(pending_inc.epoch);
12986 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12987 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12988 get_last_committed() + 1));
12989 return true;
12990 } else if (prefix == "osd tier cache-mode") {
12991 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12992 if (err == -EAGAIN)
12993 goto wait;
12994 if (err)
12995 goto reply;
12996 string poolstr;
12997 cmd_getval(cmdmap, "pool", poolstr);
12998 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12999 if (pool_id < 0) {
13000 ss << "unrecognized pool '" << poolstr << "'";
13001 err = -ENOENT;
13002 goto reply;
13003 }
13004 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13005 ceph_assert(p);
13006 if (!p->is_tier()) {
13007 ss << "pool '" << poolstr << "' is not a tier";
13008 err = -EINVAL;
13009 goto reply;
13010 }
13011 string modestr;
13012 cmd_getval(cmdmap, "mode", modestr);
13013 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13014 if (int(mode) < 0) {
13015 ss << "'" << modestr << "' is not a valid cache mode";
13016 err = -EINVAL;
13017 goto reply;
13018 }
13019
13020 bool sure = false;
13021 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13022
13023 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13024 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13025 ss << "'" << modestr << "' is no longer a supported cache mode";
13026 err = -EPERM;
13027 goto reply;
13028 }
13029 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13030 mode != pg_pool_t::CACHEMODE_NONE &&
13031 mode != pg_pool_t::CACHEMODE_PROXY &&
13032 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13033 !sure) {
13034 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13035 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13036 err = -EPERM;
13037 goto reply;
13038 }
13039
13040 // pool already has this cache-mode set and there are no pending changes
13041 if (p->cache_mode == mode &&
13042 (pending_inc.new_pools.count(pool_id) == 0 ||
13043 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13044 ss << "set cache-mode for pool '" << poolstr << "'"
13045 << " to " << pg_pool_t::get_cache_mode_name(mode);
13046 err = 0;
13047 goto reply;
13048 }
13049
13050 /* Mode description:
13051 *
13052 * none: No cache-mode defined
13053 * forward: Forward all reads and writes to base pool [removed]
13054 * writeback: Cache writes, promote reads from base pool
13055 * readonly: Forward writes to base pool
13056 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13057 * proxy: Proxy all reads and writes to base pool
13058 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13059 *
13060 * Hence, these are the allowed transitions:
13061 *
13062 * none -> any
13063 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13064 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13065 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13066 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13067 * writeback -> readproxy || proxy
13068 * readonly -> any
13069 */
13070
13071 // We check if the transition is valid against the current pool mode, as
13072 // it is the only committed state thus far. We will blantly squash
13073 // whatever mode is on the pending state.
13074
13075 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13076 (mode != pg_pool_t::CACHEMODE_PROXY &&
13077 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13078 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13079 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13080 << "' pool; only '"
13081 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13082 << "' allowed.";
13083 err = -EINVAL;
13084 goto reply;
13085 }
13086 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13087 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13088 mode != pg_pool_t::CACHEMODE_PROXY &&
13089 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13090
13091 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13092 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13093 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13094
13095 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13096 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13097 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13098
13099 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13100 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13101 mode != pg_pool_t::CACHEMODE_PROXY &&
13102 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13103
13104 const pool_stat_t* pstats =
13105 mon->mgrstatmon()->get_pool_stat(pool_id);
13106
13107 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13108 ss << "unable to set cache-mode '"
13109 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13110 << "': dirty objects found";
13111 err = -EBUSY;
13112 goto reply;
13113 }
13114 }
13115 // go
13116 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13117 np->cache_mode = mode;
13118 // set this both when moving to and from cache_mode NONE. this is to
13119 // capture legacy pools that were set up before this flag existed.
13120 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13121 ss << "set cache-mode for pool '" << poolstr
13122 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13123 if (mode == pg_pool_t::CACHEMODE_NONE) {
13124 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13125 ceph_assert(base_pool);
13126 if (base_pool->read_tier == pool_id ||
13127 base_pool->write_tier == pool_id)
13128 ss <<" (WARNING: pool is still configured as read or write tier)";
13129 }
13130 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13131 get_last_committed() + 1));
13132 return true;
13133 } else if (prefix == "osd tier add-cache") {
13134 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13135 if (err == -EAGAIN)
13136 goto wait;
13137 if (err)
13138 goto reply;
13139 string poolstr;
13140 cmd_getval(cmdmap, "pool", poolstr);
13141 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13142 if (pool_id < 0) {
13143 ss << "unrecognized pool '" << poolstr << "'";
13144 err = -ENOENT;
13145 goto reply;
13146 }
13147 string tierpoolstr;
13148 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13149 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13150 if (tierpool_id < 0) {
13151 ss << "unrecognized pool '" << tierpoolstr << "'";
13152 err = -ENOENT;
13153 goto reply;
13154 }
13155 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13156 ceph_assert(p);
13157 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13158 ceph_assert(tp);
13159
13160 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13161 goto reply;
13162 }
13163
13164 int64_t size = 0;
13165 if (!cmd_getval(cmdmap, "size", size)) {
13166 ss << "unable to parse 'size' value '"
13167 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13168 err = -EINVAL;
13169 goto reply;
13170 }
13171 // make sure new tier is empty
13172 const pool_stat_t *pstats =
13173 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13174 if (pstats && pstats->stats.sum.num_objects != 0) {
13175 ss << "tier pool '" << tierpoolstr << "' is not empty";
13176 err = -ENOTEMPTY;
13177 goto reply;
13178 }
13179 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13180 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13181 if (int(mode) < 0) {
13182 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13183 err = -EINVAL;
13184 goto reply;
13185 }
13186 HitSet::Params hsp;
13187 auto& cache_hit_set_type =
13188 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13189 if (cache_hit_set_type == "bloom") {
13190 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13191 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13192 hsp = HitSet::Params(bsp);
13193 } else if (cache_hit_set_type == "explicit_hash") {
13194 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13195 } else if (cache_hit_set_type == "explicit_object") {
13196 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13197 } else {
13198 ss << "osd tier cache default hit set type '"
13199 << cache_hit_set_type << "' is not a known type";
13200 err = -EINVAL;
13201 goto reply;
13202 }
13203 // go
13204 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13205 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13206 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13207 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13208 return true;
13209 }
13210 np->tiers.insert(tierpool_id);
13211 np->read_tier = np->write_tier = tierpool_id;
13212 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13213 np->set_last_force_op_resend(pending_inc.epoch);
13214 ntp->set_last_force_op_resend(pending_inc.epoch);
13215 ntp->tier_of = pool_id;
13216 ntp->cache_mode = mode;
13217 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13218 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13219 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13220 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13221 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13222 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13223 ntp->hit_set_params = hsp;
13224 ntp->target_max_bytes = size;
13225 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13226 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13227 get_last_committed() + 1));
13228 return true;
13229 } else if (prefix == "osd pool set-quota") {
13230 string poolstr;
13231 cmd_getval(cmdmap, "pool", poolstr);
13232 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13233 if (pool_id < 0) {
13234 ss << "unrecognized pool '" << poolstr << "'";
13235 err = -ENOENT;
13236 goto reply;
13237 }
13238
13239 string field;
13240 cmd_getval(cmdmap, "field", field);
13241 if (field != "max_objects" && field != "max_bytes") {
13242 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13243 err = -EINVAL;
13244 goto reply;
13245 }
13246
13247 // val could contain unit designations, so we treat as a string
13248 string val;
13249 cmd_getval(cmdmap, "val", val);
13250 string tss;
13251 int64_t value;
13252 if (field == "max_objects") {
13253 value = strict_sistrtoll(val.c_str(), &tss);
13254 } else if (field == "max_bytes") {
13255 value = strict_iecstrtoll(val.c_str(), &tss);
13256 } else {
13257 ceph_abort_msg("unrecognized option");
13258 }
13259 if (!tss.empty()) {
13260 ss << "error parsing value '" << val << "': " << tss;
13261 err = -EINVAL;
13262 goto reply;
13263 }
13264
13265 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13266 if (field == "max_objects") {
13267 pi->quota_max_objects = value;
13268 } else if (field == "max_bytes") {
13269 pi->quota_max_bytes = value;
13270 } else {
13271 ceph_abort_msg("unrecognized option");
13272 }
13273 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13274 rs = ss.str();
13275 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13276 get_last_committed() + 1));
13277 return true;
13278 } else if (prefix == "osd pool application enable" ||
13279 prefix == "osd pool application disable" ||
13280 prefix == "osd pool application set" ||
13281 prefix == "osd pool application rm") {
13282 err = prepare_command_pool_application(prefix, cmdmap, ss);
13283 if (err == -EAGAIN) {
13284 goto wait;
13285 } else if (err < 0) {
13286 goto reply;
13287 } else {
13288 goto update;
13289 }
13290 } else if (prefix == "osd force-create-pg") {
13291 pg_t pgid;
13292 string pgidstr;
13293 cmd_getval(cmdmap, "pgid", pgidstr);
13294 if (!pgid.parse(pgidstr.c_str())) {
13295 ss << "invalid pgid '" << pgidstr << "'";
13296 err = -EINVAL;
13297 goto reply;
13298 }
13299 if (!osdmap.pg_exists(pgid)) {
13300 ss << "pg " << pgid << " should not exist";
13301 err = -ENOENT;
13302 goto reply;
13303 }
13304 bool sure = false;
13305 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13306 if (!sure) {
13307 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13308 << "that the cluster will give up ever trying to recover the lost data. Do this "
13309 << "only if you are certain that all copies of the PG are in fact lost and you are "
13310 << "willing to accept that the data is permanently destroyed. Pass "
13311 << "--yes-i-really-mean-it to proceed.";
13312 err = -EPERM;
13313 goto reply;
13314 }
13315 bool creating_now;
13316 {
13317 std::lock_guard<std::mutex> l(creating_pgs_lock);
13318 auto emplaced = creating_pgs.pgs.emplace(
13319 pgid,
13320 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13321 ceph_clock_now()));
13322 creating_now = emplaced.second;
13323 }
13324 if (creating_now) {
13325 ss << "pg " << pgidstr << " now creating, ok";
13326 // set the pool's CREATING flag so that (1) the osd won't ignore our
13327 // create message and (2) we won't propose any future pg_num changes
13328 // until after the PG has been instantiated.
13329 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13330 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13331 }
13332 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13333 err = 0;
13334 goto update;
13335 } else {
13336 ss << "pg " << pgid << " already creating";
13337 err = 0;
13338 goto reply;
13339 }
13340 } else {
13341 err = -EINVAL;
13342 }
13343
13344 reply:
13345 getline(ss, rs);
13346 if (err < 0 && rs.length() == 0)
13347 rs = cpp_strerror(err);
13348 mon->reply_command(op, err, rs, rdata, get_last_committed());
13349 return ret;
13350
13351 update:
13352 getline(ss, rs);
13353 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13354 get_last_committed() + 1));
13355 return true;
13356
13357 wait:
13358 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13359 return true;
13360 }
13361
13362 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13363 {
13364 op->mark_osdmon_event(__func__);
13365
13366 auto m = op->get_req<MPoolOp>();
13367 MonSession *session = op->get_session();
13368 if (!session) {
13369 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13370 return true;
13371 }
13372
13373 switch (m->op) {
13374 case POOL_OP_CREATE_UNMANAGED_SNAP:
13375 case POOL_OP_DELETE_UNMANAGED_SNAP:
13376 {
13377 const std::string* pool_name = nullptr;
13378 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13379 if (pg_pool != nullptr) {
13380 pool_name = &osdmap.get_pool_name(m->pool);
13381 }
13382
13383 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13384 session->entity_name, session->caps,
13385 session->get_peer_socket_addr(),
13386 pool_name)) {
13387 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13388 << "privileges. message: " << *m << std::endl
13389 << "caps: " << session->caps << dendl;
13390 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13391 return true;
13392 }
13393 }
13394 break;
13395 default:
13396 if (!session->is_capable("osd", MON_CAP_W)) {
13397 dout(0) << "got pool op from entity with insufficient privileges. "
13398 << "message: " << *m << std::endl
13399 << "caps: " << session->caps << dendl;
13400 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13401 return true;
13402 }
13403 break;
13404 }
13405
13406 return false;
13407 }
13408
// Read-only fast path for pool ops: answer requests that need no map
// change (cap failures, wrong fsid, already-satisfied/idempotent ops).
// Returns true when the op was fully handled (a reply was sent); false
// when it must proceed to prepare_pool_op() for a map update.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // reject senders without the required caps (reply already sent inside)
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop (with -EINVAL) messages that belong to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  // pool creation has its own idempotency check (by name, not id)
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // deleting an already-absent pool is a success (idempotent);
    // anything else on a missing pool is -ENOENT
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  // For each op: answer immediately if it is invalid for this pool's
  // snap mode or already satisfied; otherwise return false so
  // prepare_pool_op() can propose the change.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are incompatible with unmanaged-snaps mode and tiers
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already there: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // the two snap modes are mutually exclusive
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // if the named pool still exists it needs a real delete; the
    // "already gone" case was handled above when lookup failed by id.
    // NOTE(review): this replies success when *some* pool with this
    // name exists -- presumably matching historical semantics; confirm.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // handled (rejected) in prepare_pool_op
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13496
13497 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13498 {
13499 if (!osdmap.have_pg_pool(pool)) {
13500 dout(10) << __func__ << " pool " << pool << " snap " << snap
13501 << " - pool dne" << dendl;
13502 return true;
13503 }
13504 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13505 dout(10) << __func__ << " pool " << pool << " snap " << snap
13506 << " - in osdmap removed_snaps_queue" << dendl;
13507 return true;
13508 }
13509 snapid_t begin, end;
13510 int r = lookup_purged_snap(pool, snap, &begin, &end);
13511 if (r == 0) {
13512 dout(10) << __func__ << " pool " << pool << " snap " << snap
13513 << " - purged, [" << begin << "," << end << ")" << dendl;
13514 return true;
13515 }
13516 return false;
13517 }
13518
13519 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13520 {
13521 if (pending_inc.old_pools.count(pool)) {
13522 dout(10) << __func__ << " pool " << pool << " snap " << snap
13523 << " - pool pending deletion" << dendl;
13524 return true;
13525 }
13526 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13527 dout(10) << __func__ << " pool " << pool << " snap " << snap
13528 << " - in pending new_removed_snaps" << dendl;
13529 return true;
13530 }
13531 return false;
13532 }
13533
13534 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13535 {
13536 op->mark_osdmon_event(__func__);
13537 auto m = op->get_req<MPoolOp>();
13538 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13539 if (pool >= 0) {
13540 _pool_op_reply(op, 0, osdmap.get_epoch());
13541 return true;
13542 }
13543
13544 return false;
13545 }
13546
// Apply a pool op that changes the osdmap: stage the change in
// pending_inc and defer the client reply until the proposal commits.
// Returns true when a proposal is pending (reply deferred), false when
// the op was answered immediately without a map change.
//
// Validation happens twice on purpose: first against the *committed*
// pool state, then against the *projected* state (committed state plus
// any changes already staged in pending_inc for this pool).
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // Pass 1: validate against the committed pool state. Note the
  // intentional fall-throughs between related cases below.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps may not be created on a cache tier
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is an
      // idempotent success; otherwise fall out and do the work
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-snap op on an unmanaged-snaps pool: modes are exclusive
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // Pass 2: re-validate the snap-mode exclusivity against the
  // projected state, in case a pending change already flipped the mode.
  // pool snaps vs unmanaged snaps are mutually exclusive
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Perform the actual mutation on the projected pool copy; `changed`
  // tracks whether anything needs to be staged in pending_inc.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus releases need the legacy removed_snaps bookkeeping
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is returned to the client in the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // skip if the snap is already removed (committed) or pending removal
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's snap_seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always rejected
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the modified pool in the pending map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any payload) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13701
13702 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13703 {
13704 op->mark_osdmon_event(__func__);
13705 int err = prepare_new_pool(op);
13706 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13707 return true;
13708 }
13709
13710 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13711 ostream *ss)
13712 {
13713 const string& poolstr = osdmap.get_pool_name(pool_id);
13714
13715 // If the Pool is in use by CephFS, refuse to delete it
13716 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13717 if (pending_fsmap.pool_in_use(pool_id)) {
13718 *ss << "pool '" << poolstr << "' is in use by CephFS";
13719 return -EBUSY;
13720 }
13721
13722 if (pool.tier_of >= 0) {
13723 *ss << "pool '" << poolstr << "' is a tier of '"
13724 << osdmap.get_pool_name(pool.tier_of) << "'";
13725 return -EBUSY;
13726 }
13727 if (!pool.tiers.empty()) {
13728 *ss << "pool '" << poolstr << "' has tiers";
13729 for(auto tier : pool.tiers) {
13730 *ss << " " << osdmap.get_pool_name(tier);
13731 }
13732 return -EBUSY;
13733 }
13734
13735 if (!g_conf()->mon_allow_pool_delete) {
13736 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13737 return -EPERM;
13738 }
13739
13740 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13741 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13742 return -EPERM;
13743 }
13744
13745 *ss << "pool '" << poolstr << "' removed";
13746 return 0;
13747 }
13748
13749 /**
13750 * Check if it is safe to add a tier to a base pool
13751 *
13752 * @return
13753 * True if the operation should proceed, false if we should abort here
13754 * (abort doesn't necessarily mean error, could be idempotency)
13755 */
13756 bool OSDMonitor::_check_become_tier(
13757 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13758 const int64_t base_pool_id, const pg_pool_t *base_pool,
13759 int *err,
13760 ostream *ss) const
13761 {
13762 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13763 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13764
13765 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13766 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13767 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13768 *err = -EBUSY;
13769 return false;
13770 }
13771
13772 if (base_pool->tiers.count(tier_pool_id)) {
13773 ceph_assert(tier_pool->tier_of == base_pool_id);
13774 *err = 0;
13775 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13776 << base_pool_name << "'";
13777 return false;
13778 }
13779
13780 if (base_pool->is_tier()) {
13781 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13782 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13783 << "multiple tiers are not yet supported.";
13784 *err = -EINVAL;
13785 return false;
13786 }
13787
13788 if (tier_pool->has_tiers()) {
13789 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13790 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13791 it != tier_pool->tiers.end(); ++it)
13792 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13793 *ss << " multiple tiers are not yet supported.";
13794 *err = -EINVAL;
13795 return false;
13796 }
13797
13798 if (tier_pool->is_tier()) {
13799 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13800 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13801 *err = -EINVAL;
13802 return false;
13803 }
13804
13805 *err = 0;
13806 return true;
13807 }
13808
13809
13810 /**
13811 * Check if it is safe to remove a tier from this base pool
13812 *
13813 * @return
13814 * True if the operation should proceed, false if we should abort here
13815 * (abort doesn't necessarily mean error, could be idempotency)
13816 */
13817 bool OSDMonitor::_check_remove_tier(
13818 const int64_t base_pool_id, const pg_pool_t *base_pool,
13819 const pg_pool_t *tier_pool,
13820 int *err, ostream *ss) const
13821 {
13822 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13823
13824 // Apply CephFS-specific checks
13825 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13826 if (pending_fsmap.pool_in_use(base_pool_id)) {
13827 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13828 // If the underlying pool is erasure coded and does not allow EC
13829 // overwrites, we can't permit the removal of the replicated tier that
13830 // CephFS relies on to access it
13831 *ss << "pool '" << base_pool_name <<
13832 "' does not allow EC overwrites and is in use by CephFS"
13833 " via its tier";
13834 *err = -EBUSY;
13835 return false;
13836 }
13837
13838 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13839 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13840 "tier is still in use as a writeback cache. Change the cache "
13841 "mode and flush the cache before removing it";
13842 *err = -EBUSY;
13843 return false;
13844 }
13845 }
13846
13847 *err = 0;
13848 return true;
13849 }
13850
13851 int OSDMonitor::_prepare_remove_pool(
13852 int64_t pool, ostream *ss, bool no_fake)
13853 {
13854 dout(10) << __func__ << " " << pool << dendl;
13855 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13856 int r = _check_remove_pool(pool, *p, ss);
13857 if (r < 0)
13858 return r;
13859
13860 auto new_pool = pending_inc.new_pools.find(pool);
13861 if (new_pool != pending_inc.new_pools.end()) {
13862 // if there is a problem with the pending info, wait and retry
13863 // this op.
13864 const auto& p = new_pool->second;
13865 int r = _check_remove_pool(pool, p, ss);
13866 if (r < 0)
13867 return -EAGAIN;
13868 }
13869
13870 if (pending_inc.old_pools.count(pool)) {
13871 dout(10) << __func__ << " " << pool << " already pending removal"
13872 << dendl;
13873 return 0;
13874 }
13875
13876 if (g_conf()->mon_fake_pool_delete && !no_fake) {
13877 string old_name = osdmap.get_pool_name(pool);
13878 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13879 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13880 << old_name << " -> " << new_name << dendl;
13881 pending_inc.new_pool_names[pool] = new_name;
13882 return 0;
13883 }
13884
13885 // remove
13886 pending_inc.old_pools.insert(pool);
13887
13888 // remove any pg_temp mappings for this pool
13889 for (auto p = osdmap.pg_temp->begin();
13890 p != osdmap.pg_temp->end();
13891 ++p) {
13892 if (p->first.pool() == pool) {
13893 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
13894 << p->first << dendl;
13895 pending_inc.new_pg_temp[p->first].clear();
13896 }
13897 }
13898 // remove any primary_temp mappings for this pool
13899 for (auto p = osdmap.primary_temp->begin();
13900 p != osdmap.primary_temp->end();
13901 ++p) {
13902 if (p->first.pool() == pool) {
13903 dout(10) << __func__ << " " << pool
13904 << " removing obsolete primary_temp" << p->first << dendl;
13905 pending_inc.new_primary_temp[p->first] = -1;
13906 }
13907 }
13908 // remove any pg_upmap mappings for this pool
13909 for (auto& p : osdmap.pg_upmap) {
13910 if (p.first.pool() == pool) {
13911 dout(10) << __func__ << " " << pool
13912 << " removing obsolete pg_upmap "
13913 << p.first << dendl;
13914 pending_inc.old_pg_upmap.insert(p.first);
13915 }
13916 }
13917 // remove any pending pg_upmap mappings for this pool
13918 {
13919 auto it = pending_inc.new_pg_upmap.begin();
13920 while (it != pending_inc.new_pg_upmap.end()) {
13921 if (it->first.pool() == pool) {
13922 dout(10) << __func__ << " " << pool
13923 << " removing pending pg_upmap "
13924 << it->first << dendl;
13925 it = pending_inc.new_pg_upmap.erase(it);
13926 } else {
13927 it++;
13928 }
13929 }
13930 }
13931 // remove any pg_upmap_items mappings for this pool
13932 for (auto& p : osdmap.pg_upmap_items) {
13933 if (p.first.pool() == pool) {
13934 dout(10) << __func__ << " " << pool
13935 << " removing obsolete pg_upmap_items " << p.first
13936 << dendl;
13937 pending_inc.old_pg_upmap_items.insert(p.first);
13938 }
13939 }
13940 // remove any pending pg_upmap mappings for this pool
13941 {
13942 auto it = pending_inc.new_pg_upmap_items.begin();
13943 while (it != pending_inc.new_pg_upmap_items.end()) {
13944 if (it->first.pool() == pool) {
13945 dout(10) << __func__ << " " << pool
13946 << " removing pending pg_upmap_items "
13947 << it->first << dendl;
13948 it = pending_inc.new_pg_upmap_items.erase(it);
13949 } else {
13950 it++;
13951 }
13952 }
13953 }
13954
13955 // remove any choose_args for this pool
13956 CrushWrapper newcrush;
13957 _get_pending_crush(newcrush);
13958 if (newcrush.have_choose_args(pool)) {
13959 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13960 newcrush.rm_choose_args(pool);
13961 pending_inc.crush.clear();
13962 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13963 }
13964 return 0;
13965 }
13966
13967 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13968 {
13969 dout(10) << "_prepare_rename_pool " << pool << dendl;
13970 if (pending_inc.old_pools.count(pool)) {
13971 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13972 return -ENOENT;
13973 }
13974 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13975 p != pending_inc.new_pool_names.end();
13976 ++p) {
13977 if (p->second == newname && p->first != pool) {
13978 return -EEXIST;
13979 }
13980 }
13981
13982 pending_inc.new_pool_names[pool] = newname;
13983 return 0;
13984 }
13985
13986 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13987 {
13988 op->mark_osdmon_event(__func__);
13989 auto m = op->get_req<MPoolOp>();
13990 ostringstream ss;
13991 int ret = _prepare_remove_pool(m->pool, &ss, false);
13992 if (ret == -EAGAIN) {
13993 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13994 return true;
13995 }
13996 if (ret < 0)
13997 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13998 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13999 pending_inc.epoch));
14000 return true;
14001 }
14002
14003 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14004 int ret, epoch_t epoch, bufferlist *blp)
14005 {
14006 op->mark_osdmon_event(__func__);
14007 auto m = op->get_req<MPoolOp>();
14008 dout(20) << "_pool_op_reply " << ret << dendl;
14009 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14010 ret, epoch, get_last_committed(), blp);
14011 mon->send_reply(op, reply);
14012 }
14013
14014 void OSDMonitor::convert_pool_priorities(void)
14015 {
14016 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14017 int64_t max_prio = 0;
14018 int64_t min_prio = 0;
14019 for (const auto &i : osdmap.get_pools()) {
14020 const auto &pool = i.second;
14021
14022 if (pool.opts.is_set(key)) {
14023 int64_t prio = 0;
14024 pool.opts.get(key, &prio);
14025 if (prio > max_prio)
14026 max_prio = prio;
14027 if (prio < min_prio)
14028 min_prio = prio;
14029 }
14030 }
14031 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14032 dout(20) << __func__ << " nothing to fix" << dendl;
14033 return;
14034 }
14035 // Current pool priorities exceeds new maximum
14036 for (const auto &i : osdmap.get_pools()) {
14037 const auto pool_id = i.first;
14038 pg_pool_t pool = i.second;
14039
14040 int64_t prio = 0;
14041 pool.opts.get(key, &prio);
14042 int64_t n;
14043
14044 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14045 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14046 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14047 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14048 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14049 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14050 } else {
14051 continue;
14052 }
14053 if (n == 0) {
14054 pool.opts.unset(key);
14055 } else {
14056 pool.opts.set(key, static_cast<int64_t>(n));
14057 }
14058 dout(10) << __func__ << " pool " << pool_id
14059 << " recovery_priority adjusted "
14060 << prio << " to " << n << dendl;
14061 pool.last_change = pending_inc.epoch;
14062 pending_inc.new_pools[pool_id] = pool;
14063 }
14064 }