]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import ceph 15.2.14
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 #define dout_subsys ceph_subsys_mon
95 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
96 static const string OSD_METADATA_PREFIX("osd_metadata");
97 static const string OSD_SNAP_PREFIX("osd_snap");
98
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
118 - note that the {removed,purged}_snap put the last snap in they key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
133 struct OSDMemCache : public PriorityCache::PriCache {
134 OSDMonitor *osdmon;
135 int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
136 int64_t committed_bytes = 0;
137 double cache_ratio = 0;
138
139 OSDMemCache(OSDMonitor *m) : osdmon(m) {};
140
141 virtual uint64_t _get_used_bytes() const = 0;
142
143 virtual int64_t request_cache_bytes(
144 PriorityCache::Priority pri, uint64_t total_cache) const {
145 int64_t assigned = get_cache_bytes(pri);
146
147 switch (pri) {
148 // All cache items are currently set to have PRI1 priority
149 case PriorityCache::Priority::PRI1:
150 {
151 int64_t request = _get_used_bytes();
152 return (request > assigned) ? request - assigned : 0;
153 }
154 default:
155 break;
156 }
157 return -EOPNOTSUPP;
158 }
159
160 virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
161 return cache_bytes[pri];
162 }
163
164 virtual int64_t get_cache_bytes() const {
165 int64_t total = 0;
166
167 for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
168 PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
169 total += get_cache_bytes(pri);
170 }
171 return total;
172 }
173
174 virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
175 cache_bytes[pri] = bytes;
176 }
177 virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
178 cache_bytes[pri] += bytes;
179 }
180 virtual int64_t commit_cache_size(uint64_t total_cache) {
181 committed_bytes = PriorityCache::get_chunk(
182 get_cache_bytes(), total_cache);
183 return committed_bytes;
184 }
185 virtual int64_t get_committed_size() const {
186 return committed_bytes;
187 }
188 virtual double get_cache_ratio() const {
189 return cache_ratio;
190 }
191 virtual void set_cache_ratio(double ratio) {
192 cache_ratio = ratio;
193 }
194 virtual string get_cache_name() const = 0;
195 };
196
197 struct IncCache : public OSDMemCache {
198 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
199
200 virtual uint64_t _get_used_bytes() const {
201 return osdmon->inc_osd_cache.get_bytes();
202 }
203
204 virtual string get_cache_name() const {
205 return "OSDMap Inc Cache";
206 }
207
208 uint64_t _get_num_osdmaps() const {
209 return osdmon->inc_osd_cache.get_size();
210 }
211 };
212
213 struct FullCache : public OSDMemCache {
214 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
215
216 virtual uint64_t _get_used_bytes() const {
217 return osdmon->full_osd_cache.get_bytes();
218 }
219
220 virtual string get_cache_name() const {
221 return "OSDMap Full Cache";
222 }
223
224 uint64_t _get_num_osdmaps() const {
225 return osdmon->full_osd_cache.get_size();
226 }
227 };
228
// Shim instances created in the OSDMonitor constructor and later
// registered with the PriorityCache manager (register_cache_with_pcm).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (count of applications,
// keys per application, and key/value length) — presumably enforced
// by the pool-application command handlers later in this file; the
// enforcement sites are outside this view.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
// Decide whether an entity may perform unmanaged-snapshot pool ops.
// Permitted when either:
//  1. its mon caps allow the "osd pool op unmanaged-snap" command
//     (restricted to the pool when pool_name is given; a null
//     pool_name — pool does not exist — requires an unrestricted cap), or
//  2. its OSD caps from the auth db grant write access to all pools or
//     to the named pool (see is_osd_writable()).
// Any failure to fetch/decode/parse the OSD cap data yields false.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
           CommandArgs{} /* pool DNE, require unrestricted cap */ :
           CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // mon caps were not sufficient; fall back to the OSD caps stored in
  // the auth database
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into basic grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
319 } // anonymous namespace
320
// Record that pg <ps> of this pool reported <last_epoch_clean>, and
// maintain the derived state: 'floor' (minimum lec across the pgs
// tracked so far; unreported slots hold 0) and 'next_missing' (first
// ps that has not reported yet).
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // the pg that defined the floor moved past it — rescan for the
      // new minimum (any still-unreported slot keeps the floor at 0)
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported (non-zero)
  // slots starting at this ps
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean bookkeeping for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
// Route a pg's last_epoch_clean report to its pool's tracker, creating
// the per-pool entry on first report.
void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
{
  auto& lec = report_by_pool[pg.pool()];
  return lec.report(pg.ps(), last_epoch_clean);
}
362
// Compute the cluster-wide lower bound on last_epoch_clean across all
// pools in <latest>.  Returns 0 (meaning "no bound yet") if any pool
// has pgs that have not reported clean; otherwise the minimum floor,
// capped by the latest map epoch.
epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
{
  auto floor = latest.get_epoch();
  for (auto& pool : latest.get_pools()) {
    auto reported = report_by_pool.find(pool.first);
    if (reported == report_by_pool.end()) {
      // no pg of this pool has reported at all
      return 0;
    }
    if (reported->second.next_missing < pool.second.get_pg_num()) {
      // some pg of this pool is still missing a report
      return 0;
    }
    if (reported->second.floor < floor) {
      floor = reported->second.floor;
    }
  }
  return floor;
}
380
381 void LastEpochClean::dump(Formatter *f) const
382 {
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393 }
394
// Completion for the async osdmap->pg mapping job: once the mapping
// for <epoch> finishes, refresh the creating-pgs state and wake any
// pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the job was started, for the timing log line
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 presumably means the job was aborted/superseded — skip the
    // update in that case (TODO confirm against ParallelPGMapper)
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Debug-log prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD monitor service: size the inc/full osdmap LRU
// caches from mon_osd_cache_size, create the PriorityCache shims, and
// register as a config observer so cache settings can be retuned at
// runtime.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // non-fatal: fall back to the fixed-size caches without priority
    // cache management
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445 const char **OSDMonitor::get_tracked_conf_keys() const
446 {
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454 }
455
// Config-observer callback for the keys listed in
// get_tracked_conf_keys(): toggles cache autotuning and/or applies new
// memory-target / rocksdb cache sizes.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      // keep running with the previous cache settings
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
477
// Bring pcm-driven cache autotuning in line with the current
// mon_memory_autotune config value: tear the manager down when the
// option is disabled, or (re)register the caches when it is enabled.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
499
// Apply new mon_memory_target / rocksdb_cache_size values at runtime:
// validate them, recompute the cache ratios, and — when autotuning is
// active — push new min/target/max levels into pcm and rebalance.
// Returns -EINVAL (rolling the sizes back) on invalid input or if the
// ratios cannot be set.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the previous sizes so we can roll back on failure
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation (same computation as
  // register_cache_with_pcm())
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
559
// Read the initial cache-related config values.  When autotuning is
// enabled this validates mon_memory_target / mon_osd_cache_size_min
// and seeds the inc/full LRU caches at the minimum size; otherwise the
// fixed-size caches from the constructor are left untouched.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the bluestore-style base/fragmentation knobs for sizing
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
584
// True if the pending incremental already stages a new crush map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
589
// The crush map of the last committed osdmap (ignores any pending
// changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
// Decode into <newcrush> the crush map as it will look after the
// pending increment: the staged pending crush if there is one,
// otherwise a copy of the current committed crush.
void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
{
  bufferlist bl;
  if (pending_inc.crush.length())
    bl = pending_inc.crush;
  else
    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);

  auto p = bl.cbegin();
  newcrush.decode(p);
}
606
// Build osdmap epoch 1 for a brand-new cluster — from an osdmap stashed
// at mkfs time if present, otherwise a simple default map — set the
// initial flags/ratios/release requirements, and stage the encoded map
// as the pending full map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an osdmap was supplied at mkfs time; adopt it (with our fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // full ratios may be configured as percentages (>1); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
666 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
667 {
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
670 s.insert(OSD_METADATA_PREFIX);
671 s.insert(OSD_SNAP_PREFIX);
672 }
673
// Bring the in-memory osdmap up to date with what paxos has committed:
// repair the on-disk 'full_latest' pointer if needed, fast-forward to
// the newest stashed full map, replay the remaining incrementals
// (persisting regenerated full maps as we go and resetting to the
// primary's canonical map on CRC mismatch), then refresh all state
// derived from the map (down/out tracking, subs, logger, failures,
// messenger features, mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a newer map is about to land, so any in-flight mapping job is stale
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // fast-forward to the newest stashed full map instead of replaying
  // every incremental from our current epoch
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // load the persisted creating-pgs state, if any
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // the mkfs seed map has been absorbed; drop it
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction periodically so it does not
    // grow without bound while catching up many epochs
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down -> pending-out tracking with the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
// Create a PriorityCache manager and register the rocksdb kv cache and
// our inc/full osdmap caches with it so their memory is balanced
// against a common target.  Returns -EINVAL if the configured sizes
// are invalid, rocksdb exposes no priority cache, or the ratios cannot
// be computed.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
961
// Split the memory budget: the kv (rocksdb) cache gets
// rocksdb_cache_size / mon_memory_target, and the inc/full osdmap
// caches split the remainder evenly.  Fails with -EINVAL (restoring
// the previous kv ratio) if the kv cache alone would consume the whole
// target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
986
// Kick off the background osdmap->pg mapping job for the current
// epoch, canceling any job still running against an older map.  The
// C_UpdateCreatingPGs completion fires when the job finishes.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
                                       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
             << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    // no pools means nothing to map
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
1007
1008 void OSDMonitor::update_msgr_features()
1009 {
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025 }
1026
// Called when the service (re)enters the active state.  On the leader:
// log the osdmap and run the one-time pool priority conversion.  On a
// peon: re-dispatch any failure reports we were holding so they can be
// forwarded.  Both sides then start a fresh mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
1050
void OSDMonitor::on_restart()
{
  // Drop cached per-OSD report state; presumably these are timestamps of the
  // most recent reports, which would be stale after a restart — verify
  // against last_osd_report's readers.
  last_osd_report.clear();
}
1055
1056 void OSDMonitor::on_shutdown()
1057 {
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069 }
1070
1071 void OSDMonitor::update_logger()
1072 {
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079 }
1080
1081 void OSDMonitor::create_pending()
1082 {
1083 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1084 pending_inc.fsid = mon->monmap->fsid;
1085 pending_metadata.clear();
1086 pending_metadata_rm.clear();
1087 pending_pseudo_purged_snaps.clear();
1088
1089 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1090
1091 // safety checks (this shouldn't really happen)
1092 {
1093 if (osdmap.backfillfull_ratio <= 0) {
1094 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1095 if (pending_inc.new_backfillfull_ratio > 1.0)
1096 pending_inc.new_backfillfull_ratio /= 100;
1097 dout(1) << __func__ << " setting backfillfull_ratio = "
1098 << pending_inc.new_backfillfull_ratio << dendl;
1099 }
1100 if (osdmap.full_ratio <= 0) {
1101 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1102 if (pending_inc.new_full_ratio > 1.0)
1103 pending_inc.new_full_ratio /= 100;
1104 dout(1) << __func__ << " setting full_ratio = "
1105 << pending_inc.new_full_ratio << dendl;
1106 }
1107 if (osdmap.nearfull_ratio <= 0) {
1108 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1109 if (pending_inc.new_nearfull_ratio > 1.0)
1110 pending_inc.new_nearfull_ratio /= 100;
1111 dout(1) << __func__ << " setting nearfull_ratio = "
1112 << pending_inc.new_nearfull_ratio << dendl;
1113 }
1114 }
1115
1116 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1117 // structure.
1118 if (osdmap.crush->has_legacy_rule_ids()) {
1119 CrushWrapper newcrush;
1120 _get_pending_crush(newcrush);
1121
1122 // First, for all pools, work out which rule they really used
1123 // by resolving ruleset to rule.
1124 for (const auto &i : osdmap.get_pools()) {
1125 const auto pool_id = i.first;
1126 const auto &pool = i.second;
1127 int new_rule_id = newcrush.find_rule(pool.crush_rule,
1128 pool.type, pool.size);
1129
1130 dout(1) << __func__ << " rewriting pool "
1131 << osdmap.get_pool_name(pool_id) << " crush ruleset "
1132 << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
1133 if (pending_inc.new_pools.count(pool_id) == 0) {
1134 pending_inc.new_pools[pool_id] = pool;
1135 }
1136 pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
1137 }
1138
1139 // Now, go ahead and renumber all the rules so that their
1140 // rule_id field corresponds to their position in the array
1141 auto old_to_new = newcrush.renumber_rules();
1142 dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
1143 for (const auto &i : old_to_new) {
1144 dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
1145 }
1146 pending_inc.crush.clear();
1147 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
1148 }
1149 }
1150
// Produce the updated creating_pgs_t that should be persisted alongside the
// pending incremental `inc`.  `nextmap` is the map that results from applying
// `inc`.  Works on a private copy of creating_pgs (taken under
// creating_pgs_lock) so the live structure is never mutated here; the caller
// is responsible for persisting/installing the returned value.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools newly created by this
    // incremental for pgs that still need to be created
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // drop creating state for pools deleted by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // drain the per-pool queue into concrete pg entries, but cap the number of
  // concurrently-creating pgs at mon_osd_max_creating_pgs (minimum 1)
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's [start, end) range as the cap allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	// existing entry: detect whether the osdmap -> nextmap transition
	// starts a new peering interval and, if so, record it in the pg's
	// history and remember the new up/acting sets
	std::stringstream debug;
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
// Pre-populate pg_temp mappings for pgs whose acting set will change with the
// pending incremental, so data access can continue with the old acting set
// while peering/backfill catches up.  Either primes every pg ("all") or only
// the pgs touched by interesting OSDs, whichever looks cheaper; work is
// bounded by mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // a new crush map can remap anything, so consider every pg
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  // newly-up osds can pull pgs from anywhere, so also consider every pg
  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds going down (state bit CEPH_OSD_UP flips an osd that is currently up)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // weight reductions shed pgs from just that osd; a weight increase can
  // attract pgs from anywhere, which forces the "all" path
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs on the first interesting osd times the number
    // of interesting osds; if that exceeds the configured fraction of all
    // pgs, the full scan is cheaper
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // build the post-incremental map so we can compare old vs new mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // parallel scan over all pgs, abandoned if it exceeds the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // serial scan over only the interesting osds' pgs, checking the clock
    // every `chunk` pgs to honor the time budget
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  // already handled via another interesting osd
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1440
// Consider priming a pg_temp entry for `pgid` so that, after the pending
// incremental takes effect (producing map `next`), clients keep using the
// current acting set until the new one is ready.  Adds an entry to
// pending_inc.new_pg_temp when it would help; otherwise does nothing.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // pg is still being created; nothing to preserve
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // mapping under the current (pre-incremental) osdmap
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the post-incremental map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  // an empty pg_temp vector is the "remove pg_temp" sentinel
  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace leaves any existing new_pg_temp entry for pgid untouched)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489 /**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494 {
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
1506 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
1520 if (g_conf()->mon_osd_prime_pg_temp) {
1521 maybe_prime_pg_temp();
1522 }
1523 }
1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
1541 ++p;
1542 }
1543 }
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
1548 if (i.first >= osdmap.max_osd) {
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
1552 break;
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
1557 break;
1558 }
1559 }
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1616 }
1617 }
1618
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
1628 &nearfull_pool_ids);
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
1669 }
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1704 }
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1748 }
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 }
1795
1796 // min_compat_client?
1797 if (!tmp.require_min_compat_client) {
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
1800 << "required " << mv << dendl;
1801 mon->clog->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv;
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
1816 }
1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1818 }
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
1826 }
1827 }
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
1911 }
1912
1913 // tell me about it
1914 for (auto i = pending_inc.new_state.begin();
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP) {
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 // Reset laggy parameters if failure interval exceeds a threshold.
1921 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1922 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1923 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1924 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1925 set_default_laggy_params(i->first);
1926 }
1927 }
1928 }
1929 if (s & CEPH_OSD_EXISTS)
1930 dout(2) << " osd." << i->first << " DNE" << dendl;
1931 }
1932 for (auto i = pending_inc.new_up_client.begin();
1933 i != pending_inc.new_up_client.end();
1934 ++i) {
1935 //FIXME: insert cluster addresses too
1936 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1937 }
1938 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1939 i != pending_inc.new_weight.end();
1940 ++i) {
1941 if (i->second == CEPH_OSD_OUT) {
1942 dout(2) << " osd." << i->first << " OUT" << dendl;
1943 } else if (i->second == CEPH_OSD_IN) {
1944 dout(2) << " osd." << i->first << " IN" << dendl;
1945 } else {
1946 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1947 }
1948 }
1949
1950 // features for osdmap and its incremental
1951 uint64_t features;
1952
1953 // encode full map and determine its crc
1954 OSDMap tmp;
1955 {
1956 tmp.deepish_copy_from(osdmap);
1957 tmp.apply_incremental(pending_inc);
1958
1959 // determine appropriate features
1960 features = tmp.get_encoding_features();
1961 dout(10) << __func__ << " encoding full map with "
1962 << tmp.require_osd_release
1963 << " features " << features << dendl;
1964
1965 // the features should be a subset of the mon quorum's features!
1966 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1967
1968 bufferlist fullbl;
1969 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1970 pending_inc.full_crc = tmp.get_crc();
1971
1972 // include full map in the txn. note that old monitors will
1973 // overwrite this. new ones will now skip the local full map
1974 // encode and reload from this.
1975 put_version_full(t, pending_inc.epoch, fullbl);
1976 }
1977
1978 // encode
1979 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1980 bufferlist bl;
1981 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1982
1983 dout(20) << " full_crc " << tmp.get_crc()
1984 << " inc_crc " << pending_inc.inc_crc << dendl;
1985
1986 /* put everything in the transaction */
1987 put_version(t, pending_inc.epoch, bl);
1988 put_last_committed(t, pending_inc.epoch);
1989
1990 // metadata, too!
1991 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1992 p != pending_metadata.end();
1993 ++p)
1994 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1995 for (set<int>::iterator p = pending_metadata_rm.begin();
1996 p != pending_metadata_rm.end();
1997 ++p)
1998 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1999 pending_metadata.clear();
2000 pending_metadata_rm.clear();
2001
2002 // purged_snaps
2003 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2004 !pending_inc.new_purged_snaps.empty()) {
2005 // all snaps purged this epoch (across all pools)
2006 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2007 bufferlist v;
2008 encode(pending_inc.new_purged_snaps, v);
2009 t->put(OSD_SNAP_PREFIX, k, v);
2010 }
2011 for (auto& i : pending_inc.new_purged_snaps) {
2012 for (auto q = i.second.begin();
2013 q != i.second.end();
2014 ++q) {
2015 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2016 pending_inc.epoch,
2017 t);
2018 }
2019 }
2020 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2021 for (auto snap : snaps) {
2022 insert_purged_snap_update(pool, snap, snap + 1,
2023 pending_inc.epoch,
2024 t);
2025 }
2026 }
2027
2028 // health
2029 health_check_map_t next;
2030 tmp.check_health(cct, &next);
2031 encode_health(next, t);
2032 }
2033
2034 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2035 {
2036 bufferlist bl;
2037 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2038 if (r < 0)
2039 return r;
2040 try {
2041 auto p = bl.cbegin();
2042 decode(m, p);
2043 }
2044 catch (buffer::error& e) {
2045 if (err)
2046 *err << "osd." << osd << " metadata is corrupt";
2047 return -EIO;
2048 }
2049 return 0;
2050 }
2051
2052 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2053 {
2054 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2055 if (osdmap.is_up(osd)) {
2056 map<string,string> meta;
2057 load_metadata(osd, meta, nullptr);
2058 auto p = meta.find(field);
2059 if (p == meta.end()) {
2060 (*out)["unknown"]++;
2061 } else {
2062 (*out)[p->second]++;
2063 }
2064 }
2065 }
2066 }
2067
2068 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2069 {
2070 map<string,int> by_val;
2071 count_metadata(field, &by_val);
2072 f->open_object_section(field.c_str());
2073 for (auto& p : by_val) {
2074 f->dump_int(p.first.c_str(), p.second);
2075 }
2076 f->close_section();
2077 }
2078
2079 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2080 {
2081 map<string, string> metadata;
2082 int r = load_metadata(osd, metadata, nullptr);
2083 if (r < 0)
2084 return r;
2085
2086 auto it = metadata.find("osd_objectstore");
2087 if (it == metadata.end())
2088 return -ENOENT;
2089 *type = it->second;
2090 return 0;
2091 }
2092
2093 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2094 const pg_pool_t &pool,
2095 ostream *err)
2096 {
2097 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2098 // since filestore osds could always join the pool later
2099 set<int> checked_osds;
2100 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2101 vector<int> up, acting;
2102 pg_t pgid(ps, pool_id);
2103 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2104 for (int osd : up) {
2105 if (checked_osds.find(osd) != checked_osds.end())
2106 continue;
2107 string objectstore_type;
2108 int r = get_osd_objectstore_type(osd, &objectstore_type);
2109 // allow with missing metadata, e.g. due to an osd never booting yet
2110 if (r < 0 || objectstore_type == "bluestore") {
2111 checked_osds.insert(osd);
2112 continue;
2113 }
2114 *err << "osd." << osd << " uses " << objectstore_type;
2115 return false;
2116 }
2117 }
2118 return true;
2119 }
2120
2121 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2122 {
2123 map<string,string> m;
2124 if (int r = load_metadata(osd, m, err))
2125 return r;
2126 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2127 f->dump_string(p->first.c_str(), p->second);
2128 return 0;
2129 }
2130
2131 void OSDMonitor::print_nodes(Formatter *f)
2132 {
2133 // group OSDs by their hosts
2134 map<string, list<int> > osds; // hostname => osd
2135 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2136 map<string, string> m;
2137 if (load_metadata(osd, m, NULL)) {
2138 continue;
2139 }
2140 map<string, string>::iterator hostname = m.find("hostname");
2141 if (hostname == m.end()) {
2142 // not likely though
2143 continue;
2144 }
2145 osds[hostname->second].push_back(osd);
2146 }
2147
2148 dump_services(f, osds, "osd");
2149 }
2150
void OSDMonitor::share_map_with_random_osd()
{
  // Proactively push the latest incremental map to one randomly-chosen
  // up osd (with an open session) so a newly committed epoch starts
  // propagating through the cluster.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
           << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon->get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2177
version_t OSDMonitor::get_trim_to() const
{
  // Return the lowest osdmap epoch we can safely trim to, or 0 when no
  // trimming should happen right now (no quorum, pgs still being created,
  // trimming blocked by debug config, or not enough epochs on hand).
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // pending pg creations may still reference older maps; don't trim.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
               " ('mon_debug_block_osdmap_trim' set to 'true')"
            << dendl;
    return 0;
  }

  {
    // never trim past the oldest epoch a pg may still be dirty at
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override used to force trimming
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs committed maps around
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only report a trim target that actually advances past what's stored
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2220
2221 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2222 {
2223 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2224 // also scan osd epochs
2225 // don't trim past the oldest reported osd epoch
2226 for (auto& osd_epoch : osd_epochs) {
2227 if (osd_epoch.second < floor &&
2228 osdmap.is_in(osd_epoch.first)) {
2229 floor = osd_epoch.second;
2230 }
2231 }
2232 return floor;
2233 }
2234
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
                                   version_t first)
{
  // When trimming maps, make sure the new oldest retained epoch (`first`)
  // is stored as a *full* map, so readers never need an older base map.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if pruning is underway and this trim advanced past pinned maps, let
  // the prune machinery update its on-disk manifest in the same txn.
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2248
2249
2250 /* full osdmap prune
2251 *
2252 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2253 */
2254
2255 void OSDMonitor::load_osdmap_manifest()
2256 {
2257 bool store_has_manifest =
2258 mon->store->exists(get_service_name(), "osdmap_manifest");
2259
2260 if (!store_has_manifest) {
2261 if (!has_osdmap_manifest) {
2262 return;
2263 }
2264
2265 dout(20) << __func__
2266 << " dropping osdmap manifest from memory." << dendl;
2267 osdmap_manifest = osdmap_manifest_t();
2268 has_osdmap_manifest = false;
2269 return;
2270 }
2271
2272 dout(20) << __func__
2273 << " osdmap manifest detected in store; reload." << dendl;
2274
2275 bufferlist manifest_bl;
2276 int r = get_value("osdmap_manifest", manifest_bl);
2277 if (r < 0) {
2278 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2279 ceph_abort_msg("error reading manifest");
2280 }
2281 osdmap_manifest.decode(manifest_bl);
2282 has_osdmap_manifest = true;
2283
2284 dout(10) << __func__ << " store osdmap manifest pinned ("
2285 << osdmap_manifest.get_first_pinned()
2286 << " .. "
2287 << osdmap_manifest.get_last_pinned()
2288 << ")"
2289 << dendl;
2290 }
2291
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass should run now.  Pruning only
  // makes sense once we hold comfortably more epochs than the minimum we
  // must retain, and only in whole `prune_interval`-sized steps.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // newest epoch we would be allowed to pin; everything above it stays as
  // full maps (only used after the first guard below has passed)
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // previous prune passes already covered everything prunable
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one more full interval fits before last_to_pin
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2351
2352 void OSDMonitor::_prune_update_trimmed(
2353 MonitorDBStore::TransactionRef tx,
2354 version_t first)
2355 {
2356 dout(10) << __func__
2357 << " first " << first
2358 << " last_pinned " << osdmap_manifest.get_last_pinned()
2359 << " last_pinned " << osdmap_manifest.get_last_pinned()
2360 << dendl;
2361
2362 osdmap_manifest_t manifest = osdmap_manifest;
2363
2364 if (!manifest.is_pinned(first)) {
2365 manifest.pin(first);
2366 }
2367
2368 set<version_t>::iterator p_end = manifest.pinned.find(first);
2369 set<version_t>::iterator p = manifest.pinned.begin();
2370 manifest.pinned.erase(p, p_end);
2371 ceph_assert(manifest.get_first_pinned() == first);
2372
2373 if (manifest.get_last_pinned() == first+1 ||
2374 manifest.pinned.size() == 1) {
2375 // we reached the end of the line, as pinned maps go; clean up our
2376 // manifest, and let `should_prune()` decide whether we should prune
2377 // again.
2378 tx->erase(get_service_name(), "osdmap_manifest");
2379 return;
2380 }
2381
2382 bufferlist bl;
2383 manifest.encode(bl);
2384 tx->put(get_service_name(), "osdmap_manifest", bl);
2385 }
2386
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first version to pin for this prune pass:
  // the first committed epoch when starting fresh, or the last pinned
  // epoch when resuming an on-going prune.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
             << " first_pinned " << osdmap_manifest.get_first_pinned()
             << " last_pinned " << osdmap_manifest.get_last_pinned()
             << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2421
2422 bool OSDMonitor::_prune_sanitize_options() const
2423 {
2424 uint64_t prune_interval =
2425 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2426 uint64_t prune_min =
2427 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2428 uint64_t txsize =
2429 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2430
2431 bool r = true;
2432
2433 if (prune_interval == 0) {
2434 derr << __func__
2435 << " prune is enabled BUT prune interval is zero; abort."
2436 << dendl;
2437 r = false;
2438 } else if (prune_interval == 1) {
2439 derr << __func__
2440 << " prune interval is equal to one, which essentially means"
2441 " no pruning; abort."
2442 << dendl;
2443 r = false;
2444 }
2445 if (prune_min == 0) {
2446 derr << __func__
2447 << " prune is enabled BUT prune min is zero; abort."
2448 << dendl;
2449 r = false;
2450 }
2451 if (prune_interval > prune_min) {
2452 derr << __func__
2453 << " impossible to ascertain proper prune interval because"
2454 << " it is greater than the minimum prune epochs"
2455 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2456 << dendl;
2457 r = false;
2458 }
2459
2460 if (txsize < prune_interval - 1) {
2461 derr << __func__
2462 << "'mon_osdmap_full_prune_txsize' (" << txsize
2463 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2464 << "); abort." << dendl;
2465 r = false;
2466 }
2467 return r;
2468 }
2469
2470 bool OSDMonitor::is_prune_enabled() const {
2471 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2472 }
2473
2474 bool OSDMonitor::is_prune_supported() const {
2475 return mon->get_required_mon_features().contains_any(
2476 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2477 }
2478
2479 /** do_prune
2480 *
2481 * @returns true if has side-effects; false otherwise.
2482 */
2483 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2484 {
2485 bool enabled = is_prune_enabled();
2486
2487 dout(1) << __func__ << " osdmap full prune "
2488 << ( enabled ? "enabled" : "disabled")
2489 << dendl;
2490
2491 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2492 return false;
2493 }
2494
2495 // we are beyond the minimum prune versions, we need to remove maps because
2496 // otherwise the store will grow unbounded and we may end up having issues
2497 // with available disk space or store hangs.
2498
2499 // we will not pin all versions. We will leave a buffer number of versions.
2500 // this allows us the monitor to trim maps without caring too much about
2501 // pinned maps, and then allow us to use another ceph-mon without these
2502 // capabilities, without having to repair the store.
2503
2504 osdmap_manifest_t manifest = osdmap_manifest;
2505
2506 version_t first = get_first_committed();
2507 version_t last = get_last_committed();
2508
2509 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2510 version_t last_pinned = manifest.get_last_pinned();
2511 uint64_t prune_interval =
2512 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2513 uint64_t txsize =
2514 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2515
2516 prune_init(manifest);
2517
2518 // we need to get rid of some osdmaps
2519
2520 dout(5) << __func__
2521 << " lc (" << first << " .. " << last << ")"
2522 << " last_pinned " << last_pinned
2523 << " interval " << prune_interval
2524 << " last_to_pin " << last_to_pin
2525 << dendl;
2526
2527 // We will be erasing maps as we go.
2528 //
2529 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2530 //
2531 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2532 // we stop pruning. We could prune the maps between `next_to_pin` and
2533 // `last_to_pin`, but by not doing it we end up with neater pruned
2534 // intervals, aligned with `prune_interval`. Besides, this should not be a
2535 // problem as long as `prune_interval` is set to a sane value, instead of
2536 // hundreds or thousands of maps.
2537
2538 auto map_exists = [this](version_t v) {
2539 string k = mon->store->combine_strings("full", v);
2540 return mon->store->exists(get_service_name(), k);
2541 };
2542
2543 // 'interval' represents the number of maps from the last pinned
2544 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2545 // version 11 next; all intermediate versions will be removed.
2546 //
2547 // 'txsize' represents the maximum number of versions we'll be removing in
2548 // this iteration. If 'txsize' is large enough to perform multiple passes
2549 // pinning and removing maps, we will do so; if not, we'll do at least one
2550 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2551 // ensure that we never go *over* the maximum.
2552
2553 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2554 uint64_t removal_interval = prune_interval - 1;
2555
2556 if (txsize < removal_interval) {
2557 dout(5) << __func__
2558 << " setting txsize to removal interval size ("
2559 << removal_interval << " versions"
2560 << dendl;
2561 txsize = removal_interval;
2562 }
2563 ceph_assert(removal_interval > 0);
2564
2565 uint64_t num_pruned = 0;
2566 while (num_pruned + removal_interval <= txsize) {
2567 last_pinned = manifest.get_last_pinned();
2568
2569 if (last_pinned + prune_interval > last_to_pin) {
2570 break;
2571 }
2572 ceph_assert(last_pinned < last_to_pin);
2573
2574 version_t next_pinned = last_pinned + prune_interval;
2575 ceph_assert(next_pinned <= last_to_pin);
2576 manifest.pin(next_pinned);
2577
2578 dout(20) << __func__
2579 << " last_pinned " << last_pinned
2580 << " next_pinned " << next_pinned
2581 << " num_pruned " << num_pruned
2582 << " removal interval (" << (last_pinned+1)
2583 << ".." << (next_pinned-1) << ")"
2584 << " txsize " << txsize << dendl;
2585
2586 ceph_assert(map_exists(last_pinned));
2587 ceph_assert(map_exists(next_pinned));
2588
2589 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2590 ceph_assert(!manifest.is_pinned(v));
2591
2592 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2593 string full_key = mon->store->combine_strings("full", v);
2594 tx->erase(get_service_name(), full_key);
2595 ++num_pruned;
2596 }
2597 }
2598
2599 ceph_assert(num_pruned > 0);
2600
2601 bufferlist bl;
2602 manifest.encode(bl);
2603 tx->put(get_service_name(), "osdmap_manifest", bl);
2604
2605 return true;
2606 }
2607
2608
2609 // -------------
2610
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // Read-side dispatch for incoming messages.  Returns true when the
  // message was fully handled here; false when it needs a map update and
  // must proceed to prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command payload: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates: these may be answered from the current map without
    // proposing a change
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // a message type we don't handle reaching here is a routing bug
    ceph_abort();
    return true;
  }
}
2666
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Write-side dispatch: apply the message's effect to the pending
  // incremental.  Returns true if a proposal should be triggered.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command payload: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // message types must be filtered by preprocess_query() first
    ceph_abort();
  }

  return false;
}
2718
bool OSDMonitor::should_propose(double& delay)
{
  // Decide whether the pending incremental should be proposed now.
  // May set `delay` to 0 to request an immediate proposal.
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately! any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?  only once a weight is queued for every osd.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  // otherwise defer to the generic paxos service policy
  return PaxosService::should_propose(delay);
}
2739
2740
2741
2742 // ---------------------------
2743 // READs
2744
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an MMonGetOSDMap request with the requested ranges of full and
  // incremental maps, bounded by both a map-count and a byte-size
  // throttle so the reply message stays a manageable size.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the peer's connection features when known; fall back to
  // the quorum features for anonymous connections
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps, clamped to what we actually have committed
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incremental maps, same throttles continue to apply
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2781
2782
2783 // ---------------------------
2784 // UPDATEs
2785
2786 // failure --
2787
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // Validate the sender of an osd-originated message.  Returns true when
  // the message should be REJECTED: no session, missing MON_CAP_X, or an
  // fsid that does not match this cluster.  Returns false when it's ok.
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
            << session->caps << dendl;
    return true;
  }
  if (fsid != mon->monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
            << " != " << mon->monmap->fsid << dendl;
    return true;
  }
  return false;
}
2805
2806
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Filter an incoming MOSDFailure report.  Returns true when it was
  // fully handled here (invalid source, stale, or duplicate); false when
  // it is a new, valid failure report that must go to prepare_failure().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown or out of date; send it newer maps
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target is already down; nothing to do, but catch the reporter up
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // reported address doesn't match the map; the report is stale
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // target came back up since the report's epoch; report is old news
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // e.g. the nodown flag is set; silently drop the report
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2878
// Completion context that acknowledges an MOSDMarkMeDown request once the
// corresponding map update is committed (r == 0), or re-dispatches the op
// when it must be retried (r == -EAGAIN).  Any other result aborts.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // echo the request back as the acknowledgment
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2907
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  // Filter an osd's request to be marked down.  Returns true when handled
  // here (invalid/stale request — acked if requested); false when the
  // request is valid and must proceed to prepare_mark_me_down().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    // requester is unknown, already down, or stale; send newer maps
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even rejected requests get an ack if the sender asked for one, so it
  // does not block waiting forever
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2946
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  // Queue a state change marking the requesting osd down.  Note that
  // new_state holds flag bits to flip: setting CEPH_OSD_UP here results
  // in the osd being reported DOWN when the incremental is encoded.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() already validated these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon->clog->info() << "osd." << target_osd << " marked itself down";
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // ack only once the proposal has committed
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
2962
2963 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2964 {
2965 op->mark_osdmon_event(__func__);
2966 auto m = op->get_req<MOSDMarkMeDead>();
2967 int from = m->target_osd;
2968
2969 // check permissions
2970 if (check_source(op, m->fsid)) {
2971 mon->no_reply(op);
2972 return true;
2973 }
2974
2975 // first, verify the reporting host is valid
2976 if (!m->get_orig_source().is_osd()) {
2977 mon->no_reply(op);
2978 return true;
2979 }
2980
2981 if (!osdmap.exists(from) ||
2982 !osdmap.is_down(from)) {
2983 dout(5) << __func__ << " from nonexistent or up osd." << from
2984 << ", ignoring" << dendl;
2985 send_incremental(op, m->get_epoch()+1);
2986 mon->no_reply(op);
2987 return true;
2988 }
2989
2990 return false;
2991 }
2992
// Record the osd's self-declared "dead" epoch in its extended info and
// propose it.  No reply is sent on success (the osd is shutting down).
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the osd is already down
  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // copy-on-write the xinfo into the pending increment if this epoch has
  // not touched it yet, then stamp the dead epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
      ));
  return true;
}
3018
3019 bool OSDMonitor::can_mark_down(int i)
3020 {
3021 if (osdmap.is_nodown(i)) {
3022 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3023 << "will not mark it down" << dendl;
3024 return false;
3025 }
3026
3027 int num_osds = osdmap.get_num_osds();
3028 if (num_osds == 0) {
3029 dout(5) << __func__ << " no osds" << dendl;
3030 return false;
3031 }
3032 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3033 float up_ratio = (float)up / (float)num_osds;
3034 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3035 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3036 << g_conf()->mon_osd_min_up_ratio
3037 << ", will not mark osd." << i << " down" << dendl;
3038 return false;
3039 }
3040 return true;
3041 }
3042
3043 bool OSDMonitor::can_mark_up(int i)
3044 {
3045 if (osdmap.is_noup(i)) {
3046 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3047 << "will not mark it up" << dendl;
3048 return false;
3049 }
3050
3051 return true;
3052 }
3053
3054 /**
3055 * @note the parameter @p i apparently only exists here so we can output the
3056 * osd's id on messages.
3057 */
3058 bool OSDMonitor::can_mark_out(int i)
3059 {
3060 if (osdmap.is_noout(i)) {
3061 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3062 << "will not mark it out" << dendl;
3063 return false;
3064 }
3065
3066 int num_osds = osdmap.get_num_osds();
3067 if (num_osds == 0) {
3068 dout(5) << __func__ << " no osds" << dendl;
3069 return false;
3070 }
3071 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3072 float in_ratio = (float)in / (float)num_osds;
3073 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3074 if (i >= 0)
3075 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3076 << g_conf()->mon_osd_min_in_ratio
3077 << ", will not mark osd." << i << " out" << dendl;
3078 else
3079 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3080 << g_conf()->mon_osd_min_in_ratio
3081 << ", will not mark osds out" << dendl;
3082 return false;
3083 }
3084
3085 return true;
3086 }
3087
3088 bool OSDMonitor::can_mark_in(int i)
3089 {
3090 if (osdmap.is_noin(i)) {
3091 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3092 << "will not mark it in" << dendl;
3093 return false;
3094 }
3095
3096 return true;
3097 }
3098
// Walk all outstanding failure reports.  Any osd whose reports now meet
// the failure criteria is marked down in pending_inc; records that have
// aged out without succeeding are dropped.  Returns true if at least one
// failure was newly recorded (i.e. the pending map should be proposed).
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // record outlived its usefulness; reporters failed to cancel it
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3120
3121 utime_t OSDMonitor::get_grace_time(utime_t now,
3122 int target_osd,
3123 failure_info_t& fi) const
3124 {
3125 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3126 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3127 return orig_grace;
3128 }
3129 utime_t grace = orig_grace;
3130 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3131 double decay_k = ::log(.5) / halflife;
3132
3133 // scale grace period based on historical probability of 'lagginess'
3134 // (false positive failures due to slowness).
3135 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3136 const utime_t failed_for = now - fi.get_failed_since();
3137 double decay = exp((double)failed_for * decay_k);
3138 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3139 << " failed_for " << failed_for << " decay " << decay << dendl;
3140 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3141 grace += my_grace;
3142
3143 // consider the peers reporting a failure a proxy for a potential
3144 // 'subcluster' over the overall cluster that is similarly
3145 // laggy. this is clearly not true in all cases, but will sometimes
3146 // help us localize the grace correction to a subset of the system
3147 // (say, a rack with a bad switch) that is unhappy.
3148 double peer_grace = 0;
3149 for (auto& [reporter, report] : fi.reporters) {
3150 if (osdmap.exists(reporter)) {
3151 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3152 utime_t elapsed = now - xi.down_stamp;
3153 double decay = exp((double)elapsed * decay_k);
3154 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3155 }
3156 }
3157 peer_grace /= (double)fi.reporters.size();
3158 grace += peer_grace;
3159 dout(10) << " osd." << target_osd << " has "
3160 << fi.reporters.size() << " reporters, "
3161 << grace << " grace (" << orig_grace << " + " << my_grace
3162 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3163 << dendl;
3164
3165 return grace;
3166 }
3167
3168 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3169 {
3170 // already pending failure?
3171 if (pending_inc.new_state.count(target_osd) &&
3172 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3173 dout(10) << " already pending failure" << dendl;
3174 return true;
3175 }
3176
3177 set<string> reporters_by_subtree;
3178 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3179 ceph_assert(fi.reporters.size());
3180 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3181 // get the parent bucket whose type matches with "reporter_subtree_level".
3182 // fall back to OSD if the level doesn't exist.
3183 if (osdmap.exists(p->first)) {
3184 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3185 if (auto iter = reporter_loc.find(reporter_subtree_level);
3186 iter == reporter_loc.end()) {
3187 reporters_by_subtree.insert("osd." + to_string(p->first));
3188 } else {
3189 reporters_by_subtree.insert(iter->second);
3190 }
3191 ++p;
3192 } else {
3193 fi.cancel_report(p->first);;
3194 p = fi.reporters.erase(p);
3195 }
3196 }
3197 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3198 return false;
3199 }
3200 const utime_t failed_for = now - fi.get_failed_since();
3201 const utime_t grace = get_grace_time(now, target_osd, fi);
3202 if (failed_for >= grace) {
3203 dout(1) << " we have enough reporters to mark osd." << target_osd
3204 << " down" << dendl;
3205 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3206
3207 mon->clog->info() << "osd." << target_osd << " failed ("
3208 << osdmap.crush->get_full_location_ordered_string(
3209 target_osd)
3210 << ") ("
3211 << (int)reporters_by_subtree.size()
3212 << " reporters from different "
3213 << reporter_subtree_level << " after "
3214 << failed_for << " >= grace " << grace << ")";
3215 return true;
3216 }
3217 return false;
3218 }
3219
3220 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3221 {
3222 // if it takes too long to either cancel the report to mark the osd down,
3223 // some reporters must have failed to cancel their reports. let's just
3224 // forget these reports.
3225 const utime_t failed_for = now - fi.get_failed_since();
3226 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3227 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3228 return failed_for >= (heartbeat_grace + heartbeat_stale);
3229 }
3230
3231 void OSDMonitor::force_failure(int target_osd, int by)
3232 {
3233 // already pending failure?
3234 if (pending_inc.new_state.count(target_osd) &&
3235 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3236 dout(10) << " already pending failure" << dendl;
3237 return;
3238 }
3239
3240 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3241 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3242 if (!pending_inc.new_xinfo.count(target_osd)) {
3243 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3244 }
3245 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3246
3247 mon->clog->info() << "osd." << target_osd << " failed ("
3248 << osdmap.crush->get_full_location_ordered_string(target_osd)
3249 << ") (connection refused reported by osd." << by << ")";
3250 return;
3251 }
3252
// Apply an MOSDFailure: either register/evaluate a failure report for
// the target osd, or cancel a previously filed report.  Returns true
// when pending_inc was changed and a proposal is needed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess (elsewhere) already verified target is up at these addrs
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  // no direct reply; reporters learn the outcome via the next map
  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time: receipt time minus how long the reporter
    // says the target had already been unresponsive
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // connection refused: skip grace/reporter thresholds entirely
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // may mark the osd down right away if thresholds are already met
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3312
3313 void OSDMonitor::process_failures()
3314 {
3315 map<int,failure_info_t>::iterator p = failure_info.begin();
3316 while (p != failure_info.end()) {
3317 if (osdmap.is_up(p->first)) {
3318 ++p;
3319 } else {
3320 dout(10) << "process_failures osd." << p->first << dendl;
3321 list<MonOpRequestRef> ls;
3322 p->second.take_report_messages(ls);
3323 failure_info.erase(p++);
3324
3325 while (!ls.empty()) {
3326 MonOpRequestRef o = ls.front();
3327 if (o) {
3328 o->mark_event(__func__);
3329 MOSDFailure *m = o->get_req<MOSDFailure>();
3330 send_latest(o, m->get_epoch());
3331 mon->no_reply(o);
3332 }
3333 ls.pop_front();
3334 }
3335 }
3336 }
3337 }
3338
3339 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3340 {
3341 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3342
3343 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3344 p != failure_info.end();
3345 ++p) {
3346 p->second.take_report_messages(ls);
3347 }
3348 failure_info.clear();
3349 }
3350
3351 int OSDMonitor::get_grace_interval_threshold()
3352 {
3353 int halflife = g_conf()->mon_osd_laggy_halflife;
3354 // Scale the halflife period (default: 1_hr) by
3355 // a factor (48) to calculate the threshold.
3356 int grace_threshold_factor = 48;
3357 return halflife * grace_threshold_factor;
3358 }
3359
3360 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3361 {
3362 int grace_interval_threshold_secs = get_grace_interval_threshold();
3363 if (last_failed_interval > grace_interval_threshold_secs) {
3364 dout(1) << " last_failed_interval " << last_failed_interval
3365 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3366 << dendl;
3367 return true;
3368 }
3369 return false;
3370 }
3371
3372 void OSDMonitor::set_default_laggy_params(int target_osd)
3373 {
3374 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3375 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3376 }
3377 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3378 xi.down_stamp = pending_inc.modified;
3379 xi.laggy_probability = 0.0;
3380 xi.laggy_interval = 0;
3381 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3382 }
3383
3384
3385 // boot --
3386
// Filter an MOSDBoot without changing the map.  Drops boots with bad
// caps, wrong cluster fsid, blank addresses, missing required features,
// or feature/release skew; short-circuits duplicate or stale boots by
// replying directly.  Returns true when fully handled here; false to
// fall through to prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?  (same osd id, same public and cluster addrs)
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // uuid mismatch: a different daemon claims this osd id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up_from; just send maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3509
// Apply a validated MOSDBoot.  If the osd is still up under its old
// identity, first queue a mark-down and retry the boot after the
// proposal; otherwise stage the osd's addresses, uuid, metadata, clean
// interval, laggy statistics, features, and (possibly) in-weight in
// pending_inc and reply once committed.  Always returns true.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once the pending increment applies (new_state XORs)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the mark-down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: decay on a clean boot (boot_epoch == 0),
    // otherwise fold the downtime into the laggy interval/probability
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval =  g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight it had before being auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3664
3665 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3666 {
3667 op->mark_osdmon_event(__func__);
3668 auto m = op->get_req<MOSDBoot>();
3669 dout(7) << "_booted " << m->get_orig_source_inst()
3670 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3671
3672 if (logit) {
3673 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3674 << " boot";
3675 }
3676
3677 send_latest(op, m->sb.current_epoch+1);
3678 }
3679
3680
3681 // -------------
3682 // full
3683
// Filter an MOSDFull (osd reporting its nearfull/backfillfull/full
// state).  Drops messages with bad caps or from a stale osd instance;
// replies directly when the map already reflects the requested state.
// Returns true when handled here; false to proceed to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // the sender's addrs must match the osd's most recent (if down) or
  // current (if up) addrs, otherwise this is from an older incarnation
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change: current fullness bits already equal the request
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3734
// Stage the osd's requested fullness bits in pending_inc.  new_state
// entries are XOR masks against the committed map, so the increment is
// computed as (current & mask) ^ wanted.  Replies with the map once the
// proposal commits.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective fullness bits once the pending increment applies
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness flips; we recompute them below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    // a prior proposal already queued this change; just wait for it
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3772
3773 // -------------
3774 // alive
3775
// Filter an MOSDAlive (osd requesting its up_thru be advanced).  Drops
// messages with bad caps or from a down/stale instance; replies directly
// when up_thru is already high enough.  Returns true when handled here;
// false to proceed to prepare_alive().
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3814
// Advance the osd's up_thru in the pending increment and reply with the
// map once the proposal commits.
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  if (0) {  // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3832
3833 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3834 {
3835 op->mark_osdmon_event(__func__);
3836 dout(7) << "_reply_map " << e
3837 << " from " << op->get_req()->get_orig_source_inst()
3838 << dendl;
3839 send_latest(op, e);
3840 }
3841
3842 // pg_created
3843 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3844 {
3845 op->mark_osdmon_event(__func__);
3846 auto m = op->get_req<MOSDPGCreated>();
3847 dout(10) << __func__ << " " << *m << dendl;
3848 auto session = op->get_session();
3849 mon->no_reply(op);
3850 if (!session) {
3851 dout(10) << __func__ << ": no monitor session!" << dendl;
3852 return true;
3853 }
3854 if (!session->is_capable("osd", MON_CAP_X)) {
3855 derr << __func__ << " received from entity "
3856 << "with insufficient privileges " << session->caps << dendl;
3857 return true;
3858 }
3859 // always forward the "created!" to the leader
3860 return false;
3861 }
3862
3863 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3864 {
3865 op->mark_osdmon_event(__func__);
3866 auto m = op->get_req<MOSDPGCreated>();
3867 dout(10) << __func__ << " " << *m << dendl;
3868 auto src = m->get_orig_source();
3869 auto from = src.num();
3870 if (!src.is_osd() ||
3871 !mon->osdmon()->osdmap.is_up(from) ||
3872 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3873 m->get_orig_source_addrs())) {
3874 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3875 return false;
3876 }
3877 pending_created_pgs.push_back(m->pgid);
3878 return true;
3879 }
3880
// Filter an MOSDPGReadyToMerge.  Drops messages with bad caps, for a
// nonexistent pool, or whose pgid does not match the pool's current
// pg_num / pg_num_pending merge point (stale or racing notifications).
// Returns true when dropped here; false to proceed to
// prepare_pg_ready_to_merge().
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge already happened (pg_num shrank past this pg)
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the highest pg (pg_num - 1) may be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // merge for this pg has not actually been queued (pg_num_pending)
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3920
// Apply a pg merge-readiness report to the pending map: either commit
// the merge (drop the source pg) or back it off, and always force
// pre-nautilus clients to resend ops.  Always returns true (a proposal
// is needed).
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending copy of the pool if one exists, else the
  // committed one
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    // pg_num changed underneath us; retry once this proposal lands
    dout(10) << __func__
             << " race with concurrent pg_num[_pending] update, will retry"
             << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // commit the merge: drop the source pg and record the interval
    // bounds reported by the OSD
    p.dec_pg_num(m->pgid,
                 pending_inc.epoch,
                 m->source_version,
                 m->target_version,
                 m->last_epoch_started,
                 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: with the configured probability, bounce pg_num back
  // up via a self-issued "osd pool set" command to abort the merge
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
               osdmap.get_pool_name(m->pgid.pool()) +
               "\", \"var\": \"pg_num_actual\", \"val\": \"" +
               stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3978
3979
3980 // -------------
3981 // pg_temp changes
3982
// Decide whether an MOSDPGTemp message requires a paxos proposal.
// Returns true when the message can be answered or dropped without a
// map change; returns false to forward it to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be an up OSD whose address matches the current map
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced request always goes through the proposal path
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing would change; just tell the OSD about the current epoch
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon->no_reply(op);
  return true;
}
4076
4077 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4078 {
4079 epoch_t old_up_thru = osdmap.get_up_thru(from);
4080 auto ut = pending_inc.new_up_thru.find(from);
4081 if (ut != pending_inc.new_up_thru.end()) {
4082 old_up_thru = ut->second;
4083 }
4084 if (up_thru > old_up_thru) {
4085 // set up_thru too, so the osd doesn't have to ask again
4086 pending_inc.new_up_thru[from] = up_thru;
4087 }
4088 }
4089
4090 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4091 {
4092 op->mark_osdmon_event(__func__);
4093 auto m = op->get_req<MOSDPGTemp>();
4094 int from = m->get_orig_source().num();
4095 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4096 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4097 uint64_t pool = p->first.pool();
4098 if (pending_inc.old_pools.count(pool)) {
4099 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4100 << ": pool pending removal" << dendl;
4101 continue;
4102 }
4103 if (!osdmap.have_pg_pool(pool)) {
4104 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4105 << ": pool has been removed" << dendl;
4106 continue;
4107 }
4108 pending_inc.new_pg_temp[p->first] =
4109 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4110
4111 // unconditionally clear pg_primary (until this message can encode
4112 // a change for that, too.. at which point we need to also fix
4113 // preprocess_pg_temp)
4114 if (osdmap.primary_temp->count(p->first) ||
4115 pending_inc.new_primary_temp.count(p->first))
4116 pending_inc.new_primary_temp[p->first] = -1;
4117 }
4118
4119 // set up_thru too, so the osd doesn't have to ask again
4120 update_up_thru(from, m->map_epoch);
4121
4122 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4123 return true;
4124 }
4125
4126
4127 // ---
4128
// Pre-check an MRemoveSnaps request.  If every listed snap is already
// recorded as removed, ack (octopus+ peers only) and consume the
// message; otherwise return false so prepare_remove_snaps() can queue
// the removals.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // any snap that is newer than the pool's snap_seq or not yet removed
  // requires a map update; forward to the prepare path
  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // everything already removed; octopus+ senders expect an explicit ack
  // echoing the snap list
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4178
// Queue deletion of the requested snaps: for each snap not already
// removed (in the committed map, the pending pool copy, or the pending
// removed-snaps queue), update the pending pool state and add the snap
// to new_removed_snaps.  Octopus+ senders get an ack once the proposal
// commits.  Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps that are already removed anywhere (committed,
      // pending pool copy, or pending removed-snaps queue)
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          // pre-octopus clusters track removed snaps on the pool itself
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // keep snap_seq monotonically ahead of any removed snap
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // ack after the proposal commits (octopus+ peers only)
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4227
4228 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4229 {
4230 op->mark_osdmon_event(__func__);
4231 auto m = op->get_req<MMonGetPurgedSnaps>();
4232 dout(7) << __func__ << " " << *m << dendl;
4233
4234 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4235
4236 string k = make_purged_snap_epoch_key(m->start);
4237 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4238 it->upper_bound(k);
4239 unsigned long epoch = m->last;
4240 while (it->valid()) {
4241 if (it->key().find("purged_epoch_") != 0) {
4242 break;
4243 }
4244 string k = it->key();
4245 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4246 if (n != 1) {
4247 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4248 } else if (epoch > m->last) {
4249 break;
4250 } else {
4251 bufferlist bl = it->value();
4252 auto p = bl.cbegin();
4253 auto &v = r[epoch];
4254 try {
4255 ceph::decode(v, p);
4256 } catch (buffer::error& e) {
4257 derr << __func__ << " unable to parse value for key '" << it->key()
4258 << "': \n";
4259 bl.hexdump(*_dout);
4260 *_dout << dendl;
4261 }
4262 n += 4 + v.size() * 16;
4263 }
4264 if (n > 1048576) {
4265 // impose a semi-arbitrary limit to message size
4266 break;
4267 }
4268 it->next();
4269 }
4270
4271 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4272 reply->purged_snaps.swap(r);
4273 mon->send_reply(op, reply.detach());
4274
4275 return true;
4276 }
4277
4278 // osd beacon
4279 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4280 {
4281 op->mark_osdmon_event(__func__);
4282 // check caps
4283 auto session = op->get_session();
4284 mon->no_reply(op);
4285 if (!session) {
4286 dout(10) << __func__ << " no monitor session!" << dendl;
4287 return true;
4288 }
4289 if (!session->is_capable("osd", MON_CAP_X)) {
4290 derr << __func__ << " received from entity "
4291 << "with insufficient privileges " << session->caps << dendl;
4292 return true;
4293 }
4294 // Always forward the beacon to the leader, even if they are the same as
4295 // the old one. The leader will mark as down osds that haven't sent
4296 // beacon for a few minutes.
4297 return false;
4298 }
4299
// Leader-side handling of an OSD beacon: refresh in-memory liveness and
// epoch bookkeeping for the sender.  Returns true only when the osdmap
// itself must change (a newer last_purged_snaps_scrub stamp needs to be
// persisted), which triggers a proposal.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // accept only from an OSD that is up and whose recorded address
  // matches the sender
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory bookkeeping only; these do not dirty the pending map
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer last_purged_snaps_scrub stamp via the pending
  // incremental
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4340
4341 // ---------------
4342 // map helpers
4343
4344 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4345 {
4346 op->mark_osdmon_event(__func__);
4347 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4348 << " start " << start << dendl;
4349 if (start == 0)
4350 send_full(op);
4351 else
4352 send_incremental(op, start);
4353 }
4354
4355
4356 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4357 {
4358 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4359 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4360 r->oldest_map = get_first_committed();
4361 r->newest_map = osdmap.get_epoch();
4362 return r;
4363 }
4364
// Build an MOSDMap carrying incrementals for epochs [from..to], encoded
// for 'features'.  If the incremental for some epoch is not stored,
// embed the full map for that epoch instead; aborts if neither exists.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; the e > 0 guard protects against epoch_t underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental stored; fall back to the full map for this epoch
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
        //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort(); // we should have all maps.
      }
    }
  }
  return m;
}
4398
4399 void OSDMonitor::send_full(MonOpRequestRef op)
4400 {
4401 op->mark_osdmon_event(__func__);
4402 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4403 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
4404 }
4405
4406 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4407 {
4408 op->mark_osdmon_event(__func__);
4409
4410 MonSession *s = op->get_session();
4411 ceph_assert(s);
4412
4413 if (s->proxy_con) {
4414 // oh, we can tell the other mon to do it
4415 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4416 << first << dendl;
4417 MRoute *r = new MRoute(s->proxy_tid, NULL);
4418 r->send_osdmap_first = first;
4419 s->proxy_con->send_message(r);
4420 op->mark_event("reply: send routed send_osdmap_first reply");
4421 } else {
4422 // do it ourselves
4423 send_incremental(first, s, false, op);
4424 }
4425 }
4426
// Send osdmaps [first..current] to 'session', encoded for the peer's
// features.  When 'req' is given this is a routed reply: exactly one
// message is sent and the loop stops.  Otherwise maps are streamed in
// batches unless 'onetime' is set.  session->osd_epoch tracks what the
// peer has been sent so far.
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // don't resend epochs the peer is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  // if the requested range predates our committed history, start the
  // peer off with the oldest full map we still have
  if (first < get_first_committed()) {
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // routed reply: one message only
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  // stream the remainder in osd_map_message_max-sized batches
  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps. it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // 'first' is deliberately not advanced in the req case; the break
    // below ends the loop after the single routed reply
    if (onetime || req)
      break;
  }
}
4489
4490 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4491 {
4492 return get_version(ver, mon->get_quorum_con_features(), bl);
4493 }
4494
// Re-encode an incremental map blob in place so it is encoded with (a
// subset of) the requested feature bits; any embedded full map or crush
// map is re-encoded as well.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
           << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4522
4523 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4524 {
4525 OSDMap m;
4526 auto q = bl.cbegin();
4527 m.decode(q);
4528 // always encode with subset of osdmap's canonical features
4529 uint64_t f = features & m.get_encoding_features();
4530 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4531 << dendl;
4532 bl.clear();
4533 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4534 }
4535
4536 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4537 {
4538 uint64_t significant_features = OSDMap::get_significant_features(features);
4539 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4540 return 0;
4541 }
4542 int ret = PaxosService::get_version(ver, bl);
4543 if (ret < 0) {
4544 return ret;
4545 }
4546 // NOTE: this check is imprecise; the OSDMap encoding features may
4547 // be a subset of the latest mon quorum features, but worst case we
4548 // reencode once and then cache the (identical) result under both
4549 // feature masks.
4550 if (significant_features !=
4551 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4552 reencode_incremental_map(bl, features);
4553 }
4554 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4555 return 0;
4556 }
4557
4558 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4559 {
4560 bufferlist inc_bl;
4561 int err = get_version(ver, inc_bl);
4562 ceph_assert(err == 0);
4563 ceph_assert(inc_bl.length());
4564
4565 auto p = inc_bl.cbegin();
4566 inc.decode(p);
4567 dout(10) << __func__ << " "
4568 << " epoch " << inc.epoch
4569 << " inc_crc " << inc.inc_crc
4570 << " full_crc " << inc.full_crc
4571 << " encode_features " << inc.encode_features << dendl;
4572 return 0;
4573 }
4574
// Rebuild the full osdmap for epoch 'ver' from the closest pinned full
// map at or below it plus the subsequent incrementals.  Used when the
// verbatim full map has been trimmed away per the osdmap manifest.
// Returns 0 with the encoded map in 'bl', or -ENOENT when no pinned map
// is available.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map between the pinned epoch and the target so
  // fewer incrementals need to be applied
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                              &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4674
4675 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4676 {
4677 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4678 }
4679
4680 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4681 bufferlist& bl)
4682 {
4683 uint64_t significant_features = OSDMap::get_significant_features(features);
4684 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4685 return 0;
4686 }
4687 int ret = PaxosService::get_version_full(ver, bl);
4688 if (ret == -ENOENT) {
4689 // build map?
4690 ret = get_full_from_pinned_map(ver, bl);
4691 }
4692 if (ret < 0) {
4693 return ret;
4694 }
4695 // NOTE: this check is imprecise; the OSDMap encoding features may
4696 // be a subset of the latest mon quorum features, but worst case we
4697 // reencode once and then cache the (identical) result under both
4698 // feature masks.
4699 if (significant_features !=
4700 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4701 reencode_full_map(bl, features);
4702 }
4703 full_osd_cache.add_bytes({ver, significant_features}, bl);
4704 return 0;
4705 }
4706
4707 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4708 {
4709 dout(10) << "blacklist " << av << " until " << until << dendl;
4710 for (auto a : av.v) {
4711 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4712 a.set_type(entity_addr_t::TYPE_ANY);
4713 } else {
4714 a.set_type(entity_addr_t::TYPE_LEGACY);
4715 }
4716 pending_inc.new_blacklist[a] = until;
4717 }
4718 return pending_inc.epoch;
4719 }
4720
4721 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4722 {
4723 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4724 a.set_type(entity_addr_t::TYPE_ANY);
4725 } else {
4726 a.set_type(entity_addr_t::TYPE_LEGACY);
4727 }
4728 dout(10) << "blacklist " << a << " until " << until << dendl;
4729 pending_inc.new_blacklist[a] = until;
4730 return pending_inc.epoch;
4731 }
4732
4733
4734 void OSDMonitor::check_osdmap_subs()
4735 {
4736 dout(10) << __func__ << dendl;
4737 if (!osdmap.get_epoch()) {
4738 return;
4739 }
4740 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4741 if (osdmap_subs == mon->session_map.subs.end()) {
4742 return;
4743 }
4744 auto p = osdmap_subs->second->begin();
4745 while (!p.end()) {
4746 auto sub = *p;
4747 ++p;
4748 check_osdmap_sub(sub);
4749 }
4750 }
4751
4752 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4753 {
4754 dout(10) << __func__ << " " << sub << " next " << sub->next
4755 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4756 if (sub->next <= osdmap.get_epoch()) {
4757 if (sub->next >= 1)
4758 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4759 else
4760 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4761 if (sub->onetime)
4762 mon->session_map.remove_sub(sub);
4763 else
4764 sub->next = osdmap.get_epoch() + 1;
4765 }
4766 }
4767
4768 void OSDMonitor::check_pg_creates_subs()
4769 {
4770 if (!osdmap.get_num_up_osds()) {
4771 return;
4772 }
4773 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4774 mon->with_session_map([this](const MonSessionMap& session_map) {
4775 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4776 if (pg_creates_subs == session_map.subs.end()) {
4777 return;
4778 }
4779 for (auto sub : *pg_creates_subs->second) {
4780 check_pg_creates_sub(sub);
4781 }
4782 });
4783 }
4784
4785 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4786 {
4787 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4788 ceph_assert(sub->type == "osd_pg_creates");
4789 // only send these if the OSD is up. we will check_subs() when they do
4790 // come up so they will get the creates then.
4791 if (sub->session->name.is_osd() &&
4792 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4793 sub->next = send_pg_creates(sub->session->name.num(),
4794 sub->session->con.get(),
4795 sub->next);
4796 }
4797 }
4798
// Record (pool, application) metadata on the pending incremental.  With
// an empty app_key only the application name is enabled; otherwise the
// key/value pair is stored under the app, overwriting an existing key
// only when 'force' is set.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
                                       const std::string &app_key,
                                       const std::string &app_value,
                                       bool force)
{
  // caller must have the proposal plugged and the service writeable
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the pending copy of the pool if there is one
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; map::insert is a no-op
    // if the app is already present
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() will not overwrite an existing application entry
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4832
4833 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4834 pool_opts_t::key_t opt,
4835 pool_opts_t::value_t val)
4836 {
4837 auto p = pending_inc.new_pools.try_emplace(
4838 pool_id, *osdmap.get_pg_pool(pool_id));
4839 p.first->second.opts.set(opt, val);
4840 }
4841
// Scan 'pools' for pools whose pg creations still need to be queued and
// add them to 'creating_pgs'.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    // skip pools whose crush rule cannot be resolved; no mapping means
    // nothing can be created
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
                                         pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // only (re)queue pools that changed since the last scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
               << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
               << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
             << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
                              created, modified);
    queued++;
  }
  return queued;
}
4881
// Rebuild creating_pgs_by_osd_epoch (osd -> epoch -> pgs to create) from the
// authoritative creating_pgs set and the current OSDMap mapping.  For each
// creating pg we decide which epoch to advertise in the create message: keep
// the previously advertised epoch if the pg is still targeted at the same
// acting primary, otherwise bump it to the current mapping epoch so the new
// target gets a fresh message.  Holds creating_pgs_lock for the rebuild.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    // drop pgs that no longer exist in the current map (e.g. pool deleted)
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default to the epoch recorded when the pg's creation was queued
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the previously advertised epoch
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    // acting_primary may still be -1 (no primary mapped yet); the pg is then
    // parked under osd -1 until a later mapping pass finds a target
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4929
// Send any queued pg-create messages for 'osd' over connection 'con',
// covering queued epochs >= 'next'.  All creates are batched into a single
// message: MOSDPGCreate2 normally, or the legacy MOSDPGCreate when the
// cluster still allows pre-nautilus OSDs.  Returns the epoch through which
// the osd's subscription is now current (last queued epoch + 1), or 'next'
// unchanged when nothing was sent.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  // only epochs at or beyond the subscriber's cursor ('next') are of interest
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// legacy message, allocated lazily on first use
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	// history/past_intervals are only attached when a created epoch was
	// recorded for the pg (e.g. pgs split from existing ones)
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // ownership of m/oldm passes to the messenger on send
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5001
5002 // TICK
5003
5004
// Periodic housekeeping for the osd subsystem.  Every monitor refreshes the
// osdmap manifest and, when tcmalloc autotuning is enabled, rebalances the
// priority cache.  Everything else runs on the leader only: beacon-timeout
// marking, failure checks, prune triggering, auto-out of long-down osds,
// blacklist expiry, purged-snap pruning and pool status — ending with a
// paxos proposal if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
		 << " inc cache_bytes: " << inc_cache->get_cache_bytes()
		 << " inc comtd_bytes: " << inc_cache->get_committed_size()
		 << " inc used_bytes: " << inc_cache->_get_used_bytes()
		 << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
		 << dendl;
      dout(10) << "tick balancer "
		 << " full cache_bytes: " << full_cache->get_cache_bytes()
		 << " full comtd_bytes: " << full_cache->get_committed_size()
		 << " full used_bytes: " << full_cache->_get_used_bytes()
		 << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
		 << dendl;
    }
  }

  // only the leader mutates pending_inc / proposes below this point
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;   // how long this osd has been down
      ++i;                 // advance before possibly erasing 'o' below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;   // not out yet; keep it in down_pending_out
      }

      // osd is no longer a candidate (back up, already out, or just marked
      // out above): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5169
5170 void OSDMonitor::_set_new_cache_sizes()
5171 {
5172 uint64_t cache_size = 0;
5173 int64_t inc_alloc = 0;
5174 int64_t full_alloc = 0;
5175 int64_t kv_alloc = 0;
5176
5177 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5178 cache_size = pcm->get_tuned_mem();
5179 inc_alloc = inc_cache->get_committed_size();
5180 full_alloc = full_cache->get_committed_size();
5181 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5182 }
5183
5184 inc_osd_cache.set_bytes(inc_alloc);
5185 full_osd_cache.set_bytes(full_alloc);
5186
5187 dout(1) << __func__ << " cache_size:" << cache_size
5188 << " inc_alloc: " << inc_alloc
5189 << " full_alloc: " << full_alloc
5190 << " kv_alloc: " << kv_alloc
5191 << dendl;
5192 }
5193
5194 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5195 std::map<int,utime_t> &last_osd_report)
5196 {
5197 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5198 if (now - mon->get_leader_since() < timeo) {
5199 // We haven't been the leader for long enough to consider OSD timeouts
5200 return false;
5201 }
5202
5203 int max_osd = osdmap.get_max_osd();
5204 bool new_down = false;
5205
5206 for (int i=0; i < max_osd; ++i) {
5207 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5208 if (!osdmap.exists(i)) {
5209 last_osd_report.erase(i); // if any
5210 continue;
5211 }
5212 if (!osdmap.is_up(i))
5213 continue;
5214 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5215 if (t == last_osd_report.end()) {
5216 // it wasn't in the map; start the timer.
5217 last_osd_report[i] = now;
5218 } else if (can_mark_down(i)) {
5219 utime_t diff = now - t->second;
5220 if (diff > timeo) {
5221 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5222 << diff << " seconds";
5223 derr << "no beacon from osd." << i << " since " << t->second
5224 << ", " << diff << " seconds ago. marking down" << dendl;
5225 pending_inc.new_state[i] = CEPH_OSD_UP;
5226 new_down = true;
5227 }
5228 }
5229 }
5230 return new_down;
5231 }
5232
5233 static void dump_cpu_list(Formatter *f, const char *name,
5234 const string& strlist)
5235 {
5236 cpu_set_t cpu_set;
5237 size_t cpu_set_size;
5238 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5239 return;
5240 }
5241 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5242 f->open_array_section(name);
5243 for (auto cpu : cpus) {
5244 f->dump_int("cpu", cpu);
5245 }
5246 f->close_section();
5247 }
5248
// Dump the monitor's view of the osd subsystem for introspection: the full
// osdmap, per-osd metadata, clean-epoch tracking, commit bounds, the crush
// map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // one entry per existing osd id; per-osd metadata errors are not reported
  // here (NULL error stream)
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  // per-osd view of the last epoch each osd is known to have
  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only emitted when an osdmap manifest has been loaded
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5297
namespace {
  // Identifiers for the fields understood by "osd pool get"/"osd pool set";
  // used below to validate and dispatch requested pool variables.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the members of 'first' that do not appear in 'second'
  // (set difference: first \ second).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> difference;
    for (const auto& choice : first) {
      if (second.count(choice) == 0) {
	difference.insert(choice);
      }
    }
    return difference;
  }
}
5331
5332
5333 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5334 {
5335 op->mark_osdmon_event(__func__);
5336 auto m = op->get_req<MMonCommand>();
5337 int r = 0;
5338 bufferlist rdata;
5339 stringstream ss, ds;
5340
5341 cmdmap_t cmdmap;
5342 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5343 string rs = ss.str();
5344 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5345 return true;
5346 }
5347
5348 MonSession *session = op->get_session();
5349 if (!session) {
5350 derr << __func__ << " no session" << dendl;
5351 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5352 return true;
5353 }
5354
5355 string prefix;
5356 cmd_getval(cmdmap, "prefix", prefix);
5357
5358 string format;
5359 cmd_getval(cmdmap, "format", format, string("plain"));
5360 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5361
5362 if (prefix == "osd stat") {
5363 if (f) {
5364 f->open_object_section("osdmap");
5365 osdmap.print_summary(f.get(), ds, "", true);
5366 f->close_section();
5367 f->flush(rdata);
5368 } else {
5369 osdmap.print_summary(nullptr, ds, "", true);
5370 rdata.append(ds);
5371 }
5372 }
5373 else if (prefix == "osd dump" ||
5374 prefix == "osd tree" ||
5375 prefix == "osd tree-from" ||
5376 prefix == "osd ls" ||
5377 prefix == "osd getmap" ||
5378 prefix == "osd getcrushmap" ||
5379 prefix == "osd ls-tree" ||
5380 prefix == "osd info") {
5381 string val;
5382
5383 epoch_t epoch = 0;
5384 int64_t epochnum;
5385 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5386 epoch = epochnum;
5387
5388 bufferlist osdmap_bl;
5389 int err = get_version_full(epoch, osdmap_bl);
5390 if (err == -ENOENT) {
5391 r = -ENOENT;
5392 ss << "there is no map for epoch " << epoch;
5393 goto reply;
5394 }
5395 ceph_assert(err == 0);
5396 ceph_assert(osdmap_bl.length());
5397
5398 OSDMap *p;
5399 if (epoch == osdmap.get_epoch()) {
5400 p = &osdmap;
5401 } else {
5402 p = new OSDMap;
5403 p->decode(osdmap_bl);
5404 }
5405
5406 auto sg = make_scope_guard([&] {
5407 if (p != &osdmap) {
5408 delete p;
5409 }
5410 });
5411
5412 if (prefix == "osd dump") {
5413 stringstream ds;
5414 if (f) {
5415 f->open_object_section("osdmap");
5416 p->dump(f.get());
5417 f->close_section();
5418 f->flush(ds);
5419 } else {
5420 p->print(ds);
5421 }
5422 rdata.append(ds);
5423 if (!f)
5424 ds << " ";
5425 } else if (prefix == "osd ls") {
5426 if (f) {
5427 f->open_array_section("osds");
5428 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5429 if (osdmap.exists(i)) {
5430 f->dump_int("osd", i);
5431 }
5432 }
5433 f->close_section();
5434 f->flush(ds);
5435 } else {
5436 bool first = true;
5437 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5438 if (osdmap.exists(i)) {
5439 if (!first)
5440 ds << "\n";
5441 first = false;
5442 ds << i;
5443 }
5444 }
5445 }
5446 rdata.append(ds);
5447 } else if (prefix == "osd info") {
5448 int64_t osd_id;
5449 bool do_single_osd = true;
5450 if (!cmd_getval(cmdmap, "id", osd_id)) {
5451 do_single_osd = false;
5452 }
5453
5454 if (do_single_osd && !osdmap.exists(osd_id)) {
5455 ss << "osd." << osd_id << " does not exist";
5456 r = -EINVAL;
5457 goto reply;
5458 }
5459
5460 if (f) {
5461 if (do_single_osd) {
5462 osdmap.dump_osd(osd_id, f.get());
5463 } else {
5464 osdmap.dump_osds(f.get());
5465 }
5466 f->flush(ds);
5467 } else {
5468 if (do_single_osd) {
5469 osdmap.print_osd(osd_id, ds);
5470 } else {
5471 osdmap.print_osds(ds);
5472 }
5473 }
5474 rdata.append(ds);
5475 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5476 string bucket;
5477 if (prefix == "osd tree-from") {
5478 cmd_getval(cmdmap, "bucket", bucket);
5479 if (!osdmap.crush->name_exists(bucket)) {
5480 ss << "bucket '" << bucket << "' does not exist";
5481 r = -ENOENT;
5482 goto reply;
5483 }
5484 int id = osdmap.crush->get_item_id(bucket);
5485 if (id >= 0) {
5486 ss << "\"" << bucket << "\" is not a bucket";
5487 r = -EINVAL;
5488 goto reply;
5489 }
5490 }
5491
5492 vector<string> states;
5493 cmd_getval(cmdmap, "states", states);
5494 unsigned filter = 0;
5495 for (auto& s : states) {
5496 if (s == "up") {
5497 filter |= OSDMap::DUMP_UP;
5498 } else if (s == "down") {
5499 filter |= OSDMap::DUMP_DOWN;
5500 } else if (s == "in") {
5501 filter |= OSDMap::DUMP_IN;
5502 } else if (s == "out") {
5503 filter |= OSDMap::DUMP_OUT;
5504 } else if (s == "destroyed") {
5505 filter |= OSDMap::DUMP_DESTROYED;
5506 } else {
5507 ss << "unrecognized state '" << s << "'";
5508 r = -EINVAL;
5509 goto reply;
5510 }
5511 }
5512 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5513 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5514 ss << "cannot specify both 'in' and 'out'";
5515 r = -EINVAL;
5516 goto reply;
5517 }
5518 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5519 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5520 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5521 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5522 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5523 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5524 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5525 r = -EINVAL;
5526 goto reply;
5527 }
5528 if (f) {
5529 f->open_object_section("tree");
5530 p->print_tree(f.get(), NULL, filter, bucket);
5531 f->close_section();
5532 f->flush(ds);
5533 } else {
5534 p->print_tree(NULL, &ds, filter, bucket);
5535 }
5536 rdata.append(ds);
5537 } else if (prefix == "osd getmap") {
5538 rdata.append(osdmap_bl);
5539 ss << "got osdmap epoch " << p->get_epoch();
5540 } else if (prefix == "osd getcrushmap") {
5541 p->crush->encode(rdata, mon->get_quorum_con_features());
5542 ss << p->get_crush_version();
5543 } else if (prefix == "osd ls-tree") {
5544 string bucket_name;
5545 cmd_getval(cmdmap, "name", bucket_name);
5546 set<int> osds;
5547 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5548 if (r == -ENOENT) {
5549 ss << "\"" << bucket_name << "\" does not exist";
5550 goto reply;
5551 } else if (r < 0) {
5552 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5553 goto reply;
5554 }
5555
5556 if (f) {
5557 f->open_array_section("osds");
5558 for (auto &i : osds) {
5559 if (osdmap.exists(i)) {
5560 f->dump_int("osd", i);
5561 }
5562 }
5563 f->close_section();
5564 f->flush(ds);
5565 } else {
5566 bool first = true;
5567 for (auto &i : osds) {
5568 if (osdmap.exists(i)) {
5569 if (!first)
5570 ds << "\n";
5571 first = false;
5572 ds << i;
5573 }
5574 }
5575 }
5576
5577 rdata.append(ds);
5578 }
5579 } else if (prefix == "osd getmaxosd") {
5580 if (f) {
5581 f->open_object_section("getmaxosd");
5582 f->dump_unsigned("epoch", osdmap.get_epoch());
5583 f->dump_int("max_osd", osdmap.get_max_osd());
5584 f->close_section();
5585 f->flush(rdata);
5586 } else {
5587 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5588 rdata.append(ds);
5589 }
5590 } else if (prefix == "osd utilization") {
5591 string out;
5592 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5593 if (f)
5594 f->flush(rdata);
5595 else
5596 rdata.append(out);
5597 r = 0;
5598 goto reply;
5599 } else if (prefix == "osd find") {
5600 int64_t osd;
5601 if (!cmd_getval(cmdmap, "id", osd)) {
5602 ss << "unable to parse osd id value '"
5603 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5604 r = -EINVAL;
5605 goto reply;
5606 }
5607 if (!osdmap.exists(osd)) {
5608 ss << "osd." << osd << " does not exist";
5609 r = -ENOENT;
5610 goto reply;
5611 }
5612 string format;
5613 cmd_getval(cmdmap, "format", format);
5614 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5615 f->open_object_section("osd_location");
5616 f->dump_int("osd", osd);
5617 f->dump_object("addrs", osdmap.get_addrs(osd));
5618 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5619
5620 // try to identify host, pod/container name, etc.
5621 map<string,string> m;
5622 load_metadata(osd, m, nullptr);
5623 if (auto p = m.find("hostname"); p != m.end()) {
5624 f->dump_string("host", p->second);
5625 }
5626 for (auto& k : {
5627 "pod_name", "pod_namespace", // set by rook
5628 "container_name" // set by cephadm, ceph-ansible
5629 }) {
5630 if (auto p = m.find(k); p != m.end()) {
5631 f->dump_string(k, p->second);
5632 }
5633 }
5634
5635 // crush is helpful too
5636 f->open_object_section("crush_location");
5637 map<string,string> loc = osdmap.crush->get_full_location(osd);
5638 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5639 f->dump_string(p->first.c_str(), p->second);
5640 f->close_section();
5641 f->close_section();
5642 f->flush(rdata);
5643 } else if (prefix == "osd metadata") {
5644 int64_t osd = -1;
5645 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5646 !cmd_getval(cmdmap, "id", osd)) {
5647 ss << "unable to parse osd id value '"
5648 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5649 r = -EINVAL;
5650 goto reply;
5651 }
5652 if (osd >= 0 && !osdmap.exists(osd)) {
5653 ss << "osd." << osd << " does not exist";
5654 r = -ENOENT;
5655 goto reply;
5656 }
5657 string format;
5658 cmd_getval(cmdmap, "format", format);
5659 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5660 if (osd >= 0) {
5661 f->open_object_section("osd_metadata");
5662 f->dump_unsigned("id", osd);
5663 r = dump_osd_metadata(osd, f.get(), &ss);
5664 if (r < 0)
5665 goto reply;
5666 f->close_section();
5667 } else {
5668 r = 0;
5669 f->open_array_section("osd_metadata");
5670 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5671 if (osdmap.exists(i)) {
5672 f->open_object_section("osd");
5673 f->dump_unsigned("id", i);
5674 r = dump_osd_metadata(i, f.get(), NULL);
5675 if (r == -EINVAL || r == -ENOENT) {
5676 // Drop error, continue to get other daemons' metadata
5677 dout(4) << "No metadata for osd." << i << dendl;
5678 r = 0;
5679 } else if (r < 0) {
5680 // Unexpected error
5681 goto reply;
5682 }
5683 f->close_section();
5684 }
5685 }
5686 f->close_section();
5687 }
5688 f->flush(rdata);
5689 } else if (prefix == "osd versions") {
5690 if (!f)
5691 f.reset(Formatter::create("json-pretty"));
5692 count_metadata("ceph_version", f.get());
5693 f->flush(rdata);
5694 r = 0;
5695 } else if (prefix == "osd count-metadata") {
5696 if (!f)
5697 f.reset(Formatter::create("json-pretty"));
5698 string field;
5699 cmd_getval(cmdmap, "property", field);
5700 count_metadata(field, f.get());
5701 f->flush(rdata);
5702 r = 0;
5703 } else if (prefix == "osd numa-status") {
5704 TextTable tbl;
5705 if (f) {
5706 f->open_array_section("osds");
5707 } else {
5708 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5709 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5710 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5711 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5712 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5713 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5714 }
5715 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5716 if (osdmap.exists(i)) {
5717 map<string,string> m;
5718 ostringstream err;
5719 if (load_metadata(i, m, &err) < 0) {
5720 continue;
5721 }
5722 string host;
5723 auto p = m.find("hostname");
5724 if (p != m.end()) {
5725 host = p->second;
5726 }
5727 if (f) {
5728 f->open_object_section("osd");
5729 f->dump_int("osd", i);
5730 f->dump_string("host", host);
5731 for (auto n : { "network_numa_node", "objectstore_numa_node",
5732 "numa_node" }) {
5733 p = m.find(n);
5734 if (p != m.end()) {
5735 f->dump_int(n, atoi(p->second.c_str()));
5736 }
5737 }
5738 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5739 p = m.find(n);
5740 if (p != m.end()) {
5741 list<string> ls = get_str_list(p->second, ",");
5742 f->open_array_section(n);
5743 for (auto node : ls) {
5744 f->dump_int("node", atoi(node.c_str()));
5745 }
5746 f->close_section();
5747 }
5748 }
5749 for (auto n : { "numa_node_cpus" }) {
5750 p = m.find(n);
5751 if (p != m.end()) {
5752 dump_cpu_list(f.get(), n, p->second);
5753 }
5754 }
5755 f->close_section();
5756 } else {
5757 tbl << i;
5758 tbl << host;
5759 p = m.find("network_numa_nodes");
5760 if (p != m.end()) {
5761 tbl << p->second;
5762 } else {
5763 tbl << "-";
5764 }
5765 p = m.find("objectstore_numa_nodes");
5766 if (p != m.end()) {
5767 tbl << p->second;
5768 } else {
5769 tbl << "-";
5770 }
5771 p = m.find("numa_node");
5772 auto q = m.find("numa_node_cpus");
5773 if (p != m.end() && q != m.end()) {
5774 tbl << p->second;
5775 tbl << q->second;
5776 } else {
5777 tbl << "-";
5778 tbl << "-";
5779 }
5780 tbl << TextTable::endrow;
5781 }
5782 }
5783 }
5784 if (f) {
5785 f->close_section();
5786 f->flush(rdata);
5787 } else {
5788 rdata.append(stringify(tbl));
5789 }
5790 } else if (prefix == "osd map") {
5791 string poolstr, objstr, namespacestr;
5792 cmd_getval(cmdmap, "pool", poolstr);
5793 cmd_getval(cmdmap, "object", objstr);
5794 cmd_getval(cmdmap, "nspace", namespacestr);
5795
5796 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5797 if (pool < 0) {
5798 ss << "pool " << poolstr << " does not exist";
5799 r = -ENOENT;
5800 goto reply;
5801 }
5802 object_locator_t oloc(pool, namespacestr);
5803 object_t oid(objstr);
5804 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5805 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5806 vector<int> up, acting;
5807 int up_p, acting_p;
5808 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5809
5810 string fullobjname;
5811 if (!namespacestr.empty())
5812 fullobjname = namespacestr + string("/") + oid.name;
5813 else
5814 fullobjname = oid.name;
5815 if (f) {
5816 f->open_object_section("osd_map");
5817 f->dump_unsigned("epoch", osdmap.get_epoch());
5818 f->dump_string("pool", poolstr);
5819 f->dump_int("pool_id", pool);
5820 f->dump_stream("objname") << fullobjname;
5821 f->dump_stream("raw_pgid") << pgid;
5822 f->dump_stream("pgid") << mpgid;
5823 f->open_array_section("up");
5824 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5825 f->dump_int("osd", *p);
5826 f->close_section();
5827 f->dump_int("up_primary", up_p);
5828 f->open_array_section("acting");
5829 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5830 f->dump_int("osd", *p);
5831 f->close_section();
5832 f->dump_int("acting_primary", acting_p);
5833 f->close_section(); // osd_map
5834 f->flush(rdata);
5835 } else {
5836 ds << "osdmap e" << osdmap.get_epoch()
5837 << " pool '" << poolstr << "' (" << pool << ")"
5838 << " object '" << fullobjname << "' ->"
5839 << " pg " << pgid << " (" << mpgid << ")"
5840 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5841 << pg_vector_string(acting) << ", p" << acting_p << ")";
5842 rdata.append(ds);
5843 }
5844
5845 } else if (prefix == "pg map") {
5846 pg_t pgid;
5847 string pgidstr;
5848 cmd_getval(cmdmap, "pgid", pgidstr);
5849 if (!pgid.parse(pgidstr.c_str())) {
5850 ss << "invalid pgid '" << pgidstr << "'";
5851 r = -EINVAL;
5852 goto reply;
5853 }
5854 vector<int> up, acting;
5855 if (!osdmap.have_pg_pool(pgid.pool())) {
5856 ss << "pg '" << pgidstr << "' does not exist";
5857 r = -ENOENT;
5858 goto reply;
5859 }
5860 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5861 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5862 if (f) {
5863 f->open_object_section("pg_map");
5864 f->dump_unsigned("epoch", osdmap.get_epoch());
5865 f->dump_stream("raw_pgid") << pgid;
5866 f->dump_stream("pgid") << mpgid;
5867 f->open_array_section("up");
5868 for (auto osd : up) {
5869 f->dump_int("up_osd", osd);
5870 }
5871 f->close_section();
5872 f->open_array_section("acting");
5873 for (auto osd : acting) {
5874 f->dump_int("acting_osd", osd);
5875 }
5876 f->close_section();
5877 f->close_section();
5878 f->flush(rdata);
5879 } else {
5880 ds << "osdmap e" << osdmap.get_epoch()
5881 << " pg " << pgid << " (" << mpgid << ")"
5882 << " -> up " << up << " acting " << acting;
5883 rdata.append(ds);
5884 }
5885 goto reply;
5886
5887 } else if (prefix == "osd lspools") {
5888 if (f)
5889 f->open_array_section("pools");
5890 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5891 p != osdmap.pools.end();
5892 ++p) {
5893 if (f) {
5894 f->open_object_section("pool");
5895 f->dump_int("poolnum", p->first);
5896 f->dump_string("poolname", osdmap.pool_name[p->first]);
5897 f->close_section();
5898 } else {
5899 ds << p->first << ' ' << osdmap.pool_name[p->first];
5900 if (next(p) != osdmap.pools.end()) {
5901 ds << '\n';
5902 }
5903 }
5904 }
5905 if (f) {
5906 f->close_section();
5907 f->flush(ds);
5908 }
5909 rdata.append(ds);
5910 } else if (prefix == "osd blacklist ls") {
5911 if (f)
5912 f->open_array_section("blacklist");
5913
5914 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5915 p != osdmap.blacklist.end();
5916 ++p) {
5917 if (f) {
5918 f->open_object_section("entry");
5919 f->dump_string("addr", p->first.get_legacy_str());
5920 f->dump_stream("until") << p->second;
5921 f->close_section();
5922 } else {
5923 stringstream ss;
5924 string s;
5925 ss << p->first << " " << p->second;
5926 getline(ss, s);
5927 s += "\n";
5928 rdata.append(s);
5929 }
5930 }
5931 if (f) {
5932 f->close_section();
5933 f->flush(rdata);
5934 }
5935 ss << "listed " << osdmap.blacklist.size() << " entries";
5936
5937 } else if (prefix == "osd pool ls") {
5938 string detail;
5939 cmd_getval(cmdmap, "detail", detail);
5940 if (!f && detail == "detail") {
5941 ostringstream ss;
5942 osdmap.print_pools(ss);
5943 rdata.append(ss.str());
5944 } else {
5945 if (f)
5946 f->open_array_section("pools");
5947 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5948 it != osdmap.get_pools().end();
5949 ++it) {
5950 if (f) {
5951 if (detail == "detail") {
5952 f->open_object_section("pool");
5953 f->dump_int("pool_id", it->first);
5954 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5955 it->second.dump(f.get());
5956 f->close_section();
5957 } else {
5958 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5959 }
5960 } else {
5961 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5962 }
5963 }
5964 if (f) {
5965 f->close_section();
5966 f->flush(rdata);
5967 }
5968 }
5969
5970 } else if (prefix == "osd crush get-tunable") {
5971 string tunable;
5972 cmd_getval(cmdmap, "tunable", tunable);
5973 ostringstream rss;
5974 if (f)
5975 f->open_object_section("tunable");
5976 if (tunable == "straw_calc_version") {
5977 if (f)
5978 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5979 else
5980 rss << osdmap.crush->get_straw_calc_version() << "\n";
5981 } else {
5982 r = -EINVAL;
5983 goto reply;
5984 }
5985 if (f) {
5986 f->close_section();
5987 f->flush(rdata);
5988 } else {
5989 rdata.append(rss.str());
5990 }
5991 r = 0;
5992
5993 } else if (prefix == "osd pool get") {
5994 string poolstr;
5995 cmd_getval(cmdmap, "pool", poolstr);
5996 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5997 if (pool < 0) {
5998 ss << "unrecognized pool '" << poolstr << "'";
5999 r = -ENOENT;
6000 goto reply;
6001 }
6002
6003 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6004 string var;
6005 cmd_getval(cmdmap, "var", var);
6006
6007 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6008 const choices_map_t ALL_CHOICES = {
6009 {"size", SIZE},
6010 {"min_size", MIN_SIZE},
6011 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6012 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
6013 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6014 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6015 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6016 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6017 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6018 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6019 {"use_gmt_hitset", USE_GMT_HITSET},
6020 {"target_max_objects", TARGET_MAX_OBJECTS},
6021 {"target_max_bytes", TARGET_MAX_BYTES},
6022 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6023 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6024 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6025 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6026 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6027 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6028 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6029 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6030 {"fast_read", FAST_READ},
6031 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6032 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6033 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6034 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6035 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6036 {"recovery_priority", RECOVERY_PRIORITY},
6037 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6038 {"scrub_priority", SCRUB_PRIORITY},
6039 {"compression_mode", COMPRESSION_MODE},
6040 {"compression_algorithm", COMPRESSION_ALGORITHM},
6041 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6042 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6043 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6044 {"csum_type", CSUM_TYPE},
6045 {"csum_max_block", CSUM_MAX_BLOCK},
6046 {"csum_min_block", CSUM_MIN_BLOCK},
6047 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6048 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6049 {"pg_num_min", PG_NUM_MIN},
6050 {"target_size_bytes", TARGET_SIZE_BYTES},
6051 {"target_size_ratio", TARGET_SIZE_RATIO},
6052 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6053 };
6054
6055 typedef std::set<osd_pool_get_choices> choices_set_t;
6056
6057 const choices_set_t ONLY_TIER_CHOICES = {
6058 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6059 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6060 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6061 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6062 MIN_READ_RECENCY_FOR_PROMOTE,
6063 MIN_WRITE_RECENCY_FOR_PROMOTE,
6064 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6065 };
6066 const choices_set_t ONLY_ERASURE_CHOICES = {
6067 EC_OVERWRITES, ERASURE_CODE_PROFILE
6068 };
6069
6070 choices_set_t selected_choices;
6071 if (var == "all") {
6072 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6073 it != ALL_CHOICES.end(); ++it) {
6074 selected_choices.insert(it->second);
6075 }
6076
6077 if(!p->is_tier()) {
6078 selected_choices = subtract_second_from_first(selected_choices,
6079 ONLY_TIER_CHOICES);
6080 }
6081
6082 if(!p->is_erasure()) {
6083 selected_choices = subtract_second_from_first(selected_choices,
6084 ONLY_ERASURE_CHOICES);
6085 }
6086 } else /* var != "all" */ {
6087 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6088 osd_pool_get_choices selected = found->second;
6089
6090 if (!p->is_tier() &&
6091 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6092 ss << "pool '" << poolstr
6093 << "' is not a tier pool: variable not applicable";
6094 r = -EACCES;
6095 goto reply;
6096 }
6097
6098 if (!p->is_erasure() &&
6099 ONLY_ERASURE_CHOICES.find(selected)
6100 != ONLY_ERASURE_CHOICES.end()) {
6101 ss << "pool '" << poolstr
6102 << "' is not a erasure pool: variable not applicable";
6103 r = -EACCES;
6104 goto reply;
6105 }
6106
6107 if (pool_opts_t::is_opt_name(var) &&
6108 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6109 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6110 r = -ENOENT;
6111 goto reply;
6112 }
6113
6114 selected_choices.insert(selected);
6115 }
6116
6117 if (f) {
6118 f->open_object_section("pool");
6119 f->dump_string("pool", poolstr);
6120 f->dump_int("pool_id", pool);
6121 for(choices_set_t::const_iterator it = selected_choices.begin();
6122 it != selected_choices.end(); ++it) {
6123 choices_map_t::const_iterator i;
6124 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6125 if (i->second == *it) {
6126 break;
6127 }
6128 }
6129 ceph_assert(i != ALL_CHOICES.end());
6130 switch(*it) {
6131 case PG_NUM:
6132 f->dump_int("pg_num", p->get_pg_num());
6133 break;
6134 case PGP_NUM:
6135 f->dump_int("pgp_num", p->get_pgp_num());
6136 break;
6137 case SIZE:
6138 f->dump_int("size", p->get_size());
6139 break;
6140 case MIN_SIZE:
6141 f->dump_int("min_size", p->get_min_size());
6142 break;
6143 case CRUSH_RULE:
6144 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6145 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6146 p->get_crush_rule()));
6147 } else {
6148 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6149 }
6150 break;
6151 case EC_OVERWRITES:
6152 f->dump_bool("allow_ec_overwrites",
6153 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6154 break;
6155 case PG_AUTOSCALE_MODE:
6156 f->dump_string("pg_autoscale_mode",
6157 pg_pool_t::get_pg_autoscale_mode_name(
6158 p->pg_autoscale_mode));
6159 break;
6160 case HASHPSPOOL:
6161 case NODELETE:
6162 case NOPGCHANGE:
6163 case NOSIZECHANGE:
6164 case WRITE_FADVISE_DONTNEED:
6165 case NOSCRUB:
6166 case NODEEP_SCRUB:
6167 f->dump_bool(i->first.c_str(),
6168 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6169 break;
6170 case HIT_SET_PERIOD:
6171 f->dump_int("hit_set_period", p->hit_set_period);
6172 break;
6173 case HIT_SET_COUNT:
6174 f->dump_int("hit_set_count", p->hit_set_count);
6175 break;
6176 case HIT_SET_TYPE:
6177 f->dump_string("hit_set_type",
6178 HitSet::get_type_name(p->hit_set_params.get_type()));
6179 break;
6180 case HIT_SET_FPP:
6181 {
6182 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6183 BloomHitSet::Params *bloomp =
6184 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6185 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6186 } else if(var != "all") {
6187 f->close_section();
6188 ss << "hit set is not of type Bloom; " <<
6189 "invalid to get a false positive rate!";
6190 r = -EINVAL;
6191 goto reply;
6192 }
6193 }
6194 break;
6195 case USE_GMT_HITSET:
6196 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6197 break;
6198 case TARGET_MAX_OBJECTS:
6199 f->dump_unsigned("target_max_objects", p->target_max_objects);
6200 break;
6201 case TARGET_MAX_BYTES:
6202 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6203 break;
6204 case CACHE_TARGET_DIRTY_RATIO:
6205 f->dump_unsigned("cache_target_dirty_ratio_micro",
6206 p->cache_target_dirty_ratio_micro);
6207 f->dump_float("cache_target_dirty_ratio",
6208 ((float)p->cache_target_dirty_ratio_micro/1000000));
6209 break;
6210 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6211 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6212 p->cache_target_dirty_high_ratio_micro);
6213 f->dump_float("cache_target_dirty_high_ratio",
6214 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6215 break;
6216 case CACHE_TARGET_FULL_RATIO:
6217 f->dump_unsigned("cache_target_full_ratio_micro",
6218 p->cache_target_full_ratio_micro);
6219 f->dump_float("cache_target_full_ratio",
6220 ((float)p->cache_target_full_ratio_micro/1000000));
6221 break;
6222 case CACHE_MIN_FLUSH_AGE:
6223 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6224 break;
6225 case CACHE_MIN_EVICT_AGE:
6226 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6227 break;
6228 case ERASURE_CODE_PROFILE:
6229 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6230 break;
6231 case MIN_READ_RECENCY_FOR_PROMOTE:
6232 f->dump_int("min_read_recency_for_promote",
6233 p->min_read_recency_for_promote);
6234 break;
6235 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6236 f->dump_int("min_write_recency_for_promote",
6237 p->min_write_recency_for_promote);
6238 break;
6239 case FAST_READ:
6240 f->dump_int("fast_read", p->fast_read);
6241 break;
6242 case HIT_SET_GRADE_DECAY_RATE:
6243 f->dump_int("hit_set_grade_decay_rate",
6244 p->hit_set_grade_decay_rate);
6245 break;
6246 case HIT_SET_SEARCH_LAST_N:
6247 f->dump_int("hit_set_search_last_n",
6248 p->hit_set_search_last_n);
6249 break;
6250 case SCRUB_MIN_INTERVAL:
6251 case SCRUB_MAX_INTERVAL:
6252 case DEEP_SCRUB_INTERVAL:
6253 case RECOVERY_PRIORITY:
6254 case RECOVERY_OP_PRIORITY:
6255 case SCRUB_PRIORITY:
6256 case COMPRESSION_MODE:
6257 case COMPRESSION_ALGORITHM:
6258 case COMPRESSION_REQUIRED_RATIO:
6259 case COMPRESSION_MAX_BLOB_SIZE:
6260 case COMPRESSION_MIN_BLOB_SIZE:
6261 case CSUM_TYPE:
6262 case CSUM_MAX_BLOCK:
6263 case CSUM_MIN_BLOCK:
6264 case FINGERPRINT_ALGORITHM:
6265 case PG_NUM_MIN:
6266 case TARGET_SIZE_BYTES:
6267 case TARGET_SIZE_RATIO:
6268 case PG_AUTOSCALE_BIAS:
6269 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6270 if (p->opts.is_set(key)) {
6271 if(*it == CSUM_TYPE) {
6272 int64_t val;
6273 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6274 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6275 } else {
6276 p->opts.dump(i->first, f.get());
6277 }
6278 }
6279 break;
6280 }
6281 }
6282 f->close_section();
6283 f->flush(rdata);
6284 } else /* !f */ {
6285 for(choices_set_t::const_iterator it = selected_choices.begin();
6286 it != selected_choices.end(); ++it) {
6287 choices_map_t::const_iterator i;
6288 switch(*it) {
6289 case PG_NUM:
6290 ss << "pg_num: " << p->get_pg_num() << "\n";
6291 break;
6292 case PGP_NUM:
6293 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6294 break;
6295 case SIZE:
6296 ss << "size: " << p->get_size() << "\n";
6297 break;
6298 case MIN_SIZE:
6299 ss << "min_size: " << p->get_min_size() << "\n";
6300 break;
6301 case CRUSH_RULE:
6302 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6303 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6304 p->get_crush_rule()) << "\n";
6305 } else {
6306 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6307 }
6308 break;
6309 case PG_AUTOSCALE_MODE:
6310 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6311 p->pg_autoscale_mode) <<"\n";
6312 break;
6313 case HIT_SET_PERIOD:
6314 ss << "hit_set_period: " << p->hit_set_period << "\n";
6315 break;
6316 case HIT_SET_COUNT:
6317 ss << "hit_set_count: " << p->hit_set_count << "\n";
6318 break;
6319 case HIT_SET_TYPE:
6320 ss << "hit_set_type: " <<
6321 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6322 break;
6323 case HIT_SET_FPP:
6324 {
6325 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6326 BloomHitSet::Params *bloomp =
6327 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6328 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6329 } else if(var != "all") {
6330 ss << "hit set is not of type Bloom; " <<
6331 "invalid to get a false positive rate!";
6332 r = -EINVAL;
6333 goto reply;
6334 }
6335 }
6336 break;
6337 case USE_GMT_HITSET:
6338 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6339 break;
6340 case TARGET_MAX_OBJECTS:
6341 ss << "target_max_objects: " << p->target_max_objects << "\n";
6342 break;
6343 case TARGET_MAX_BYTES:
6344 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6345 break;
6346 case CACHE_TARGET_DIRTY_RATIO:
6347 ss << "cache_target_dirty_ratio: "
6348 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6349 break;
6350 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6351 ss << "cache_target_dirty_high_ratio: "
6352 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6353 break;
6354 case CACHE_TARGET_FULL_RATIO:
6355 ss << "cache_target_full_ratio: "
6356 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6357 break;
6358 case CACHE_MIN_FLUSH_AGE:
6359 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6360 break;
6361 case CACHE_MIN_EVICT_AGE:
6362 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6363 break;
6364 case ERASURE_CODE_PROFILE:
6365 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6366 break;
6367 case MIN_READ_RECENCY_FOR_PROMOTE:
6368 ss << "min_read_recency_for_promote: " <<
6369 p->min_read_recency_for_promote << "\n";
6370 break;
6371 case HIT_SET_GRADE_DECAY_RATE:
6372 ss << "hit_set_grade_decay_rate: " <<
6373 p->hit_set_grade_decay_rate << "\n";
6374 break;
6375 case HIT_SET_SEARCH_LAST_N:
6376 ss << "hit_set_search_last_n: " <<
6377 p->hit_set_search_last_n << "\n";
6378 break;
6379 case EC_OVERWRITES:
6380 ss << "allow_ec_overwrites: " <<
6381 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6382 "\n";
6383 break;
6384 case HASHPSPOOL:
6385 case NODELETE:
6386 case NOPGCHANGE:
6387 case NOSIZECHANGE:
6388 case WRITE_FADVISE_DONTNEED:
6389 case NOSCRUB:
6390 case NODEEP_SCRUB:
6391 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6392 if (i->second == *it)
6393 break;
6394 }
6395 ceph_assert(i != ALL_CHOICES.end());
6396 ss << i->first << ": " <<
6397 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6398 "true" : "false") << "\n";
6399 break;
6400 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6401 ss << "min_write_recency_for_promote: " <<
6402 p->min_write_recency_for_promote << "\n";
6403 break;
6404 case FAST_READ:
6405 ss << "fast_read: " << p->fast_read << "\n";
6406 break;
6407 case SCRUB_MIN_INTERVAL:
6408 case SCRUB_MAX_INTERVAL:
6409 case DEEP_SCRUB_INTERVAL:
6410 case RECOVERY_PRIORITY:
6411 case RECOVERY_OP_PRIORITY:
6412 case SCRUB_PRIORITY:
6413 case COMPRESSION_MODE:
6414 case COMPRESSION_ALGORITHM:
6415 case COMPRESSION_REQUIRED_RATIO:
6416 case COMPRESSION_MAX_BLOB_SIZE:
6417 case COMPRESSION_MIN_BLOB_SIZE:
6418 case CSUM_TYPE:
6419 case CSUM_MAX_BLOCK:
6420 case CSUM_MIN_BLOCK:
6421 case FINGERPRINT_ALGORITHM:
6422 case PG_NUM_MIN:
6423 case TARGET_SIZE_BYTES:
6424 case TARGET_SIZE_RATIO:
6425 case PG_AUTOSCALE_BIAS:
6426 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6427 if (i->second == *it)
6428 break;
6429 }
6430 ceph_assert(i != ALL_CHOICES.end());
6431 {
6432 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6433 if (p->opts.is_set(key)) {
6434 if(key == pool_opts_t::CSUM_TYPE) {
6435 int64_t val;
6436 p->opts.get(key, &val);
6437 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6438 } else {
6439 ss << i->first << ": " << p->opts.get(key) << "\n";
6440 }
6441 }
6442 }
6443 break;
6444 }
6445 rdata.append(ss.str());
6446 ss.str("");
6447 }
6448 }
6449 r = 0;
6450 } else if (prefix == "osd pool get-quota") {
6451 string pool_name;
6452 cmd_getval(cmdmap, "pool", pool_name);
6453
6454 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6455 if (poolid < 0) {
6456 ceph_assert(poolid == -ENOENT);
6457 ss << "unrecognized pool '" << pool_name << "'";
6458 r = -ENOENT;
6459 goto reply;
6460 }
6461 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6462 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6463 const object_stat_sum_t& sum = pstat->stats.sum;
6464 if (f) {
6465 f->open_object_section("pool_quotas");
6466 f->dump_string("pool_name", pool_name);
6467 f->dump_unsigned("pool_id", poolid);
6468 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6469 f->dump_int("current_num_objects", sum.num_objects);
6470 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6471 f->dump_int("current_num_bytes", sum.num_bytes);
6472 f->close_section();
6473 f->flush(rdata);
6474 } else {
6475 stringstream rs;
6476 rs << "quotas for pool '" << pool_name << "':\n"
6477 << " max objects: ";
6478 if (p->quota_max_objects == 0)
6479 rs << "N/A";
6480 else {
6481 rs << si_u_t(p->quota_max_objects) << " objects";
6482 rs << " (current num objects: " << sum.num_objects << " objects)";
6483 }
6484 rs << "\n"
6485 << " max bytes : ";
6486 if (p->quota_max_bytes == 0)
6487 rs << "N/A";
6488 else {
6489 rs << byte_u_t(p->quota_max_bytes);
6490 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6491 }
6492 rdata.append(rs.str());
6493 }
6494 rdata.append("\n");
6495 r = 0;
6496 } else if (prefix == "osd crush rule list" ||
6497 prefix == "osd crush rule ls") {
6498 if (f) {
6499 f->open_array_section("rules");
6500 osdmap.crush->list_rules(f.get());
6501 f->close_section();
6502 f->flush(rdata);
6503 } else {
6504 ostringstream ss;
6505 osdmap.crush->list_rules(&ss);
6506 rdata.append(ss.str());
6507 }
6508 } else if (prefix == "osd crush rule ls-by-class") {
6509 string class_name;
6510 cmd_getval(cmdmap, "class", class_name);
6511 if (class_name.empty()) {
6512 ss << "no class specified";
6513 r = -EINVAL;
6514 goto reply;
6515 }
6516 set<int> rules;
6517 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6518 if (r < 0) {
6519 ss << "failed to get rules by class '" << class_name << "'";
6520 goto reply;
6521 }
6522 if (f) {
6523 f->open_array_section("rules");
6524 for (auto &rule: rules) {
6525 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6526 }
6527 f->close_section();
6528 f->flush(rdata);
6529 } else {
6530 ostringstream rs;
6531 for (auto &rule: rules) {
6532 rs << osdmap.crush->get_rule_name(rule) << "\n";
6533 }
6534 rdata.append(rs.str());
6535 }
6536 } else if (prefix == "osd crush rule dump") {
6537 string name;
6538 cmd_getval(cmdmap, "name", name);
6539 string format;
6540 cmd_getval(cmdmap, "format", format);
6541 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6542 if (name == "") {
6543 f->open_array_section("rules");
6544 osdmap.crush->dump_rules(f.get());
6545 f->close_section();
6546 } else {
6547 int ruleno = osdmap.crush->get_rule_id(name);
6548 if (ruleno < 0) {
6549 ss << "unknown crush rule '" << name << "'";
6550 r = ruleno;
6551 goto reply;
6552 }
6553 osdmap.crush->dump_rule(ruleno, f.get());
6554 }
6555 ostringstream rs;
6556 f->flush(rs);
6557 rs << "\n";
6558 rdata.append(rs.str());
6559 } else if (prefix == "osd crush dump") {
6560 string format;
6561 cmd_getval(cmdmap, "format", format);
6562 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6563 f->open_object_section("crush_map");
6564 osdmap.crush->dump(f.get());
6565 f->close_section();
6566 ostringstream rs;
6567 f->flush(rs);
6568 rs << "\n";
6569 rdata.append(rs.str());
6570 } else if (prefix == "osd crush show-tunables") {
6571 string format;
6572 cmd_getval(cmdmap, "format", format);
6573 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6574 f->open_object_section("crush_map_tunables");
6575 osdmap.crush->dump_tunables(f.get());
6576 f->close_section();
6577 ostringstream rs;
6578 f->flush(rs);
6579 rs << "\n";
6580 rdata.append(rs.str());
6581 } else if (prefix == "osd crush tree") {
6582 string shadow;
6583 cmd_getval(cmdmap, "shadow", shadow);
6584 bool show_shadow = shadow == "--show-shadow";
6585 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6586 if (f) {
6587 f->open_object_section("crush_tree");
6588 osdmap.crush->dump_tree(nullptr,
6589 f.get(),
6590 osdmap.get_pool_names(),
6591 show_shadow);
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 ostringstream ss;
6596 osdmap.crush->dump_tree(&ss,
6597 nullptr,
6598 osdmap.get_pool_names(),
6599 show_shadow);
6600 rdata.append(ss.str());
6601 }
6602 } else if (prefix == "osd crush ls") {
6603 string name;
6604 if (!cmd_getval(cmdmap, "node", name)) {
6605 ss << "no node specified";
6606 r = -EINVAL;
6607 goto reply;
6608 }
6609 if (!osdmap.crush->name_exists(name)) {
6610 ss << "node '" << name << "' does not exist";
6611 r = -ENOENT;
6612 goto reply;
6613 }
6614 int id = osdmap.crush->get_item_id(name);
6615 list<int> result;
6616 if (id >= 0) {
6617 result.push_back(id);
6618 } else {
6619 int num = osdmap.crush->get_bucket_size(id);
6620 for (int i = 0; i < num; ++i) {
6621 result.push_back(osdmap.crush->get_bucket_item(id, i));
6622 }
6623 }
6624 if (f) {
6625 f->open_array_section("items");
6626 for (auto i : result) {
6627 f->dump_string("item", osdmap.crush->get_item_name(i));
6628 }
6629 f->close_section();
6630 f->flush(rdata);
6631 } else {
6632 ostringstream ss;
6633 for (auto i : result) {
6634 ss << osdmap.crush->get_item_name(i) << "\n";
6635 }
6636 rdata.append(ss.str());
6637 }
6638 r = 0;
6639 } else if (prefix == "osd crush class ls") {
6640 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6641 f->open_array_section("crush_classes");
6642 for (auto i : osdmap.crush->class_name)
6643 f->dump_string("class", i.second);
6644 f->close_section();
6645 f->flush(rdata);
6646 } else if (prefix == "osd crush class ls-osd") {
6647 string name;
6648 cmd_getval(cmdmap, "class", name);
6649 set<int> osds;
6650 osdmap.crush->get_devices_by_class(name, &osds);
6651 if (f) {
6652 f->open_array_section("osds");
6653 for (auto &osd: osds)
6654 f->dump_int("osd", osd);
6655 f->close_section();
6656 f->flush(rdata);
6657 } else {
6658 bool first = true;
6659 for (auto &osd : osds) {
6660 if (!first)
6661 ds << "\n";
6662 first = false;
6663 ds << osd;
6664 }
6665 rdata.append(ds);
6666 }
6667 } else if (prefix == "osd crush get-device-class") {
6668 vector<string> idvec;
6669 cmd_getval(cmdmap, "ids", idvec);
6670 map<int, string> class_by_osd;
6671 for (auto& id : idvec) {
6672 ostringstream ts;
6673 long osd = parse_osd_id(id.c_str(), &ts);
6674 if (osd < 0) {
6675 ss << "unable to parse osd id:'" << id << "'";
6676 r = -EINVAL;
6677 goto reply;
6678 }
6679 auto device_class = osdmap.crush->get_item_class(osd);
6680 if (device_class)
6681 class_by_osd[osd] = device_class;
6682 else
6683 class_by_osd[osd] = ""; // no class
6684 }
6685 if (f) {
6686 f->open_array_section("osd_device_classes");
6687 for (auto& i : class_by_osd) {
6688 f->open_object_section("osd_device_class");
6689 f->dump_int("osd", i.first);
6690 f->dump_string("device_class", i.second);
6691 f->close_section();
6692 }
6693 f->close_section();
6694 f->flush(rdata);
6695 } else {
6696 if (class_by_osd.size() == 1) {
6697 // for single input, make a clean output
6698 ds << class_by_osd.begin()->second;
6699 } else {
6700 // note that we do not group osds by class here
6701 for (auto it = class_by_osd.begin();
6702 it != class_by_osd.end();
6703 it++) {
6704 ds << "osd." << it->first << ' ' << it->second;
6705 if (next(it) != class_by_osd.end())
6706 ds << '\n';
6707 }
6708 }
6709 rdata.append(ds);
6710 }
6711 } else if (prefix == "osd erasure-code-profile ls") {
6712 const auto &profiles = osdmap.get_erasure_code_profiles();
6713 if (f)
6714 f->open_array_section("erasure-code-profiles");
6715 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6716 if (f)
6717 f->dump_string("profile", i->first.c_str());
6718 else
6719 rdata.append(i->first + "\n");
6720 }
6721 if (f) {
6722 f->close_section();
6723 ostringstream rs;
6724 f->flush(rs);
6725 rs << "\n";
6726 rdata.append(rs.str());
6727 }
6728 } else if (prefix == "osd crush weight-set ls") {
6729 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6730 if (f) {
6731 f->open_array_section("weight_sets");
6732 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6733 f->dump_string("pool", "(compat)");
6734 }
6735 for (auto& i : osdmap.crush->choose_args) {
6736 if (i.first >= 0) {
6737 f->dump_string("pool", osdmap.get_pool_name(i.first));
6738 }
6739 }
6740 f->close_section();
6741 f->flush(rdata);
6742 } else {
6743 ostringstream rs;
6744 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6745 rs << "(compat)\n";
6746 }
6747 for (auto& i : osdmap.crush->choose_args) {
6748 if (i.first >= 0) {
6749 rs << osdmap.get_pool_name(i.first) << "\n";
6750 }
6751 }
6752 rdata.append(rs.str());
6753 }
6754 } else if (prefix == "osd crush weight-set dump") {
6755 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6756 "json-pretty"));
6757 osdmap.crush->dump_choose_args(f.get());
6758 f->flush(rdata);
6759 } else if (prefix == "osd erasure-code-profile get") {
6760 string name;
6761 cmd_getval(cmdmap, "name", name);
6762 if (!osdmap.has_erasure_code_profile(name)) {
6763 ss << "unknown erasure code profile '" << name << "'";
6764 r = -ENOENT;
6765 goto reply;
6766 }
6767 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6768 if (f)
6769 f->open_object_section("profile");
6770 for (map<string,string>::const_iterator i = profile.begin();
6771 i != profile.end();
6772 ++i) {
6773 if (f)
6774 f->dump_string(i->first.c_str(), i->second.c_str());
6775 else
6776 rdata.append(i->first + "=" + i->second + "\n");
6777 }
6778 if (f) {
6779 f->close_section();
6780 ostringstream rs;
6781 f->flush(rs);
6782 rs << "\n";
6783 rdata.append(rs.str());
6784 }
6785 } else if (prefix == "osd pool application get") {
6786 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6787 "json-pretty"));
6788 string pool_name;
6789 cmd_getval(cmdmap, "pool", pool_name);
6790 string app;
6791 cmd_getval(cmdmap, "app", app);
6792 string key;
6793 cmd_getval(cmdmap, "key", key);
6794
6795 if (pool_name.empty()) {
6796 // all
6797 f->open_object_section("pools");
6798 for (const auto &pool : osdmap.pools) {
6799 std::string name("<unknown>");
6800 const auto &pni = osdmap.pool_name.find(pool.first);
6801 if (pni != osdmap.pool_name.end())
6802 name = pni->second;
6803 f->open_object_section(name.c_str());
6804 for (auto &app_pair : pool.second.application_metadata) {
6805 f->open_object_section(app_pair.first.c_str());
6806 for (auto &kv_pair : app_pair.second) {
6807 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6808 }
6809 f->close_section();
6810 }
6811 f->close_section(); // name
6812 }
6813 f->close_section(); // pools
6814 f->flush(rdata);
6815 } else {
6816 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6817 if (pool < 0) {
6818 ss << "unrecognized pool '" << pool_name << "'";
6819 r = -ENOENT;
6820 goto reply;
6821 }
6822 auto p = osdmap.get_pg_pool(pool);
6823 // filter by pool
6824 if (app.empty()) {
6825 f->open_object_section(pool_name.c_str());
6826 for (auto &app_pair : p->application_metadata) {
6827 f->open_object_section(app_pair.first.c_str());
6828 for (auto &kv_pair : app_pair.second) {
6829 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6830 }
6831 f->close_section(); // application
6832 }
6833 f->close_section(); // pool_name
6834 f->flush(rdata);
6835 goto reply;
6836 }
6837
6838 auto app_it = p->application_metadata.find(app);
6839 if (app_it == p->application_metadata.end()) {
6840 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6841 r = -ENOENT;
6842 goto reply;
6843 }
6844 // filter by pool + app
6845 if (key.empty()) {
6846 f->open_object_section(app_it->first.c_str());
6847 for (auto &kv_pair : app_it->second) {
6848 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6849 }
6850 f->close_section(); // application
6851 f->flush(rdata);
6852 goto reply;
6853 }
6854 // filter by pool + app + key
6855 auto key_it = app_it->second.find(key);
6856 if (key_it == app_it->second.end()) {
6857 ss << "application '" << app << "' on pool '" << pool_name
6858 << "' does not have key '" << key << "'";
6859 r = -ENOENT;
6860 goto reply;
6861 }
6862 ss << key_it->second << "\n";
6863 rdata.append(ss.str());
6864 ss.str("");
6865 }
6866 } else if (prefix == "osd get-require-min-compat-client") {
6867 ss << osdmap.require_min_compat_client << std::endl;
6868 rdata.append(ss.str());
6869 ss.str("");
6870 goto reply;
6871 } else if (prefix == "osd pool application enable" ||
6872 prefix == "osd pool application disable" ||
6873 prefix == "osd pool application set" ||
6874 prefix == "osd pool application rm") {
6875 bool changed = false;
6876 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6877 if (r != 0) {
6878 // Error, reply.
6879 goto reply;
6880 } else if (changed) {
6881 // Valid mutation, proceed to prepare phase
6882 return false;
6883 } else {
6884 // Idempotent case, reply
6885 goto reply;
6886 }
6887 } else {
6888 // try prepare update
6889 return false;
6890 }
6891
6892 reply:
6893 string rs;
6894 getline(ss, rs);
6895 mon->reply_command(op, r, rs, rdata, get_last_committed());
6896 return true;
6897 }
6898
6899 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6900 {
6901 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6902 osdmap.get_pg_pool(pool_id));
6903 ceph_assert(pool);
6904 pool->set_flag(flags);
6905 }
6906
6907 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6908 {
6909 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6910 osdmap.get_pg_pool(pool_id));
6911 ceph_assert(pool);
6912 pool->unset_flag(flags);
6913 }
6914
6915 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6916 {
6917 char k[80];
6918 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6919 return k;
6920 }
6921
6922 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6923 {
6924 char k[80];
6925 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6926 (unsigned long long)pool, (unsigned long long)snap);
6927 return k;
6928 }
6929
// Encode the value blob for a purged-snap record — the interval
// [snap, snap+num) plus the epoch it was purged in — and return the key
// the record should be stored under.
//
// The key embeds the *last* snap of the interval (snap + num - 1), so a
// forward lower_bound() scan starting from any snap id lands on the first
// record whose interval could still contain it; see lookup_purged_snap().
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // value payload: interval begin, interval end (exclusive), purge epoch
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
6941
6942
// Look up the purged-snap interval containing `snap` for `pool`.
//
// Records are keyed by the *last* snap of their interval (see
// make_purged_snap_key_value()), so lower_bound() on the key for `snap`
// lands on the first record whose interval could contain it.
//
// On success returns 0 and sets *begin/*end to the stored interval
// [begin, end).  Returns -ENOENT when no stored interval contains `snap`.
// Note that on the "no overlap" failure path *begin/*end have already
// been written with the non-matching record's values.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // no key at or after ours; nothing stored for this snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on some other kind of record under OSD_SNAP_PREFIX
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we actually landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // scanned past this pool's records into another pool's
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // decode the stored [begin, end) interval and check it covers snap
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
6992
// Record that snaps [start, end) of `pool` have been purged, coalescing
// the new interval with any adjacent stored intervals so the purged-snap
// records stay maximal and non-overlapping.
//
// `b`/`a` probe for stored intervals abutting the new one on the left
// (containing start-1) and on the right (containing end); the four
// branches below handle join-both, join-left, join-right, and standalone.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges the gap between the earlier and later intervals
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one
    // (the merged record is keyed by after_end - 1, same as the old
    // "after" record, so the put below replaces it in place).
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends the earlier interval rightwards; its key changes (keys
    // embed the last snap), so erase the old record and write anew
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends the later interval leftwards; the key (last snap) is
    // unchanged, so a plain put overwrites the existing record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no adjacent intervals; write a standalone record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7048
// Move snaps that the OSDs report as fully purged (via the mgr's stats
// digest) out of removed_snaps_queue by queuing them in
// pending_inc.new_purged_snaps, pruning at most
// mon_max_snap_prune_per_epoch snaps per epoch.
//
// Returns true if anything was queued for pruning this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon->mgrstatmon()->is_readable()) {
    // need the mgr digest to know what the OSDs have purged
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	// per-epoch budget reached for this pool's candidate set
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      // global per-epoch budget reached; stop iterating pools
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7127
// Compare each pool's usage (from the mgr stats digest) against its
// quota_max_bytes / quota_max_objects and toggle the FLAG_FULL_QUOTA /
// FLAG_FULL pool flags in the pending map accordingly.
//
// Returns true if any pool's flags changed (i.e. a new map should be
// proposed).
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "no quota"; only a configured quota can fill
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota: clear the flags if usage dropped
      if (pool_is_full)
	continue;

      mon->clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently marked: set the flags if a quota is now exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon->clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7188
7189 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7190 {
7191 op->mark_osdmon_event(__func__);
7192 auto m = op->get_req<MPoolOp>();
7193 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7194 MonSession *session = op->get_session();
7195 if (!session)
7196 return -EPERM;
7197 string erasure_code_profile;
7198 stringstream ss;
7199 string rule_name;
7200 int ret = 0;
7201 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7202 0, 0, 0, 0, 0, 0.0,
7203 erasure_code_profile,
7204 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7205 &ss);
7206
7207 if (ret < 0) {
7208 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7209 }
7210 return ret;
7211 }
7212
7213 int OSDMonitor::crush_rename_bucket(const string& srcname,
7214 const string& dstname,
7215 ostream *ss)
7216 {
7217 int ret;
7218 //
7219 // Avoid creating a pending crush if it does not already exists and
7220 // the rename would fail.
7221 //
7222 if (!_have_pending_crush()) {
7223 ret = _get_stable_crush().can_rename_bucket(srcname,
7224 dstname,
7225 ss);
7226 if (ret)
7227 return ret;
7228 }
7229
7230 CrushWrapper newcrush;
7231 _get_pending_crush(newcrush);
7232
7233 ret = newcrush.rename_bucket(srcname,
7234 dstname,
7235 ss);
7236 if (ret)
7237 return ret;
7238
7239 pending_inc.crush.clear();
7240 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7241 *ss << "renamed bucket " << srcname << " into " << dstname;
7242 return 0;
7243 }
7244
7245 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7246 {
7247 string replacement = "";
7248
7249 if (plugin == "jerasure_generic" ||
7250 plugin == "jerasure_sse3" ||
7251 plugin == "jerasure_sse4" ||
7252 plugin == "jerasure_neon") {
7253 replacement = "jerasure";
7254 } else if (plugin == "shec_generic" ||
7255 plugin == "shec_sse3" ||
7256 plugin == "shec_sse4" ||
7257 plugin == "shec_neon") {
7258 replacement = "shec";
7259 }
7260
7261 if (replacement != "") {
7262 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7263 << plugin << " that has been deprecated. Please use "
7264 << replacement << " instead." << dendl;
7265 }
7266 }
7267
7268 int OSDMonitor::normalize_profile(const string& profilename,
7269 ErasureCodeProfile &profile,
7270 bool force,
7271 ostream *ss)
7272 {
7273 ErasureCodeInterfaceRef erasure_code;
7274 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7275 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7276 check_legacy_ec_plugin(plugin->second, profilename);
7277 int err = instance.factory(plugin->second,
7278 g_conf().get_val<std::string>("erasure_code_dir"),
7279 profile, &erasure_code, ss);
7280 if (err) {
7281 return err;
7282 }
7283
7284 err = erasure_code->init(profile, ss);
7285 if (err) {
7286 return err;
7287 }
7288
7289 auto it = profile.find("stripe_unit");
7290 if (it != profile.end()) {
7291 string err_str;
7292 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7293 if (!err_str.empty()) {
7294 *ss << "could not parse stripe_unit '" << it->second
7295 << "': " << err_str << std::endl;
7296 return -EINVAL;
7297 }
7298 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7299 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7300 if (chunk_size != stripe_unit) {
7301 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7302 << "alignment. Would be padded to " << chunk_size
7303 << std::endl;
7304 return -EINVAL;
7305 }
7306 if ((stripe_unit % 4096) != 0 && !force) {
7307 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7308 << "use --force to override this check" << std::endl;
7309 return -EINVAL;
7310 }
7311 }
7312 return 0;
7313 }
7314
7315 int OSDMonitor::crush_rule_create_erasure(const string &name,
7316 const string &profile,
7317 int *rule,
7318 ostream *ss)
7319 {
7320 int ruleid = osdmap.crush->get_rule_id(name);
7321 if (ruleid != -ENOENT) {
7322 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7323 return -EEXIST;
7324 }
7325
7326 CrushWrapper newcrush;
7327 _get_pending_crush(newcrush);
7328
7329 ruleid = newcrush.get_rule_id(name);
7330 if (ruleid != -ENOENT) {
7331 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7332 return -EALREADY;
7333 } else {
7334 ErasureCodeInterfaceRef erasure_code;
7335 int err = get_erasure_code(profile, &erasure_code, ss);
7336 if (err) {
7337 *ss << "failed to load plugin using profile " << profile << std::endl;
7338 return err;
7339 }
7340
7341 err = erasure_code->create_rule(name, newcrush, ss);
7342 erasure_code.reset();
7343 if (err < 0)
7344 return err;
7345 *rule = err;
7346 pending_inc.crush.clear();
7347 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7348 return 0;
7349 }
7350 }
7351
7352 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7353 ErasureCodeInterfaceRef *erasure_code,
7354 ostream *ss) const
7355 {
7356 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7357 return -EAGAIN;
7358 ErasureCodeProfile profile =
7359 osdmap.get_erasure_code_profile(erasure_code_profile);
7360 ErasureCodeProfile::const_iterator plugin =
7361 profile.find("plugin");
7362 if (plugin == profile.end()) {
7363 *ss << "cannot determine the erasure code plugin"
7364 << " because there is no 'plugin' entry in the erasure_code_profile "
7365 << profile << std::endl;
7366 return -EINVAL;
7367 }
7368 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7369 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7370 return instance.factory(plugin->second,
7371 g_conf().get_val<std::string>("erasure_code_dir"),
7372 profile, erasure_code, ss);
7373 }
7374
7375 int OSDMonitor::check_cluster_features(uint64_t features,
7376 stringstream &ss)
7377 {
7378 stringstream unsupported_ss;
7379 int unsupported_count = 0;
7380 if ((mon->get_quorum_con_features() & features) != features) {
7381 unsupported_ss << "the monitor cluster";
7382 ++unsupported_count;
7383 }
7384
7385 set<int32_t> up_osds;
7386 osdmap.get_up_osds(up_osds);
7387 for (set<int32_t>::iterator it = up_osds.begin();
7388 it != up_osds.end(); ++it) {
7389 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7390 if ((xi.features & features) != features) {
7391 if (unsupported_count > 0)
7392 unsupported_ss << ", ";
7393 unsupported_ss << "osd." << *it;
7394 unsupported_count ++;
7395 }
7396 }
7397
7398 if (unsupported_count > 0) {
7399 ss << "features " << features << " unsupported by: "
7400 << unsupported_ss.str();
7401 return -ENOTSUP;
7402 }
7403
7404 // check pending osd state, too!
7405 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7406 pending_inc.new_xinfo.begin();
7407 p != pending_inc.new_xinfo.end(); ++p) {
7408 const osd_xinfo_t &xi = p->second;
7409 if ((xi.features & features) != features) {
7410 dout(10) << __func__ << " pending osd." << p->first
7411 << " features are insufficient; retry" << dendl;
7412 return -EAGAIN;
7413 }
7414 }
7415
7416 return 0;
7417 }
7418
7419 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7420 stringstream& ss)
7421 {
7422 OSDMap::Incremental new_pending = pending_inc;
7423 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7424 OSDMap newmap;
7425 newmap.deepish_copy_from(osdmap);
7426 newmap.apply_incremental(new_pending);
7427
7428 // client compat
7429 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7430 auto mv = newmap.get_min_compat_client();
7431 if (mv > newmap.require_min_compat_client) {
7432 ss << "new crush map requires client version " << mv
7433 << " but require_min_compat_client is "
7434 << newmap.require_min_compat_client;
7435 return false;
7436 }
7437 }
7438
7439 // osd compat
7440 uint64_t features =
7441 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7442 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7443 stringstream features_ss;
7444 int r = check_cluster_features(features, features_ss);
7445 if (r) {
7446 ss << "Could not change CRUSH: " << features_ss.str();
7447 return false;
7448 }
7449
7450 return true;
7451 }
7452
7453 bool OSDMonitor::erasure_code_profile_in_use(
7454 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7455 const string &profile,
7456 ostream *ss)
7457 {
7458 bool found = false;
7459 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7460 p != pools.end();
7461 ++p) {
7462 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7463 *ss << osdmap.pool_name[p->first] << " ";
7464 found = true;
7465 }
7466 }
7467 if (found) {
7468 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7469 }
7470 return found;
7471 }
7472
7473 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7474 map<string,string> *erasure_code_profile_map,
7475 ostream *ss)
7476 {
7477 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7478 get_json_str_map,
7479 *ss,
7480 erasure_code_profile_map,
7481 true);
7482 if (r)
7483 return r;
7484 ceph_assert((*erasure_code_profile_map).count("plugin"));
7485 string default_plugin = (*erasure_code_profile_map)["plugin"];
7486 map<string,string> user_map;
7487 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7488 i != erasure_code_profile.end();
7489 ++i) {
7490 size_t equal = i->find('=');
7491 if (equal == string::npos) {
7492 user_map[*i] = string();
7493 (*erasure_code_profile_map)[*i] = string();
7494 } else {
7495 const string key = i->substr(0, equal);
7496 equal++;
7497 const string value = i->substr(equal);
7498 if (key.find("ruleset-") == 0) {
7499 *ss << "property '" << key << "' is no longer supported; try "
7500 << "'crush-" << key.substr(8) << "' instead";
7501 return -EINVAL;
7502 }
7503 user_map[key] = value;
7504 (*erasure_code_profile_map)[key] = value;
7505 }
7506 }
7507
7508 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7509 (*erasure_code_profile_map) = user_map;
7510
7511 return 0;
7512 }
7513
7514 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7515 const string &erasure_code_profile,
7516 uint8_t repl_size,
7517 unsigned *size, unsigned *min_size,
7518 ostream *ss)
7519 {
7520 int err = 0;
7521 switch (pool_type) {
7522 case pg_pool_t::TYPE_REPLICATED:
7523 if (repl_size == 0) {
7524 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7525 }
7526 *size = repl_size;
7527 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7528 break;
7529 case pg_pool_t::TYPE_ERASURE:
7530 {
7531 ErasureCodeInterfaceRef erasure_code;
7532 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7533 if (err == 0) {
7534 *size = erasure_code->get_chunk_count();
7535 *min_size =
7536 erasure_code->get_data_chunk_count() +
7537 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7538 assert(*min_size <= *size);
7539 assert(*min_size >= erasure_code->get_data_chunk_count());
7540 }
7541 }
7542 break;
7543 default:
7544 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7545 err = -EINVAL;
7546 break;
7547 }
7548 return err;
7549 }
7550
7551 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7552 const string &erasure_code_profile,
7553 uint32_t *stripe_width,
7554 ostream *ss)
7555 {
7556 int err = 0;
7557 switch (pool_type) {
7558 case pg_pool_t::TYPE_REPLICATED:
7559 // ignored
7560 break;
7561 case pg_pool_t::TYPE_ERASURE:
7562 {
7563 ErasureCodeProfile profile =
7564 osdmap.get_erasure_code_profile(erasure_code_profile);
7565 ErasureCodeInterfaceRef erasure_code;
7566 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7567 if (err)
7568 break;
7569 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7570 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7571 auto it = profile.find("stripe_unit");
7572 if (it != profile.end()) {
7573 string err_str;
7574 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7575 ceph_assert(err_str.empty());
7576 }
7577 *stripe_width = data_chunks *
7578 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7579 }
7580 break;
7581 default:
7582 *ss << "prepare_pool_stripe_width: "
7583 << pool_type << " is not a known pool type";
7584 err = -EINVAL;
7585 break;
7586 }
7587 return err;
7588 }
7589
// Resolve the crush rule for a new pool.
//
// If *crush_rule >= 0 the caller specified an explicit rule id; it only
// needs to exist.  Otherwise the rule is chosen by pool type:
//  - replicated: the configured default rule, or the named rule;
//  - erasure: find-or-create a rule from the EC profile, returning
//    -EAGAIN until the newly created rule has been proposed/committed.
//
// @return 0 on success; -EAGAIN when the caller must retry after the
//         next proposal; other negative errno on error.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  // staged in the pending map but not committed yet
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // already committed; usable immediately
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just validate that it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7651
7652 int OSDMonitor::get_crush_rule(const string &rule_name,
7653 int *crush_rule,
7654 ostream *ss)
7655 {
7656 int ret;
7657 ret = osdmap.crush->get_rule_id(rule_name);
7658 if (ret != -ENOENT) {
7659 // found it, use it
7660 *crush_rule = ret;
7661 } else {
7662 CrushWrapper newcrush;
7663 _get_pending_crush(newcrush);
7664
7665 ret = newcrush.get_rule_id(rule_name);
7666 if (ret != -ENOENT) {
7667 // found it, wait for it to be proposed
7668 dout(20) << __func__ << ": rule " << rule_name
7669 << " try again" << dendl;
7670 return -EAGAIN;
7671 } else {
7672 // Cannot find it , return error
7673 *ss << "specified rule " << rule_name << " doesn't exist";
7674 return ret;
7675 }
7676 }
7677 return 0;
7678 }
7679
7680 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7681 {
7682 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7683 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7684 auto max_pgs = max_pgs_per_osd * num_osds;
7685 uint64_t projected = 0;
7686 if (pool < 0) {
7687 projected += pg_num * size;
7688 }
7689 for (const auto& i : osdmap.get_pools()) {
7690 if (i.first == pool) {
7691 projected += pg_num * size;
7692 } else {
7693 projected += i.second.get_pg_num_target() * i.second.get_size();
7694 }
7695 }
7696 if (projected > max_pgs) {
7697 if (pool >= 0) {
7698 *ss << "pool id " << pool;
7699 }
7700 *ss << " pg_num " << pg_num << " size " << size
7701 << " would mean " << projected
7702 << " total pgs, which exceeds max " << max_pgs
7703 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7704 << " * num_in_osds " << num_osds << ")";
7705 return -ERANGE;
7706 }
7707 return 0;
7708 }
7709
7710 /**
7711 * @param name The name of the new pool
7712 * @param crush_rule The crush rule to use. If <0, will use the system default
7713 * @param crush_rule_name The crush rule to use, if crush_rulset <0
7714 * @param pg_num The pg_num to use. If set to 0, will use the system default
7715 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7716 * @param repl_size Replication factor, or 0 for default
7717 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7718 * @param pool_type TYPE_ERASURE, or TYPE_REP
7719 * @param expected_num_objects expected number of objects on the pool
7720 * @param fast_read fast read type.
7721 * @param ss human readable error message, if any.
7722 *
7723 * @return 0 on success, negative errno on failure.
7724 */
7725 int OSDMonitor::prepare_new_pool(string& name,
7726 int crush_rule,
7727 const string &crush_rule_name,
7728 unsigned pg_num, unsigned pgp_num,
7729 unsigned pg_num_min,
7730 const uint64_t repl_size,
7731 const uint64_t target_size_bytes,
7732 const float target_size_ratio,
7733 const string &erasure_code_profile,
7734 const unsigned pool_type,
7735 const uint64_t expected_num_objects,
7736 FastReadType fast_read,
7737 const string& pg_autoscale_mode,
7738 ostream *ss)
7739 {
7740 if (name.length() == 0)
7741 return -EINVAL;
7742 if (pg_num == 0)
7743 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
7744 if (pgp_num == 0)
7745 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
7746 if (!pgp_num)
7747 pgp_num = pg_num;
7748 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
7749 *ss << "'pg_num' must be greater than 0 and less than or equal to "
7750 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
7751 << " (you may adjust 'mon max pool pg num' for higher values)";
7752 return -ERANGE;
7753 }
7754 if (pgp_num > pg_num) {
7755 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7756 << ", which in this case is " << pg_num;
7757 return -ERANGE;
7758 }
7759 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
7760 *ss << "'fast_read' can only apply to erasure coding pool";
7761 return -EINVAL;
7762 }
7763 int r;
7764 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
7765 crush_rule_name, &crush_rule, ss);
7766 if (r) {
7767 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
7768 return r;
7769 }
7770 if (g_conf()->mon_osd_crush_smoke_test) {
7771 CrushWrapper newcrush;
7772 _get_pending_crush(newcrush);
7773 ostringstream err;
7774 CrushTester tester(newcrush, err);
7775 tester.set_min_x(0);
7776 tester.set_max_x(50);
7777 tester.set_rule(crush_rule);
7778 auto start = ceph::coarse_mono_clock::now();
7779 r = tester.test_with_fork(g_conf()->mon_lease);
7780 auto duration = ceph::coarse_mono_clock::now() - start;
7781 if (r < 0) {
7782 dout(10) << "tester.test_with_fork returns " << r
7783 << ": " << err.str() << dendl;
7784 *ss << "crush test failed with " << r << ": " << err.str();
7785 return r;
7786 }
7787 dout(10) << __func__ << " crush smoke test duration: "
7788 << duration << dendl;
7789 }
7790 unsigned size, min_size;
7791 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
7792 &size, &min_size, ss);
7793 if (r) {
7794 dout(10) << "prepare_pool_size returns " << r << dendl;
7795 return r;
7796 }
7797 r = check_pg_num(-1, pg_num, size, ss);
7798 if (r) {
7799 dout(10) << "check_pg_num returns " << r << dendl;
7800 return r;
7801 }
7802
7803 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
7804 return -EINVAL;
7805 }
7806
7807 uint32_t stripe_width = 0;
7808 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
7809 if (r) {
7810 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
7811 return r;
7812 }
7813
7814 bool fread = false;
7815 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7816 switch (fast_read) {
7817 case FAST_READ_OFF:
7818 fread = false;
7819 break;
7820 case FAST_READ_ON:
7821 fread = true;
7822 break;
7823 case FAST_READ_DEFAULT:
7824 fread = g_conf()->osd_pool_default_ec_fast_read;
7825 break;
7826 default:
7827 *ss << "invalid fast_read setting: " << fast_read;
7828 return -EINVAL;
7829 }
7830 }
7831
7832 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
7833 p != pending_inc.new_pool_names.end();
7834 ++p) {
7835 if (p->second == name)
7836 return 0;
7837 }
7838
7839 if (-1 == pending_inc.new_pool_max)
7840 pending_inc.new_pool_max = osdmap.pool_max;
7841 int64_t pool = ++pending_inc.new_pool_max;
7842 pg_pool_t empty;
7843 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
7844 pi->create_time = ceph_clock_now();
7845 pi->type = pool_type;
7846 pi->fast_read = fread;
7847 pi->flags = g_conf()->osd_pool_default_flags;
7848 if (g_conf()->osd_pool_default_flag_hashpspool)
7849 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
7850 if (g_conf()->osd_pool_default_flag_nodelete)
7851 pi->set_flag(pg_pool_t::FLAG_NODELETE);
7852 if (g_conf()->osd_pool_default_flag_nopgchange)
7853 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
7854 if (g_conf()->osd_pool_default_flag_nosizechange)
7855 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7856 pi->set_flag(pg_pool_t::FLAG_CREATING);
7857 if (g_conf()->osd_pool_use_gmt_hitset)
7858 pi->use_gmt_hitset = true;
7859 else
7860 pi->use_gmt_hitset = false;
7861
7862 pi->size = size;
7863 pi->min_size = min_size;
7864 pi->crush_rule = crush_rule;
7865 pi->expected_num_objects = expected_num_objects;
7866 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7867
7868 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7869 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7870 m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7871 pi->pg_autoscale_mode = m;
7872 } else {
7873 pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
7874 }
7875 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7876 pi->set_pg_num(
7877 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7878 : pg_num);
7879 pi->set_pg_num_pending(pi->get_pg_num());
7880 pi->set_pg_num_target(pg_num);
7881 pi->set_pgp_num(pi->get_pg_num());
7882 pi->set_pgp_num_target(pgp_num);
7883 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7884 pg_num_min) {
7885 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7886 }
7887 if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7888 pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
7889 pi->pg_autoscale_mode = m;
7890 }
7891
7892 pi->last_change = pending_inc.epoch;
7893 pi->auid = 0;
7894
7895 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7896 pi->erasure_code_profile = erasure_code_profile;
7897 } else {
7898 pi->erasure_code_profile = "";
7899 }
7900 pi->stripe_width = stripe_width;
7901
7902 if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
7903 target_size_bytes) {
7904 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7905 // larger than int32_t max.
7906 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7907 }
7908 if (target_size_ratio > 0.0 &&
7909 osdmap.require_osd_release >= ceph_release_t::nautilus) {
7910 // only store for nautilus+, just to be consistent and tidy.
7911 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7912 }
7913
7914 pi->cache_target_dirty_ratio_micro =
7915 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7916 pi->cache_target_dirty_high_ratio_micro =
7917 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7918 pi->cache_target_full_ratio_micro =
7919 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7920 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7921 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7922
7923 pending_inc.new_pool_names[pool] = name;
7924 return 0;
7925 }
7926
7927 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7928 {
7929 op->mark_osdmon_event(__func__);
7930 ostringstream ss;
7931 if (pending_inc.new_flags < 0)
7932 pending_inc.new_flags = osdmap.get_flags();
7933 pending_inc.new_flags |= flag;
7934 ss << OSDMap::get_flag_string(flag) << " is set";
7935 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7936 get_last_committed() + 1));
7937 return true;
7938 }
7939
7940 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7941 {
7942 op->mark_osdmon_event(__func__);
7943 ostringstream ss;
7944 if (pending_inc.new_flags < 0)
7945 pending_inc.new_flags = osdmap.get_flags();
7946 pending_inc.new_flags &= ~flag;
7947 ss << OSDMap::get_flag_string(flag) << " is unset";
7948 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7949 get_last_committed() + 1));
7950 return true;
7951 }
7952
/**
 * Handle "ceph osd pool set <pool> <var> <val>".
 *
 * Validates the requested property change against the current (plus any
 * already-pending) pool state, applies it to a working copy of the pool,
 * and stages the result in pending_inc.new_pools.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", and for
 *        some variables "yes_i_really_mean_it")
 * @param ss human-readable result / error message for the client
 * @return 0 on success, negative errno on validation failure
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // Work on a copy of the pool; fold in any change already staged in the
  // current pending increment so consecutive commands in one proposal
  // window compose instead of clobbering each other.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cmdmap, "val", val);

  // Variables that accept SI suffixes (k/M/G...) vs IEC binary suffixes
  // (Ki/Mi/Gi...); everything else is parsed as both int and float below.
  auto si_options = {
    "target_max_objects"
  };
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // Cache-tiering knobs are only meaningful on a cache tier pool.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    // Replica count; guarded by the nosizechange flag and the crush rule.
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    // Growing size multiplies PG instances; check the cluster PG budget.
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools min_size must cover at least the k data chunks, so
      // look up k from the pool's erasure code profile.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the actual pg_num (normally driven by the mgr).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // Split (increase).
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Merge (decrease): staged via pg_num_pending, nautilus+ only, and
      // only one PG at a time.
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    // User-facing pg_num: on nautilus+ this only moves the *target*; the
    // mgr walks pg_num_actual/pgp_num toward it gradually.
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
	g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    // Directly adjust the actual pgp_num (normally driven by the mgr).
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    // User-facing pgp_num: moves the target on nautilus+, the actual
    // value on older clusters.
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // Simple boolean pool flags.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG in the pool, so require
    // explicit confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // One-way switch: GMT hitsets can be enabled but not disabled.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // Also a one-way switch.
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    // Ratios are stored in micro units (uf = round(f * 1e6)).
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic per-pool options; validate the value here, then store/erase
    // it via the typed switch below.  "unset" removes the option.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
	ss << "error parsing unit value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "must set require_osd_release to nautilus or "
	   << "later before setting target_size_bytes";
	return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Store (or unset) the option according to its declared type; a zero
    // numeric value is treated as "unset".
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  // Success: report, bump the pool's change epoch, and stage the new pool.
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8598
// Prepare-phase entry point for "osd pool application ..." commands:
// delegates to the common implementation with preparing=true so the
// pending osdmap is actually updated; modification tracking is unused.
int OSDMonitor::prepare_command_pool_application(const string &prefix,
                                                 const cmdmap_t& cmdmap,
                                                 stringstream& ss)
{
  return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
}
8605
// Preprocess-phase entry point for "osd pool application ..." commands:
// delegates with preparing=false, so the command is only validated and
// *modified reports whether it would change the pool.
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
                                                    const cmdmap_t& cmdmap,
                                                    stringstream& ss,
                                                    bool *modified)
{
  return _command_pool_application(prefix, cmdmap, ss, modified, false);
}
8613
8614
8615 /**
8616 * Common logic for preprocess and prepare phases of pool application
8617 * tag commands. In preprocess mode we're only detecting invalid
8618 * commands, and determining whether it was a modification or a no-op.
8619 * In prepare mode we're actually updating the pending state.
8620 */
8621 int OSDMonitor::_command_pool_application(const string &prefix,
8622 const cmdmap_t& cmdmap,
8623 stringstream& ss,
8624 bool *modified,
8625 bool preparing)
8626 {
8627 string pool_name;
8628 cmd_getval(cmdmap, "pool", pool_name);
8629 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8630 if (pool < 0) {
8631 ss << "unrecognized pool '" << pool_name << "'";
8632 return -ENOENT;
8633 }
8634
8635 pg_pool_t p = *osdmap.get_pg_pool(pool);
8636 if (preparing) {
8637 if (pending_inc.new_pools.count(pool)) {
8638 p = pending_inc.new_pools[pool];
8639 }
8640 }
8641
8642 string app;
8643 cmd_getval(cmdmap, "app", app);
8644 bool app_exists = (p.application_metadata.count(app) > 0);
8645
8646 string key;
8647 cmd_getval(cmdmap, "key", key);
8648 if (key == "all") {
8649 ss << "key cannot be 'all'";
8650 return -EINVAL;
8651 }
8652
8653 string value;
8654 cmd_getval(cmdmap, "value", value);
8655 if (value == "all") {
8656 ss << "value cannot be 'all'";
8657 return -EINVAL;
8658 }
8659
8660 if (boost::algorithm::ends_with(prefix, "enable")) {
8661 if (app.empty()) {
8662 ss << "application name must be provided";
8663 return -EINVAL;
8664 }
8665
8666 if (p.is_tier()) {
8667 ss << "application must be enabled on base tier";
8668 return -EINVAL;
8669 }
8670
8671 bool force = false;
8672 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8673
8674 if (!app_exists && !p.application_metadata.empty() && !force) {
8675 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
8676 << "application; pass --yes-i-really-mean-it to proceed anyway";
8677 return -EPERM;
8678 }
8679
8680 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
8681 ss << "too many enabled applications on pool '" << pool_name << "'; "
8682 << "max " << MAX_POOL_APPLICATIONS;
8683 return -EINVAL;
8684 }
8685
8686 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
8687 ss << "application name '" << app << "' too long; max length "
8688 << MAX_POOL_APPLICATION_LENGTH;
8689 return -EINVAL;
8690 }
8691
8692 if (!app_exists) {
8693 p.application_metadata[app] = {};
8694 }
8695 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
8696
8697 } else if (boost::algorithm::ends_with(prefix, "disable")) {
8698 bool force = false;
8699 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8700
8701 if (!force) {
8702 ss << "Are you SURE? Disabling an application within a pool might result "
8703 << "in loss of application functionality; pass "
8704 << "--yes-i-really-mean-it to proceed anyway";
8705 return -EPERM;
8706 }
8707
8708 if (!app_exists) {
8709 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8710 << "'";
8711 return 0; // idempotent
8712 }
8713
8714 p.application_metadata.erase(app);
8715 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
8716
8717 } else if (boost::algorithm::ends_with(prefix, "set")) {
8718 if (p.is_tier()) {
8719 ss << "application metadata must be set on base tier";
8720 return -EINVAL;
8721 }
8722
8723 if (!app_exists) {
8724 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8725 << "'";
8726 return -ENOENT;
8727 }
8728
8729 string key;
8730 cmd_getval(cmdmap, "key", key);
8731
8732 if (key.empty()) {
8733 ss << "key must be provided";
8734 return -EINVAL;
8735 }
8736
8737 auto &app_keys = p.application_metadata[app];
8738 if (app_keys.count(key) == 0 &&
8739 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
8740 ss << "too many keys set for application '" << app << "' on pool '"
8741 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
8742 return -EINVAL;
8743 }
8744
8745 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
8746 ss << "key '" << app << "' too long; max length "
8747 << MAX_POOL_APPLICATION_LENGTH;
8748 return -EINVAL;
8749 }
8750
8751 string value;
8752 cmd_getval(cmdmap, "value", value);
8753 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
8754 ss << "value '" << value << "' too long; max length "
8755 << MAX_POOL_APPLICATION_LENGTH;
8756 return -EINVAL;
8757 }
8758
8759 p.application_metadata[app][key] = value;
8760 ss << "set application '" << app << "' key '" << key << "' to '"
8761 << value << "' on pool '" << pool_name << "'";
8762 } else if (boost::algorithm::ends_with(prefix, "rm")) {
8763 if (!app_exists) {
8764 ss << "application '" << app << "' is not enabled on pool '" << pool_name
8765 << "'";
8766 return -ENOENT;
8767 }
8768
8769 string key;
8770 cmd_getval(cmdmap, "key", key);
8771 auto it = p.application_metadata[app].find(key);
8772 if (it == p.application_metadata[app].end()) {
8773 ss << "application '" << app << "' on pool '" << pool_name
8774 << "' does not have key '" << key << "'";
8775 return 0; // idempotent
8776 }
8777
8778 p.application_metadata[app].erase(it);
8779 ss << "removed application '" << app << "' key '" << key << "' on pool '"
8780 << pool_name << "'";
8781 } else {
8782 ceph_abort();
8783 }
8784
8785 if (preparing) {
8786 p.last_change = pending_inc.epoch;
8787 pending_inc.new_pools[pool] = p;
8788 }
8789
8790 // Because we fell through this far, we didn't hit no-op cases,
8791 // so pool was definitely modified
8792 if (modified != nullptr) {
8793 *modified = true;
8794 }
8795
8796 return 0;
8797 }
8798
8799 int OSDMonitor::_prepare_command_osd_crush_remove(
8800 CrushWrapper &newcrush,
8801 int32_t id,
8802 int32_t ancestor,
8803 bool has_ancestor,
8804 bool unlink_only)
8805 {
8806 int err = 0;
8807
8808 if (has_ancestor) {
8809 err = newcrush.remove_item_under(cct, id, ancestor,
8810 unlink_only);
8811 } else {
8812 err = newcrush.remove_item(cct, id, unlink_only);
8813 }
8814 return err;
8815 }
8816
// Stage an already-modified crush map into the pending incremental:
// discard any previously staged crush blob, then encode the new map with
// the feature bits common to the current monitor quorum.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8822
8823 int OSDMonitor::prepare_command_osd_crush_remove(
8824 CrushWrapper &newcrush,
8825 int32_t id,
8826 int32_t ancestor,
8827 bool has_ancestor,
8828 bool unlink_only)
8829 {
8830 int err = _prepare_command_osd_crush_remove(
8831 newcrush, id, ancestor,
8832 has_ancestor, unlink_only);
8833
8834 if (err < 0)
8835 return err;
8836
8837 ceph_assert(err == 0);
8838 do_osd_crush_remove(newcrush);
8839
8840 return 0;
8841 }
8842
// Stage removal of osd.<id> from the osdmap.  The osd must already be
// down.  Returns 0 on success, or -EBUSY if the osd is still up.
int OSDMonitor::prepare_command_osd_remove(int32_t id)
{
  if (osdmap.is_up(id)) {
    return -EBUSY;
  }

  // new_state bits are applied as an XOR against the current state (see
  // the CEPH_OSD_UP handling in prepare_command_osd_new), so flipping
  // every currently-set bit clears the osd's state entirely — including
  // EXISTS — which is how an incremental expresses removal.
  pending_inc.new_state[id] = osdmap.get_state(id);
  // forget the uuid binding for this id
  pending_inc.new_uuid[id] = uuid_d();
  // drop the osd's stored metadata, both committed and pending
  pending_metadata_rm.insert(id);
  pending_metadata.erase(id);

  return 0;
}
8856
8857 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8858 {
8859 ceph_assert(existing_id);
8860 *existing_id = -1;
8861
8862 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8863 if (!osdmap.exists(i) &&
8864 pending_inc.new_up_client.count(i) == 0 &&
8865 (pending_inc.new_state.count(i) == 0 ||
8866 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8867 *existing_id = i;
8868 return -1;
8869 }
8870 }
8871
8872 if (pending_inc.new_max_osd < 0) {
8873 return osdmap.get_max_osd();
8874 }
8875 return pending_inc.new_max_osd;
8876 }
8877
// Stage creation of an osd in pending_inc.  `id` may be -1 (allocate one)
// or a caller-chosen id; `uuid` may be zero (legacy `osd create`).  The
// chosen id is returned via *new_id.  If `device_class` is non-empty the
// osd is also placed in that crush device class (best effort).
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; validation guaranteed it matches `id`
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd; _allocate_osd_id returns -1 then
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // brand-new id past the current max: raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // place the new osd in the requested crush device class
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id, both vs the committed map
  // and vs any pending raise
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8966
// Validate an `osd create`/`osd new` request against the committed and
// pending maps.  Return protocol:
//   0        - creation may proceed (*existing_id is -1 unless set below)
//   EEXIST   - (positive!) osd with this uuid already exists and matches
//              `id`; *existing_id is set — idempotent case, not an error
//   -EAGAIN  - an osd with this uuid/id is pending; caller should retry
//   -EEXIST  - uuid already bound to a *different* id
//   -EINVAL  - `id` in use and does not match `uuid`
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
	   << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0: the uuid is unknown to the committed map
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
	 << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9037
// Validate a legacy `osd create` request.  Refuses to resurrect a
// destroyed osd (that requires `osd new`); otherwise defers to
// validate_osd_create() with the exists-check enabled.  Shares that
// function's return protocol (0 / positive EEXIST / negative errno).
int OSDMonitor::prepare_command_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    int32_t* existing_id,
    stringstream& ss)
{
  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
  ceph_assert(existing_id);
  // is_destroyed() is false for id < 0, so an unspecified id passes here
  if (osdmap.is_destroyed(id)) {
    ss << "ceph osd create has been deprecated. Please use ceph osd new "
       "instead.";
    return -EINVAL;
  }

  if (uuid.is_zero()) {
    dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
  }

  return validate_osd_create(id, uuid, true, existing_id, ss);
}
9058
// Handle `ceph osd new`: create a brand-new osd, or recreate a destroyed
// one, optionally registering its cephx/lockbox secrets and dm-crypt key.
// Returns 0 on success (updates staged), positive EEXIST when the request
// is fully idempotent (nothing to do), or a negative errno on failure.
// Requires paxos to be plugged: we may touch osdmon, authmon and the
// config-key service, and the caller owns the single proposal.
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // when recreating a destroyed osd, skip the "id already exists" check
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // -1 from _allocate_osd_id means "reuse existing_id"
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory; lockbox secret and dm-crypt key are
    // optional but must be supplied together
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // osdmap says the osd exists, but auth does not know the matching
      // secrets: the request is not actually idempotent after all.
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
           (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // new_state is an XOR mask against the current state, so listing
    // DESTROYED here *clears* the destroyed flag
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      // (again: XOR semantics — setting the UP bit here flips it off)
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9323
// Entry point for the prepare (write) phase of osd commands.  Parses the
// command's JSON into a cmdmap, checks the requesting session, and hands
// off to prepare_command_impl().  Returns true when the op has been fully
// handled (replied to or queued for a proposal).
bool OSDMonitor::prepare_command(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonCommand>();
  stringstream ss;
  cmdmap_t cmdmap;
  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
    // malformed command json; reply with the parser's message
    string rs = ss.str();
    mon->reply_command(op, -EINVAL, rs, get_last_committed());
    return true;
  }

  MonSession *session = op->get_session();
  if (!session) {
    derr << __func__ << " no session" << dendl;
    mon->reply_command(op, -EACCES, "access denied", get_last_committed());
    return true;
  }

  return prepare_command_impl(op, cmdmap);
}
9345
9346 static int parse_reweights(CephContext *cct,
9347 const cmdmap_t& cmdmap,
9348 const OSDMap& osdmap,
9349 map<int32_t, uint32_t>* weights)
9350 {
9351 string weights_str;
9352 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9353 return -EINVAL;
9354 }
9355 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9356 json_spirit::mValue json_value;
9357 if (!json_spirit::read(weights_str, json_value)) {
9358 return -EINVAL;
9359 }
9360 if (json_value.type() != json_spirit::obj_type) {
9361 return -EINVAL;
9362 }
9363 const auto obj = json_value.get_obj();
9364 try {
9365 for (auto& osd_weight : obj) {
9366 auto osd_id = std::stoi(osd_weight.first);
9367 if (!osdmap.exists(osd_id)) {
9368 return -ENOENT;
9369 }
9370 if (osd_weight.second.type() != json_spirit::str_type) {
9371 return -EINVAL;
9372 }
9373 auto weight = std::stoul(osd_weight.second.get_str());
9374 weights->insert({osd_id, weight});
9375 }
9376 } catch (const std::logic_error& e) {
9377 return -EINVAL;
9378 }
9379 return 0;
9380 }
9381
// Handle `ceph osd destroy`: mark the osd DESTROYED, wipe its uuid, and
// remove its cephx/lockbox credentials and config-key entries.  Returns
// 0 on success (including the already-destroyed idempotent case) or
// -ENOENT if the osd does not exist.  Requires paxos to be plugged; the
// caller is responsible for proposing (see the note at the bottom).
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from auth means the credentials are already gone — treat as
  // idempotent rather than an error
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same idempotency treatment for the config-key (lockbox) entries
  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the DESTROYED flag and clear the uuid binding
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9453
// Handle `ceph osd purge`: destroy the osd (credentials, config keys,
// DESTROYED flag), remove it from the osdmap, and remove it from crush.
// Returns 0 on success, -ENOENT if everything was already purged, or a
// negative errno on failure.  Requires paxos to be plugged and the osd
// to be down; the caller proposes the resulting pending changes.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: validate the crush removal, but don't stage it yet
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // already gone from crush — possibly a replay
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy (auth/config-key/DESTROYED flag)
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy actually did work, so the purge is not idempotent
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    // nothing left to purge anywhere — report via -ENOENT
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the osd from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: stage the crush removal validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9522
9523 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9524 const cmdmap_t& cmdmap)
9525 {
9526 op->mark_osdmon_event(__func__);
9527 auto m = op->get_req<MMonCommand>();
9528 bool ret = false;
9529 stringstream ss;
9530 string rs;
9531 bufferlist rdata;
9532 int err = 0;
9533
9534 string format;
9535 cmd_getval(cmdmap, "format", format, string("plain"));
9536 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9537
9538 string prefix;
9539 cmd_getval(cmdmap, "prefix", prefix);
9540
9541 int64_t osdid;
9542 string osd_name;
9543 bool osdid_present = false;
9544 if (prefix != "osd pg-temp" &&
9545 prefix != "osd pg-upmap" &&
9546 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9547 osdid_present = cmd_getval(cmdmap, "id", osdid);
9548 }
9549 if (osdid_present) {
9550 ostringstream oss;
9551 oss << "osd." << osdid;
9552 osd_name = oss.str();
9553 }
9554
9555 // Even if there's a pending state with changes that could affect
9556 // a command, considering that said state isn't yet committed, we
9557 // just don't care about those changes if the command currently being
9558 // handled acts as a no-op against the current committed state.
9559 // In a nutshell, we assume this command happens *before*.
9560 //
9561 // Let me make this clearer:
9562 //
9563 // - If we have only one client, and that client issues some
9564 // operation that would conflict with this operation but is
9565 // still on the pending state, then we would be sure that said
9566 // operation wouldn't have returned yet, so the client wouldn't
9567 // issue this operation (unless the client didn't wait for the
9568 // operation to finish, and that would be the client's own fault).
9569 //
9570 // - If we have more than one client, each client will observe
9571 // whatever is the state at the moment of the commit. So, if we
9572 // have two clients, one issuing an unlink and another issuing a
9573 // link, and if the link happens while the unlink is still on the
9574 // pending state, from the link's point-of-view this is a no-op.
9575 // If different clients are issuing conflicting operations and
9576 // they care about that, then the clients should make sure they
9577 // enforce some kind of concurrency mechanism -- from our
9578 // perspective that's what Douglas Adams would call an SEP.
9579 //
9580 // This should be used as a general guideline for most commands handled
9581 // in this function. Adapt as you see fit, but please bear in mind that
9582 // this is the expected behavior.
9583
9584
9585 if (prefix == "osd setcrushmap" ||
9586 (prefix == "osd crush set" && !osdid_present)) {
9587 if (pending_inc.crush.length()) {
9588 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9589 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9590 return true;
9591 }
9592 dout(10) << "prepare_command setting new crush map" << dendl;
9593 bufferlist data(m->get_data());
9594 CrushWrapper crush;
9595 try {
9596 auto bl = data.cbegin();
9597 crush.decode(bl);
9598 }
9599 catch (const std::exception &e) {
9600 err = -EINVAL;
9601 ss << "Failed to parse crushmap: " << e.what();
9602 goto reply;
9603 }
9604
9605 int64_t prior_version = 0;
9606 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9607 if (prior_version == osdmap.get_crush_version() - 1) {
9608 // see if we are a resend of the last update. this is imperfect
9609 // (multiple racing updaters may not both get reliable success)
9610 // but we expect crush updaters (via this interface) to be rare-ish.
9611 bufferlist current, proposed;
9612 osdmap.crush->encode(current, mon->get_quorum_con_features());
9613 crush.encode(proposed, mon->get_quorum_con_features());
9614 if (current.contents_equal(proposed)) {
9615 dout(10) << __func__
9616 << " proposed matches current and version equals previous"
9617 << dendl;
9618 err = 0;
9619 ss << osdmap.get_crush_version();
9620 goto reply;
9621 }
9622 }
9623 if (prior_version != osdmap.get_crush_version()) {
9624 err = -EPERM;
9625 ss << "prior_version " << prior_version << " != crush version "
9626 << osdmap.get_crush_version();
9627 goto reply;
9628 }
9629 }
9630
9631 if (crush.has_legacy_rule_ids()) {
9632 err = -EINVAL;
9633 ss << "crush maps with ruleset != ruleid are no longer allowed";
9634 goto reply;
9635 }
9636 if (!validate_crush_against_features(&crush, ss)) {
9637 err = -EINVAL;
9638 goto reply;
9639 }
9640
9641 err = osdmap.validate_crush_rules(&crush, &ss);
9642 if (err < 0) {
9643 goto reply;
9644 }
9645
9646 if (g_conf()->mon_osd_crush_smoke_test) {
9647 // sanity check: test some inputs to make sure this map isn't
9648 // totally broken
9649 dout(10) << " testing map" << dendl;
9650 stringstream ess;
9651 CrushTester tester(crush, ess);
9652 tester.set_min_x(0);
9653 tester.set_max_x(50);
9654 auto start = ceph::coarse_mono_clock::now();
9655 int r = tester.test_with_fork(g_conf()->mon_lease);
9656 auto duration = ceph::coarse_mono_clock::now() - start;
9657 if (r < 0) {
9658 dout(10) << " tester.test_with_fork returns " << r
9659 << ": " << ess.str() << dendl;
9660 ss << "crush smoke test failed with " << r << ": " << ess.str();
9661 err = r;
9662 goto reply;
9663 }
9664 dout(10) << __func__ << " crush somke test duration: "
9665 << duration << ", result: " << ess.str() << dendl;
9666 }
9667
9668 pending_inc.crush = data;
9669 ss << osdmap.get_crush_version() + 1;
9670 goto update;
9671
9672 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9673 CrushWrapper newcrush;
9674 _get_pending_crush(newcrush);
9675 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9676 int bid = -1 - b;
9677 if (newcrush.bucket_exists(bid) &&
9678 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9679 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9680 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9681 }
9682 }
9683 if (!validate_crush_against_features(&newcrush, ss)) {
9684 err = -EINVAL;
9685 goto reply;
9686 }
9687 pending_inc.crush.clear();
9688 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9689 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9690 get_last_committed() + 1));
9691 return true;
9692 } else if (prefix == "osd crush set-device-class") {
9693 string device_class;
9694 if (!cmd_getval(cmdmap, "class", device_class)) {
9695 err = -EINVAL; // no value!
9696 goto reply;
9697 }
9698
9699 bool stop = false;
9700 vector<string> idvec;
9701 cmd_getval(cmdmap, "ids", idvec);
9702 CrushWrapper newcrush;
9703 _get_pending_crush(newcrush);
9704 set<int> updated;
9705 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9706 set<int> osds;
9707 // wildcard?
9708 if (j == 0 &&
9709 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9710 osdmap.get_all_osds(osds);
9711 stop = true;
9712 } else {
9713 // try traditional single osd way
9714 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9715 if (osd < 0) {
9716 // ss has reason for failure
9717 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9718 err = -EINVAL;
9719 continue;
9720 }
9721 osds.insert(osd);
9722 }
9723
9724 for (auto &osd : osds) {
9725 if (!osdmap.exists(osd)) {
9726 ss << "osd." << osd << " does not exist. ";
9727 continue;
9728 }
9729
9730 ostringstream oss;
9731 oss << "osd." << osd;
9732 string name = oss.str();
9733
9734 if (newcrush.get_max_devices() < osd + 1) {
9735 newcrush.set_max_devices(osd + 1);
9736 }
9737 string action;
9738 if (newcrush.item_exists(osd)) {
9739 action = "updating";
9740 } else {
9741 action = "creating";
9742 newcrush.set_item_name(osd, name);
9743 }
9744
9745 dout(5) << action << " crush item id " << osd << " name '" << name
9746 << "' device_class '" << device_class << "'"
9747 << dendl;
9748 err = newcrush.update_device_class(osd, device_class, name, &ss);
9749 if (err < 0) {
9750 goto reply;
9751 }
9752 if (err == 0 && !_have_pending_crush()) {
9753 if (!stop) {
9754 // for single osd only, wildcard makes too much noise
9755 ss << "set-device-class item id " << osd << " name '" << name
9756 << "' device_class '" << device_class << "': no change. ";
9757 }
9758 } else {
9759 updated.insert(osd);
9760 }
9761 }
9762 }
9763
9764 pending_inc.crush.clear();
9765 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9766 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9767 getline(ss, rs);
9768 wait_for_finished_proposal(
9769 op,
9770 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9771 return true;
9772 } else if (prefix == "osd crush rm-device-class") {
9773 bool stop = false;
9774 vector<string> idvec;
9775 cmd_getval(cmdmap, "ids", idvec);
9776 CrushWrapper newcrush;
9777 _get_pending_crush(newcrush);
9778 set<int> updated;
9779
9780 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9781 set<int> osds;
9782
9783 // wildcard?
9784 if (j == 0 &&
9785 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9786 osdmap.get_all_osds(osds);
9787 stop = true;
9788 } else {
9789 // try traditional single osd way
9790 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9791 if (osd < 0) {
9792 // ss has reason for failure
9793 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9794 err = -EINVAL;
9795 goto reply;
9796 }
9797 osds.insert(osd);
9798 }
9799
9800 for (auto &osd : osds) {
9801 if (!osdmap.exists(osd)) {
9802 ss << "osd." << osd << " does not exist. ";
9803 continue;
9804 }
9805
9806 auto class_name = newcrush.get_item_class(osd);
9807 if (!class_name) {
9808 ss << "osd." << osd << " belongs to no class, ";
9809 continue;
9810 }
9811 // note that we do not verify if class_is_in_use here
9812 // in case the device is misclassified and user wants
9813 // to overridely reset...
9814
9815 err = newcrush.remove_device_class(cct, osd, &ss);
9816 if (err < 0) {
9817 // ss has reason for failure
9818 goto reply;
9819 }
9820 updated.insert(osd);
9821 }
9822 }
9823
9824 pending_inc.crush.clear();
9825 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9826 ss << "done removing class of osd(s): " << updated;
9827 getline(ss, rs);
9828 wait_for_finished_proposal(
9829 op,
9830 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9831 return true;
9832 } else if (prefix == "osd crush class create") {
9833 string device_class;
9834 if (!cmd_getval(cmdmap, "class", device_class)) {
9835 err = -EINVAL; // no value!
9836 goto reply;
9837 }
9838 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9839 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9840 << "luminous' before using crush device classes";
9841 err = -EPERM;
9842 goto reply;
9843 }
9844 if (!_have_pending_crush() &&
9845 _get_stable_crush().class_exists(device_class)) {
9846 ss << "class '" << device_class << "' already exists";
9847 goto reply;
9848 }
9849 CrushWrapper newcrush;
9850 _get_pending_crush(newcrush);
9851 if (newcrush.class_exists(device_class)) {
9852 ss << "class '" << device_class << "' already exists";
9853 goto update;
9854 }
9855 int class_id = newcrush.get_or_create_class_id(device_class);
9856 pending_inc.crush.clear();
9857 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9858 ss << "created class " << device_class << " with id " << class_id
9859 << " to crush map";
9860 goto update;
9861 } else if (prefix == "osd crush class rm") {
9862 string device_class;
9863 if (!cmd_getval(cmdmap, "class", device_class)) {
9864 err = -EINVAL; // no value!
9865 goto reply;
9866 }
9867 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9868 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9869 << "luminous' before using crush device classes";
9870 err = -EPERM;
9871 goto reply;
9872 }
9873
9874 if (!osdmap.crush->class_exists(device_class)) {
9875 err = 0;
9876 goto reply;
9877 }
9878
9879 CrushWrapper newcrush;
9880 _get_pending_crush(newcrush);
9881 if (!newcrush.class_exists(device_class)) {
9882 err = 0; // make command idempotent
9883 goto wait;
9884 }
9885 int class_id = newcrush.get_class_id(device_class);
9886 stringstream ts;
9887 if (newcrush.class_is_in_use(class_id, &ts)) {
9888 err = -EBUSY;
9889 ss << "class '" << device_class << "' " << ts.str();
9890 goto reply;
9891 }
9892
9893 // check if class is used by any erasure-code-profiles
9894 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9895 osdmap.get_erasure_code_profiles();
9896 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9897 #ifdef HAVE_STDLIB_MAP_SPLICING
9898 ec_profiles.merge(old_ec_profiles);
9899 #else
9900 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9901 make_move_iterator(end(old_ec_profiles)));
9902 #endif
9903 list<string> referenced_by;
9904 for (auto &i: ec_profiles) {
9905 for (auto &j: i.second) {
9906 if ("crush-device-class" == j.first && device_class == j.second) {
9907 referenced_by.push_back(i.first);
9908 }
9909 }
9910 }
9911 if (!referenced_by.empty()) {
9912 err = -EBUSY;
9913 ss << "class '" << device_class
9914 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9915 goto reply;
9916 }
9917
9918 set<int> osds;
9919 newcrush.get_devices_by_class(device_class, &osds);
9920 for (auto& p: osds) {
9921 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9922 if (err < 0) {
9923 // ss has reason for failure
9924 goto reply;
9925 }
9926 }
9927
9928 if (osds.empty()) {
9929 // empty class, remove directly
9930 err = newcrush.remove_class_name(device_class);
9931 if (err < 0) {
9932 ss << "class '" << device_class << "' cannot be removed '"
9933 << cpp_strerror(err) << "'";
9934 goto reply;
9935 }
9936 }
9937
9938 pending_inc.crush.clear();
9939 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9940 ss << "removed class " << device_class << " with id " << class_id
9941 << " from crush map";
9942 goto update;
9943 } else if (prefix == "osd crush class rename") {
// 'osd crush class rename <srcname> <dstname>': rename a CRUSH device class.
9944 string srcname, dstname;
9945 if (!cmd_getval(cmdmap, "srcname", srcname)) {
9946 err = -EINVAL;
9947 goto reply;
9948 }
9949 if (!cmd_getval(cmdmap, "dstname", dstname)) {
9950 err = -EINVAL;
9951 goto reply;
9952 }
9953
9954 CrushWrapper newcrush;
9955 _get_pending_crush(newcrush);
// Source class gone but destination present: treat as a replayed command
// and report success so the command stays idempotent.
9956 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9957 // suppose this is a replay and return success
9958 // so command is idempotent
9959 ss << "already renamed to '" << dstname << "'";
9960 err = 0;
9961 goto reply;
9962 }
9963
9964 err = newcrush.rename_class(srcname, dstname);
9965 if (err < 0) {
9966 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9967 << cpp_strerror(err);
9968 goto reply;
9969 }
9970
// Stage the modified crush map in the pending incremental and jump to the
// shared 'update' path (proposes the change to the monitor quorum).
9971 pending_inc.crush.clear();
9972 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9973 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9974 goto update;
9975 } else if (prefix == "osd crush add-bucket") {
9976 // os crush add-bucket <name> <type>
9977 string name, typestr;
9978 vector<string> argvec;
9979 cmd_getval(cmdmap, "name", name);
9980 cmd_getval(cmdmap, "type", typestr);
9981 cmd_getval(cmdmap, "args", argvec);
9982 map<string,string> loc;
9983 if (!argvec.empty()) {
9984 CrushWrapper::parse_loc_map(argvec, &loc);
9985 dout(0) << "will create and move bucket '" << name
9986 << "' to location " << loc << dendl;
9987 }
9988
9989 if (!_have_pending_crush() &&
9990 _get_stable_crush().name_exists(name)) {
9991 ss << "bucket '" << name << "' already exists";
9992 goto reply;
9993 }
9994
9995 CrushWrapper newcrush;
9996 _get_pending_crush(newcrush);
9997
9998 if (newcrush.name_exists(name)) {
9999 ss << "bucket '" << name << "' already exists";
10000 goto update;
10001 }
10002 int type = newcrush.get_type_id(typestr);
10003 if (type < 0) {
10004 ss << "type '" << typestr << "' does not exist";
10005 err = -EINVAL;
10006 goto reply;
10007 }
10008 if (type == 0) {
10009 ss << "type '" << typestr << "' is for devices, not buckets";
10010 err = -EINVAL;
10011 goto reply;
10012 }
10013 int bucketno;
10014 err = newcrush.add_bucket(0, 0,
10015 CRUSH_HASH_DEFAULT, type, 0, NULL,
10016 NULL, &bucketno);
10017 if (err < 0) {
10018 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10019 goto reply;
10020 }
10021 err = newcrush.set_item_name(bucketno, name);
10022 if (err < 0) {
10023 ss << "error setting bucket name to '" << name << "'";
10024 goto reply;
10025 }
10026
10027 if (!loc.empty()) {
10028 if (!newcrush.check_item_loc(cct, bucketno, loc,
10029 (int *)NULL)) {
10030 err = newcrush.move_bucket(cct, bucketno, loc);
10031 if (err < 0) {
10032 ss << "error moving bucket '" << name << "' to location " << loc;
10033 goto reply;
10034 }
10035 } else {
10036 ss << "no need to move item id " << bucketno << " name '" << name
10037 << "' to location " << loc << " in crush map";
10038 }
10039 }
10040
10041 pending_inc.crush.clear();
10042 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10043 if (loc.empty()) {
10044 ss << "added bucket " << name << " type " << typestr
10045 << " to crush map";
10046 } else {
10047 ss << "added bucket " << name << " type " << typestr
10048 << " to location " << loc;
10049 }
10050 goto update;
10051 } else if (prefix == "osd crush rename-bucket") {
// 'osd crush rename-bucket <srcname> <dstname>': rename a CRUSH bucket.
// Delegates to the crush_rename_bucket helper, which writes any error
// text into ss.
10052 string srcname, dstname;
10053 cmd_getval(cmdmap, "srcname", srcname);
10054 cmd_getval(cmdmap, "dstname", dstname);
10055
10056 err = crush_rename_bucket(srcname, dstname, &ss);
10057 if (err == -EALREADY) // equivalent to success for idempotency
10058 err = 0;
10059 if (err)
10060 goto reply;
10061 else
10062 goto update;
10063 } else if (prefix == "osd crush weight-set create" ||
10064 prefix == "osd crush weight-set create-compat") {
10065 CrushWrapper newcrush;
10066 _get_pending_crush(newcrush);
10067 int64_t pool;
10068 int positions;
10069 if (newcrush.has_non_straw2_buckets()) {
10070 ss << "crush map contains one or more bucket(s) that are not straw2";
10071 err = -EPERM;
10072 goto reply;
10073 }
10074 if (prefix == "osd crush weight-set create") {
10075 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10076 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10077 ss << "require_min_compat_client "
10078 << osdmap.require_min_compat_client
10079 << " < luminous, which is required for per-pool weight-sets. "
10080 << "Try 'ceph osd set-require-min-compat-client luminous' "
10081 << "before using the new interface";
10082 err = -EPERM;
10083 goto reply;
10084 }
10085 string poolname, mode;
10086 cmd_getval(cmdmap, "pool", poolname);
10087 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10088 if (pool < 0) {
10089 ss << "pool '" << poolname << "' not found";
10090 err = -ENOENT;
10091 goto reply;
10092 }
10093 cmd_getval(cmdmap, "mode", mode);
10094 if (mode != "flat" && mode != "positional") {
10095 ss << "unrecognized weight-set mode '" << mode << "'";
10096 err = -EINVAL;
10097 goto reply;
10098 }
10099 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10100 } else {
10101 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10102 positions = 1;
10103 }
10104 if (!newcrush.create_choose_args(pool, positions)) {
10105 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10106 ss << "compat weight-set already created";
10107 } else {
10108 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10109 << "' already created";
10110 }
10111 goto reply;
10112 }
10113 pending_inc.crush.clear();
10114 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10115 goto update;
10116
10117 } else if (prefix == "osd crush weight-set rm" ||
10118 prefix == "osd crush weight-set rm-compat") {
10119 CrushWrapper newcrush;
10120 _get_pending_crush(newcrush);
10121 int64_t pool;
10122 if (prefix == "osd crush weight-set rm") {
10123 string poolname;
10124 cmd_getval(cmdmap, "pool", poolname);
10125 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10126 if (pool < 0) {
10127 ss << "pool '" << poolname << "' not found";
10128 err = -ENOENT;
10129 goto reply;
10130 }
10131 } else {
10132 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10133 }
10134 newcrush.rm_choose_args(pool);
10135 pending_inc.crush.clear();
10136 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10137 goto update;
10138
10139 } else if (prefix == "osd crush weight-set reweight" ||
10140 prefix == "osd crush weight-set reweight-compat") {
10141 string poolname, item;
10142 vector<double> weight;
10143 cmd_getval(cmdmap, "pool", poolname);
10144 cmd_getval(cmdmap, "item", item);
10145 cmd_getval(cmdmap, "weight", weight);
10146 CrushWrapper newcrush;
10147 _get_pending_crush(newcrush);
10148 int64_t pool;
10149 if (prefix == "osd crush weight-set reweight") {
10150 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10151 if (pool < 0) {
10152 ss << "pool '" << poolname << "' not found";
10153 err = -ENOENT;
10154 goto reply;
10155 }
10156 if (!newcrush.have_choose_args(pool)) {
10157 ss << "no weight-set for pool '" << poolname << "'";
10158 err = -ENOENT;
10159 goto reply;
10160 }
10161 auto arg_map = newcrush.choose_args_get(pool);
10162 int positions = newcrush.get_choose_args_positions(arg_map);
10163 if (weight.size() != (size_t)positions) {
10164 ss << "must specify exact " << positions << " weight values";
10165 err = -EINVAL;
10166 goto reply;
10167 }
10168 } else {
10169 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10170 if (!newcrush.have_choose_args(pool)) {
10171 ss << "no backward-compatible weight-set";
10172 err = -ENOENT;
10173 goto reply;
10174 }
10175 }
10176 if (!newcrush.name_exists(item)) {
10177 ss << "item '" << item << "' does not exist";
10178 err = -ENOENT;
10179 goto reply;
10180 }
10181 err = newcrush.choose_args_adjust_item_weightf(
10182 cct,
10183 newcrush.choose_args_get(pool),
10184 newcrush.get_item_id(item),
10185 weight,
10186 &ss);
10187 if (err < 0) {
10188 goto reply;
10189 }
10190 err = 0;
10191 pending_inc.crush.clear();
10192 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10193 goto update;
10194 } else if (osdid_present &&
10195 (prefix == "osd crush set" || prefix == "osd crush add")) {
10196 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10197 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10198 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10199
10200 if (!osdmap.exists(osdid)) {
10201 err = -ENOENT;
10202 ss << osd_name
10203 << " does not exist. Create it before updating the crush map";
10204 goto reply;
10205 }
10206
10207 double weight;
10208 if (!cmd_getval(cmdmap, "weight", weight)) {
10209 ss << "unable to parse weight value '"
10210 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10211 err = -EINVAL;
10212 goto reply;
10213 }
10214
10215 string args;
10216 vector<string> argvec;
10217 cmd_getval(cmdmap, "args", argvec);
10218 map<string,string> loc;
10219 CrushWrapper::parse_loc_map(argvec, &loc);
10220
10221 if (prefix == "osd crush set"
10222 && !_get_stable_crush().item_exists(osdid)) {
10223 err = -ENOENT;
10224 ss << "unable to set item id " << osdid << " name '" << osd_name
10225 << "' weight " << weight << " at location " << loc
10226 << ": does not exist";
10227 goto reply;
10228 }
10229
10230 dout(5) << "adding/updating crush item id " << osdid << " name '"
10231 << osd_name << "' weight " << weight << " at location "
10232 << loc << dendl;
10233 CrushWrapper newcrush;
10234 _get_pending_crush(newcrush);
10235
10236 string action;
10237 if (prefix == "osd crush set" ||
10238 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10239 action = "set";
10240 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10241 } else {
10242 action = "add";
10243 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10244 if (err == 0)
10245 err = 1;
10246 }
10247
10248 if (err < 0)
10249 goto reply;
10250
10251 if (err == 0 && !_have_pending_crush()) {
10252 ss << action << " item id " << osdid << " name '" << osd_name
10253 << "' weight " << weight << " at location " << loc << ": no change";
10254 goto reply;
10255 }
10256
10257 pending_inc.crush.clear();
10258 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10259 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10260 << weight << " at location " << loc << " to crush map";
10261 getline(ss, rs);
10262 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10263 get_last_committed() + 1));
10264 return true;
10265
10266 } else if (prefix == "osd crush create-or-move") {
10267 do {
10268 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10269 if (!osdmap.exists(osdid)) {
10270 err = -ENOENT;
10271 ss << osd_name
10272 << " does not exist. create it before updating the crush map";
10273 goto reply;
10274 }
10275
10276 double weight;
10277 if (!cmd_getval(cmdmap, "weight", weight)) {
10278 ss << "unable to parse weight value '"
10279 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10280 err = -EINVAL;
10281 goto reply;
10282 }
10283
10284 string args;
10285 vector<string> argvec;
10286 cmd_getval(cmdmap, "args", argvec);
10287 map<string,string> loc;
10288 CrushWrapper::parse_loc_map(argvec, &loc);
10289
10290 dout(0) << "create-or-move crush item name '" << osd_name
10291 << "' initial_weight " << weight << " at location " << loc
10292 << dendl;
10293
10294 CrushWrapper newcrush;
10295 _get_pending_crush(newcrush);
10296
10297 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10298 g_conf()->osd_crush_update_weight_set);
10299 if (err == 0) {
10300 ss << "create-or-move updated item name '" << osd_name
10301 << "' weight " << weight
10302 << " at location " << loc << " to crush map";
10303 break;
10304 }
10305 if (err > 0) {
10306 pending_inc.crush.clear();
10307 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10308 ss << "create-or-move updating item name '" << osd_name
10309 << "' weight " << weight
10310 << " at location " << loc << " to crush map";
10311 getline(ss, rs);
10312 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10313 get_last_committed() + 1));
10314 return true;
10315 }
10316 } while (false);
10317
10318 } else if (prefix == "osd crush move") {
10319 do {
10320 // osd crush move <name> <loc1> [<loc2> ...]
10321 string name;
10322 vector<string> argvec;
10323 cmd_getval(cmdmap, "name", name);
10324 cmd_getval(cmdmap, "args", argvec);
10325 map<string,string> loc;
10326 CrushWrapper::parse_loc_map(argvec, &loc);
10327
10328 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10329 CrushWrapper newcrush;
10330 _get_pending_crush(newcrush);
10331
10332 if (!newcrush.name_exists(name)) {
10333 err = -ENOENT;
10334 ss << "item " << name << " does not exist";
10335 break;
10336 }
10337 int id = newcrush.get_item_id(name);
10338
10339 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10340 if (id >= 0) {
10341 err = newcrush.create_or_move_item(
10342 cct, id, 0, name, loc,
10343 g_conf()->osd_crush_update_weight_set);
10344 } else {
10345 err = newcrush.move_bucket(cct, id, loc);
10346 }
10347 if (err >= 0) {
10348 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10349 pending_inc.crush.clear();
10350 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10351 getline(ss, rs);
10352 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10353 get_last_committed() + 1));
10354 return true;
10355 }
10356 } else {
10357 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10358 err = 0;
10359 }
10360 } while (false);
10361 } else if (prefix == "osd crush swap-bucket") {
10362 string source, dest;
10363 cmd_getval(cmdmap, "source", source);
10364 cmd_getval(cmdmap, "dest", dest);
10365
10366 bool force = false;
10367 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10368
10369 CrushWrapper newcrush;
10370 _get_pending_crush(newcrush);
10371 if (!newcrush.name_exists(source)) {
10372 ss << "source item " << source << " does not exist";
10373 err = -ENOENT;
10374 goto reply;
10375 }
10376 if (!newcrush.name_exists(dest)) {
10377 ss << "dest item " << dest << " does not exist";
10378 err = -ENOENT;
10379 goto reply;
10380 }
10381 int sid = newcrush.get_item_id(source);
10382 int did = newcrush.get_item_id(dest);
10383 int sparent;
10384 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10385 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10386 err = -EPERM;
10387 goto reply;
10388 }
10389 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10390 !force) {
10391 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10392 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10393 << "; pass --yes-i-really-mean-it to proceed anyway";
10394 err = -EPERM;
10395 goto reply;
10396 }
10397 int r = newcrush.swap_bucket(cct, sid, did);
10398 if (r < 0) {
10399 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10400 err = r;
10401 goto reply;
10402 }
10403 ss << "swapped bucket of " << source << " to " << dest;
10404 pending_inc.crush.clear();
10405 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10406 wait_for_finished_proposal(op,
10407 new Monitor::C_Command(mon, op, err, ss.str(),
10408 get_last_committed() + 1));
10409 return true;
10410 } else if (prefix == "osd crush link") {
10411 // osd crush link <name> <loc1> [<loc2> ...]
10412 string name;
10413 cmd_getval(cmdmap, "name", name);
10414 vector<string> argvec;
10415 cmd_getval(cmdmap, "args", argvec);
10416 map<string,string> loc;
10417 CrushWrapper::parse_loc_map(argvec, &loc);
10418
10419 // Need an explicit check for name_exists because get_item_id returns
10420 // 0 on unfound.
10421 int id = osdmap.crush->get_item_id(name);
10422 if (!osdmap.crush->name_exists(name)) {
10423 err = -ENOENT;
10424 ss << "item " << name << " does not exist";
10425 goto reply;
10426 } else {
10427 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10428 }
10429 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10430 ss << "no need to move item id " << id << " name '" << name
10431 << "' to location " << loc << " in crush map";
10432 err = 0;
10433 goto reply;
10434 }
10435
10436 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10437 CrushWrapper newcrush;
10438 _get_pending_crush(newcrush);
10439
10440 if (!newcrush.name_exists(name)) {
10441 err = -ENOENT;
10442 ss << "item " << name << " does not exist";
10443 goto reply;
10444 } else {
10445 int id = newcrush.get_item_id(name);
10446 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10447 err = newcrush.link_bucket(cct, id, loc);
10448 if (err >= 0) {
10449 ss << "linked item id " << id << " name '" << name
10450 << "' to location " << loc << " in crush map";
10451 pending_inc.crush.clear();
10452 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10453 } else {
10454 ss << "cannot link item id " << id << " name '" << name
10455 << "' to location " << loc;
10456 goto reply;
10457 }
10458 } else {
10459 ss << "no need to move item id " << id << " name '" << name
10460 << "' to location " << loc << " in crush map";
10461 err = 0;
10462 }
10463 }
10464 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10465 get_last_committed() + 1));
10466 return true;
10467 } else if (prefix == "osd crush rm" ||
10468 prefix == "osd crush remove" ||
10469 prefix == "osd crush unlink") {
10470 do {
10471 // osd crush rm <id> [ancestor]
10472 CrushWrapper newcrush;
10473 _get_pending_crush(newcrush);
10474
10475 string name;
10476 cmd_getval(cmdmap, "name", name);
10477
10478 if (!osdmap.crush->name_exists(name)) {
10479 err = 0;
10480 ss << "device '" << name << "' does not appear in the crush map";
10481 break;
10482 }
10483 if (!newcrush.name_exists(name)) {
10484 err = 0;
10485 ss << "device '" << name << "' does not appear in the crush map";
10486 getline(ss, rs);
10487 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10488 get_last_committed() + 1));
10489 return true;
10490 }
10491 int id = newcrush.get_item_id(name);
10492 int ancestor = 0;
10493
10494 bool unlink_only = prefix == "osd crush unlink";
10495 string ancestor_str;
10496 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10497 if (!newcrush.name_exists(ancestor_str)) {
10498 err = -ENOENT;
10499 ss << "ancestor item '" << ancestor_str
10500 << "' does not appear in the crush map";
10501 break;
10502 }
10503 ancestor = newcrush.get_item_id(ancestor_str);
10504 }
10505
10506 err = prepare_command_osd_crush_remove(
10507 newcrush,
10508 id, ancestor,
10509 (ancestor < 0), unlink_only);
10510
10511 if (err == -ENOENT) {
10512 ss << "item " << id << " does not appear in that position";
10513 err = 0;
10514 break;
10515 }
10516 if (err == 0) {
10517 if (!unlink_only)
10518 pending_inc.new_crush_node_flags[id] = 0;
10519 ss << "removed item id " << id << " name '" << name << "' from crush map";
10520 getline(ss, rs);
10521 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10522 get_last_committed() + 1));
10523 return true;
10524 }
10525 } while (false);
10526
10527 } else if (prefix == "osd crush reweight-all") {
// 'osd crush reweight-all': rebuild bucket weights across the whole
// hierarchy (CrushWrapper::reweight) and propose the updated map.
10528 CrushWrapper newcrush;
10529 _get_pending_crush(newcrush);
10530
10531 newcrush.reweight(cct);
10532 pending_inc.crush.clear();
10533 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10534 ss << "reweighted crush hierarchy";
10535 getline(ss, rs);
// Reply is deferred until the map change has been proposed and committed.
10536 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10537 get_last_committed() + 1));
10538 return true;
10539 } else if (prefix == "osd crush reweight") {
10540 // osd crush reweight <name> <weight>
// Adjust the crush weight of a single leaf item (an OSD device).
10541 CrushWrapper newcrush;
10542 _get_pending_crush(newcrush);
10543
10544 string name;
10545 cmd_getval(cmdmap, "name", name);
10546 if (!newcrush.name_exists(name)) {
10547 err = -ENOENT;
10548 ss << "device '" << name << "' does not appear in the crush map";
10549 goto reply;
10550 }
10551
// Negative crush ids are buckets; only leaf devices (id >= 0) may be
// reweighted here — use 'osd crush reweight-subtree' for buckets.
10552 int id = newcrush.get_item_id(name);
10553 if (id < 0) {
10554 ss << "device '" << name << "' is not a leaf in the crush map";
10555 err = -EINVAL;
10556 goto reply;
10557 }
10558 double w;
10559 if (!cmd_getval(cmdmap, "weight", w)) {
10560 ss << "unable to parse weight value '"
10561 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10562 err = -EINVAL;
10563 goto reply;
10564 }
10565
10566 err = newcrush.adjust_item_weightf(cct, id, w,
10567 g_conf()->osd_crush_update_weight_set)
10568 if (err < 0)
10569 goto reply;
10570 pending_inc.crush.clear();
10571 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10572 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10573 << " in crush map";
10574 getline(ss, rs);
10575 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10576 get_last_committed() + 1));
10577 return true;
10578 } else if (prefix == "osd crush reweight-subtree") {
// osd crush reweight-subtree <name> <weight>
// (comment previously copy-pasted from the 'reweight' branch above)
// Adjust the weights of an entire bucket subtree.
10580 CrushWrapper newcrush;
10581 _get_pending_crush(newcrush);
10582
10583 string name;
10584 cmd_getval(cmdmap, "name", name);
10585 if (!newcrush.name_exists(name)) {
10586 err = -ENOENT;
10587 ss << "device '" << name << "' does not appear in the crush map";
10588 goto reply;
10589 }
10590
// Non-negative ids are leaf devices; a subtree must be a bucket (id < 0).
// This is the mirror of the id < 0 check in the 'reweight' branch.
10591 int id = newcrush.get_item_id(name);
10592 if (id >= 0) {
10593 ss << "device '" << name << "' is not a subtree in the crush map";
10594 err = -EINVAL;
10595 goto reply;
10596 }
10597 double w;
10598 if (!cmd_getval(cmdmap, "weight", w)) {
10599 ss << "unable to parse weight value '"
10600 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10601 err = -EINVAL;
10602 goto reply;
10603 }
10604
10605 err = newcrush.adjust_subtree_weightf(cct, id, w,
10606 g_conf()->osd_crush_update_weight_set);
10607 if (err < 0)
10608 goto reply;
10609 pending_inc.crush.clear();
10610 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10611 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10612 << " in crush map";
10613 getline(ss, rs);
10614 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10615 get_last_committed() + 1));
10616 return true;
10617 } else if (prefix == "osd crush tunables") {
// 'osd crush tunables <profile>': apply a named tunables preset
// (release-named presets plus 'optimal'/'default'/'legacy') to the map.
10618 CrushWrapper newcrush;
10619 _get_pending_crush(newcrush);
10620
10621 err = 0;
10622 string profile;
10623 cmd_getval(cmdmap, "profile", profile);
10624 if (profile == "legacy" || profile == "argonaut") {
10625 newcrush.set_tunables_legacy();
10626 } else if (profile == "bobtail") {
10627 newcrush.set_tunables_bobtail();
10628 } else if (profile == "firefly") {
10629 newcrush.set_tunables_firefly();
10630 } else if (profile == "hammer") {
10631 newcrush.set_tunables_hammer();
10632 } else if (profile == "jewel") {
10633 newcrush.set_tunables_jewel();
10634 } else if (profile == "optimal") {
10635 newcrush.set_tunables_optimal();
10636 } else if (profile == "default") {
10637 newcrush.set_tunables_default();
10638 } else {
10639 ss << "unrecognized profile '" << profile << "'";
10640 err = -EINVAL;
10641 goto reply;
10642 }
10643
// Reject the new map if it would require features the cluster does not
// support; validate_crush_against_features fills ss with the reason.
10644 if (!validate_crush_against_features(&newcrush, ss)) {
10645 err = -EINVAL;
10646 goto reply;
10647 }
10648
10649 pending_inc.crush.clear();
10650 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10651 ss << "adjusted tunables profile to " << profile;
10652 getline(ss, rs);
10653 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10654 get_last_committed() + 1));
10655 return true;
10656 } else if (prefix == "osd crush set-tunable") {
// 'osd crush set-tunable <tunable> <value>': set one tunable by name.
// Only 'straw_calc_version' is accepted here; other tunables are managed
// via the profile presets above.
10657 CrushWrapper newcrush;
10658 _get_pending_crush(newcrush);
10659
10660 err = 0;
10661 string tunable;
10662 cmd_getval(cmdmap, "tunable", tunable);
10663
10664 int64_t value = -1;
10665 if (!cmd_getval(cmdmap, "value", value)) {
10666 err = -EINVAL;
10667 ss << "failed to parse integer value "
10668 << cmd_vartype_stringify(cmdmap.at("value"));
10669 goto reply;
10670 }
10671
10672 if (tunable == "straw_calc_version") {
// straw_calc_version is a boolean-like switch: only 0 or 1 are valid.
10673 if (value != 0 && value != 1) {
10674 ss << "value must be 0 or 1; got " << value;
10675 err = -EINVAL;
10676 goto reply;
10677 }
10678 newcrush.set_straw_calc_version(value);
10679 } else {
10680 ss << "unrecognized tunable '" << tunable << "'";
10681 err = -EINVAL;
10682 goto reply;
10683 }
10684
// Same feature gate as the 'osd crush tunables' branch.
10685 if (!validate_crush_against_features(&newcrush, ss)) {
10686 err = -EINVAL;
10687 goto reply;
10688 }
10689
10690 pending_inc.crush.clear();
10691 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10692 ss << "adjusted tunable " << tunable << " to " << value;
10693 getline(ss, rs);
10694 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10695 get_last_committed() + 1));
10696 return true;
10697
10698 } else if (prefix == "osd crush rule create-simple") {
// 'osd crush rule create-simple <name> <root> <type> [<mode>]':
// create a replicated crush rule; mode defaults to 'firstn'.
10699 string name, root, type, mode;
10700 cmd_getval(cmdmap, "name", name);
10701 cmd_getval(cmdmap, "root", root);
10702 cmd_getval(cmdmap, "type", type);
10703 cmd_getval(cmdmap, "mode", mode);
10704 if (mode == "")
10705 mode = "firstn";
10706
// Check the committed map first: an existing name is a no-op success.
10707 if (osdmap.crush->rule_exists(name)) {
10708 // The name is uniquely associated with a rule id and the rule it
10709 // contains.  From the user's point of view, the name is the more
10710 // meaningful handle.
10711 ss << "rule " << name << " already exists";
10712 err = 0;
10713 goto reply;
10714 }
10715
10716 CrushWrapper newcrush;
10717 _get_pending_crush(newcrush);
10718
// Re-check against the pending map too, in case the rule was created by
// an earlier command in this same (not yet committed) proposal round.
10719 if (newcrush.rule_exists(name)) {
10720 // The name is uniquely associated with a rule id and the rule it
10721 // contains.  From the user's point of view, the name is the more
10722 // meaningful handle.
10723 ss << "rule " << name << " already exists";
10724 err = 0;
10725 } else {
10726 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
10727 pg_pool_t::TYPE_REPLICATED, &ss);
10728 if (ruleno < 0) {
10729 err = ruleno;
10730 goto reply;
10731 }
10732
10733 pending_inc.crush.clear();
10734 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10735 }
10736 getline(ss, rs);
10737 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10738 get_last_committed() + 1));
10739 return true;
10740
10739 } else if (prefix == "osd crush rule create-replicated") {
10740 string name, root, type, device_class;
10741 cmd_getval(cmdmap, "name", name);
10742 cmd_getval(cmdmap, "root", root);
10743 cmd_getval(cmdmap, "type", type);
10744 cmd_getval(cmdmap, "class", device_class);
10745
10746 if (osdmap.crush->rule_exists(name)) {
10747 // The name is uniquely associated to a ruleid and the rule it contains
10748 // From the user point of view, the rule is more meaningfull.
10749 ss << "rule " << name << " already exists";
10750 err = 0;
10751 goto reply;
10752 }
10753
10754 CrushWrapper newcrush;
10755 _get_pending_crush(newcrush);
10756
10757 if (newcrush.rule_exists(name)) {
10758 // The name is uniquely associated to a ruleid and the rule it contains
10759 // From the user point of view, the rule is more meaningfull.
10760 ss << "rule " << name << " already exists";
10761 err = 0;
10762 } else {
10763 int ruleno = newcrush.add_simple_rule(
10764 name, root, type, device_class,
10765 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
10766 if (ruleno < 0) {
10767 err = ruleno;
10768 goto reply;
10769 }
10770
10771 pending_inc.crush.clear();
10772 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10773 }
10774 getline(ss, rs);
10775 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10776 get_last_committed() + 1));
10777 return true;
10778
10779 } else if (prefix == "osd erasure-code-profile rm") {
// 'osd erasure-code-profile rm <name>': delete an EC profile if no pool
// (committed or pending) still references it.
10780 string name;
10781 cmd_getval(cmdmap, "name", name);
10782
// A pending (not yet committed) pool references the profile: wait for the
// in-flight proposal to commit, then re-evaluate this command.
10783 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
10784 goto wait;
10785
// A committed pool references it: refuse outright.
10786 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
10787 err = -EBUSY;
10788 goto reply;
10789 }
10790
10791 if (osdmap.has_erasure_code_profile(name) ||
10792 pending_inc.new_erasure_code_profiles.count(name)) {
10793 if (osdmap.has_erasure_code_profile(name)) {
// Profile exists in the committed map: schedule its removal.
10794 pending_inc.old_erasure_code_profiles.push_back(name);
10795 } else {
// Profile only exists in the pending incremental: cancel its creation.
10796 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
10797 pending_inc.new_erasure_code_profiles.erase(name);
10798 }
10799
10800 getline(ss, rs);
10801 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10802 get_last_committed() + 1));
10803 return true;
10804 } else {
// Nothing to remove: success for idempotency.
10805 ss << "erasure-code-profile " << name << " does not exist";
10806 err = 0;
10807 goto reply;
10808 }
10809
10810 } else if (prefix == "osd erasure-code-profile set") {
10811 string name;
10812 cmd_getval(cmdmap, "name", name);
10813 vector<string> profile;
10814 cmd_getval(cmdmap, "profile", profile);
10815
10816 bool force = false;
10817 cmd_getval(cmdmap, "force", force);
10818
10819 map<string,string> profile_map;
10820 err = parse_erasure_code_profile(profile, &profile_map, &ss);
10821 if (err)
10822 goto reply;
10823 if (auto found = profile_map.find("crush-failure-domain");
10824 found != profile_map.end()) {
10825 const auto& failure_domain = found->second;
10826 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
10827 if (failure_domain_type < 0) {
10828 ss << "erasure-code-profile " << profile_map
10829 << " contains an invalid failure-domain " << std::quoted(failure_domain);
10830 err = -EINVAL;
10831 goto reply;
10832 }
10833 }
10834
10835 if (profile_map.find("plugin") == profile_map.end()) {
10836 ss << "erasure-code-profile " << profile_map
10837 << " must contain a plugin entry" << std::endl;
10838 err = -EINVAL;
10839 goto reply;
10840 }
10841 string plugin = profile_map["plugin"];
10842
10843 if (pending_inc.has_erasure_code_profile(name)) {
10844 dout(20) << "erasure code profile " << name << " try again" << dendl;
10845 goto wait;
10846 } else {
10847 err = normalize_profile(name, profile_map, force, &ss);
10848 if (err)
10849 goto reply;
10850
10851 if (osdmap.has_erasure_code_profile(name)) {
10852 ErasureCodeProfile existing_profile_map =
10853 osdmap.get_erasure_code_profile(name);
10854 err = normalize_profile(name, existing_profile_map, force, &ss);
10855 if (err)
10856 goto reply;
10857
10858 if (existing_profile_map == profile_map) {
10859 err = 0;
10860 goto reply;
10861 }
10862 if (!force) {
10863 err = -EPERM;
10864 ss << "will not override erasure code profile " << name
10865 << " because the existing profile "
10866 << existing_profile_map
10867 << " is different from the proposed profile "
10868 << profile_map;
10869 goto reply;
10870 }
10871 }
10872
10873 dout(20) << "erasure code profile set " << name << "="
10874 << profile_map << dendl;
10875 pending_inc.set_erasure_code_profile(name, profile_map);
10876 }
10877
10878 getline(ss, rs);
10879 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10880 get_last_committed() + 1));
10881 return true;
10882
10883 } else if (prefix == "osd crush rule create-erasure") {
10884 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
10885 if (err == -EAGAIN)
10886 goto wait;
10887 if (err)
10888 goto reply;
10889 string name, poolstr;
10890 cmd_getval(cmdmap, "name", name);
10891 string profile;
10892 cmd_getval(cmdmap, "profile", profile);
10893 if (profile == "")
10894 profile = "default";
10895 if (profile == "default") {
10896 if (!osdmap.has_erasure_code_profile(profile)) {
10897 if (pending_inc.has_erasure_code_profile(profile)) {
10898 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
10899 goto wait;
10900 }
10901
10902 map<string,string> profile_map;
10903 err = osdmap.get_erasure_code_profile_default(cct,
10904 profile_map,
10905 &ss);
10906 if (err)
10907 goto reply;
10908 err = normalize_profile(name, profile_map, true, &ss);
10909 if (err)
10910 goto reply;
10911 dout(20) << "erasure code profile set " << profile << "="
10912 << profile_map << dendl;
10913 pending_inc.set_erasure_code_profile(profile, profile_map);
10914 goto wait;
10915 }
10916 }
10917
10918 int rule;
10919 err = crush_rule_create_erasure(name, profile, &rule, &ss);
10920 if (err < 0) {
10921 switch(err) {
10922 case -EEXIST: // return immediately
10923 ss << "rule " << name << " already exists";
10924 err = 0;
10925 goto reply;
10926 break;
10927 case -EALREADY: // wait for pending to be proposed
10928 ss << "rule " << name << " already exists";
10929 err = 0;
10930 break;
10931 default: // non recoverable error
10932 goto reply;
10933 break;
10934 }
10935 } else {
10936 ss << "created rule " << name << " at " << rule;
10937 }
10938
10939 getline(ss, rs);
10940 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10941 get_last_committed() + 1));
10942 return true;
10943
10944 } else if (prefix == "osd crush rule rm") {
10945 string name;
10946 cmd_getval(cmdmap, "name", name);
10947
10948 if (!osdmap.crush->rule_exists(name)) {
10949 ss << "rule " << name << " does not exist";
10950 err = 0;
10951 goto reply;
10952 }
10953
10954 CrushWrapper newcrush;
10955 _get_pending_crush(newcrush);
10956
10957 if (!newcrush.rule_exists(name)) {
10958 ss << "rule " << name << " does not exist";
10959 err = 0;
10960 } else {
10961 int ruleno = newcrush.get_rule_id(name);
10962 ceph_assert(ruleno >= 0);
10963
10964 // make sure it is not in use.
10965 // FIXME: this is ok in some situations, but let's not bother with that
10966 // complexity now.
10967 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
10968 if (osdmap.crush_rule_in_use(ruleset)) {
10969 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10970 err = -EBUSY;
10971 goto reply;
10972 }
10973
10974 err = newcrush.remove_rule(ruleno);
10975 if (err < 0) {
10976 goto reply;
10977 }
10978
10979 pending_inc.crush.clear();
10980 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10981 }
10982 getline(ss, rs);
10983 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10984 get_last_committed() + 1));
10985 return true;
10986
10987 } else if (prefix == "osd crush rule rename") {
10988 string srcname;
10989 string dstname;
10990 cmd_getval(cmdmap, "srcname", srcname);
10991 cmd_getval(cmdmap, "dstname", dstname);
10992 if (srcname.empty() || dstname.empty()) {
10993 ss << "must specify both source rule name and destination rule name";
10994 err = -EINVAL;
10995 goto reply;
10996 }
10997 if (srcname == dstname) {
10998 ss << "destination rule name is equal to source rule name";
10999 err = 0;
11000 goto reply;
11001 }
11002
11003 CrushWrapper newcrush;
11004 _get_pending_crush(newcrush);
11005 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11006 // srcname does not exist and dstname already exists
11007 // suppose this is a replay and return success
11008 // (so this command is idempotent)
11009 ss << "already renamed to '" << dstname << "'";
11010 err = 0;
11011 goto reply;
11012 }
11013
11014 err = newcrush.rename_rule(srcname, dstname, &ss);
11015 if (err < 0) {
11016 // ss has reason for failure
11017 goto reply;
11018 }
11019 pending_inc.crush.clear();
11020 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
11021 getline(ss, rs);
11022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11023 get_last_committed() + 1));
11024 return true;
11025
11026 } else if (prefix == "osd setmaxosd") {
11027 int64_t newmax;
11028 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11029 ss << "unable to parse 'newmax' value '"
11030 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11031 err = -EINVAL;
11032 goto reply;
11033 }
11034
11035 if (newmax > g_conf()->mon_max_osd) {
11036 err = -ERANGE;
11037 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11038 << g_conf()->mon_max_osd << ")";
11039 goto reply;
11040 }
11041
11042 // Don't allow shrinking OSD number as this will cause data loss
11043 // and may cause kernel crashes.
11044 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11045 if (newmax < osdmap.get_max_osd()) {
11046 // Check if the OSDs exist between current max and new value.
11047 // If any OSDs exist in that range, don't allow shrinking the number
11048 // of OSDs.
11049 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11050 if (osdmap.exists(i)) {
11051 err = -EBUSY;
11052 ss << "cannot shrink max_osd to " << newmax
11053 << " because osd." << i << " (and possibly others) still in use";
11054 goto reply;
11055 }
11056 }
11057 }
11058
11059 pending_inc.new_max_osd = newmax;
11060 ss << "set new max_osd = " << pending_inc.new_max_osd;
11061 getline(ss, rs);
11062 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11063 get_last_committed() + 1));
11064 return true;
11065
11066 } else if (prefix == "osd set-full-ratio" ||
11067 prefix == "osd set-backfillfull-ratio" ||
11068 prefix == "osd set-nearfull-ratio") {
11069 double n;
11070 if (!cmd_getval(cmdmap, "ratio", n)) {
11071 ss << "unable to parse 'ratio' value '"
11072 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11073 err = -EINVAL;
11074 goto reply;
11075 }
11076 if (prefix == "osd set-full-ratio")
11077 pending_inc.new_full_ratio = n;
11078 else if (prefix == "osd set-backfillfull-ratio")
11079 pending_inc.new_backfillfull_ratio = n;
11080 else if (prefix == "osd set-nearfull-ratio")
11081 pending_inc.new_nearfull_ratio = n;
11082 ss << prefix << " " << n;
11083 getline(ss, rs);
11084 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11085 get_last_committed() + 1));
11086 return true;
11087 } else if (prefix == "osd set-require-min-compat-client") {
11088 string v;
11089 cmd_getval(cmdmap, "version", v);
11090 ceph_release_t vno = ceph_release_from_name(v);
11091 if (!vno) {
11092 ss << "version " << v << " is not recognized";
11093 err = -EINVAL;
11094 goto reply;
11095 }
11096 OSDMap newmap;
11097 newmap.deepish_copy_from(osdmap);
11098 newmap.apply_incremental(pending_inc);
11099 newmap.require_min_compat_client = vno;
11100 auto mvno = newmap.get_min_compat_client();
11101 if (vno < mvno) {
11102 ss << "osdmap current utilizes features that require " << mvno
11103 << "; cannot set require_min_compat_client below that to " << vno;
11104 err = -EPERM;
11105 goto reply;
11106 }
11107 bool sure = false;
11108 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11109 if (!sure) {
11110 FeatureMap m;
11111 mon->get_combined_feature_map(&m);
11112 uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
11113 bool first = true;
11114 bool ok = true;
11115 for (int type : {
11116 CEPH_ENTITY_TYPE_CLIENT,
11117 CEPH_ENTITY_TYPE_MDS,
11118 CEPH_ENTITY_TYPE_MGR }) {
11119 auto p = m.m.find(type);
11120 if (p == m.m.end()) {
11121 continue;
11122 }
11123 for (auto& q : p->second) {
11124 uint64_t missing = ~q.first & features;
11125 if (missing) {
11126 if (first) {
11127 ss << "cannot set require_min_compat_client to " << v << ": ";
11128 } else {
11129 ss << "; ";
11130 }
11131 first = false;
11132 ss << q.second << " connected " << ceph_entity_type_name(type)
11133 << "(s) look like " << ceph_release_name(
11134 ceph_release_from_features(q.first))
11135 << " (missing 0x" << std::hex << missing << std::dec << ")";
11136 ok = false;
11137 }
11138 }
11139 }
11140 if (!ok) {
11141 ss << "; add --yes-i-really-mean-it to do it anyway";
11142 err = -EPERM;
11143 goto reply;
11144 }
11145 }
11146 ss << "set require_min_compat_client to " << vno;
11147 pending_inc.new_require_min_compat_client = vno;
11148 getline(ss, rs);
11149 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11150 get_last_committed() + 1));
11151 return true;
11152 } else if (prefix == "osd pause") {
11153 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11154
11155 } else if (prefix == "osd unpause") {
11156 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11157
11158 } else if (prefix == "osd set") {
11159 bool sure = false;
11160 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11161
11162 string key;
11163 cmd_getval(cmdmap, "key", key);
11164 if (key == "pause")
11165 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11166 else if (key == "noup")
11167 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11168 else if (key == "nodown")
11169 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11170 else if (key == "noout")
11171 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11172 else if (key == "noin")
11173 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11174 else if (key == "nobackfill")
11175 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11176 else if (key == "norebalance")
11177 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11178 else if (key == "norecover")
11179 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11180 else if (key == "noscrub")
11181 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11182 else if (key == "nodeep-scrub")
11183 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11184 else if (key == "notieragent")
11185 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11186 else if (key == "nosnaptrim")
11187 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11188 else if (key == "pglog_hardlimit") {
11189 if (!osdmap.get_num_up_osds() && !sure) {
11190 ss << "Not advisable to continue since no OSDs are up. Pass "
11191 << "--yes-i-really-mean-it if you really wish to continue.";
11192 err = -EPERM;
11193 goto reply;
11194 }
11195 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11196 // we are reusing a jewel feature bit that was retired in luminous.
11197 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11198 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11199 || sure)) {
11200 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11201 } else {
11202 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11203 err = -EPERM;
11204 goto reply;
11205 }
11206 } else {
11207 ss << "unrecognized flag '" << key << "'";
11208 err = -EINVAL;
11209 }
11210
11211 } else if (prefix == "osd unset") {
11212 string key;
11213 cmd_getval(cmdmap, "key", key);
11214 if (key == "pause")
11215 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11216 else if (key == "noup")
11217 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11218 else if (key == "nodown")
11219 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11220 else if (key == "noout")
11221 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11222 else if (key == "noin")
11223 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11224 else if (key == "nobackfill")
11225 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11226 else if (key == "norebalance")
11227 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11228 else if (key == "norecover")
11229 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11230 else if (key == "noscrub")
11231 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11232 else if (key == "nodeep-scrub")
11233 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11234 else if (key == "notieragent")
11235 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11236 else if (key == "nosnaptrim")
11237 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11238 else {
11239 ss << "unrecognized flag '" << key << "'";
11240 err = -EINVAL;
11241 }
11242
11243 } else if (prefix == "osd require-osd-release") {
11244 string release;
11245 cmd_getval(cmdmap, "release", release);
11246 bool sure = false;
11247 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11248 ceph_release_t rel = ceph_release_from_name(release.c_str());
11249 if (!rel) {
11250 ss << "unrecognized release " << release;
11251 err = -EINVAL;
11252 goto reply;
11253 }
11254 if (rel == osdmap.require_osd_release) {
11255 // idempotent
11256 err = 0;
11257 goto reply;
11258 }
11259 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
11260 if (!osdmap.get_num_up_osds() && !sure) {
11261 ss << "Not advisable to continue since no OSDs are up. Pass "
11262 << "--yes-i-really-mean-it if you really wish to continue.";
11263 err = -EPERM;
11264 goto reply;
11265 }
11266 if (rel == ceph_release_t::mimic) {
11267 if (!mon->monmap->get_required_features().contains_all(
11268 ceph::features::mon::FEATURE_MIMIC)) {
11269 ss << "not all mons are mimic";
11270 err = -EPERM;
11271 goto reply;
11272 }
11273 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
11274 && !sure) {
11275 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11276 err = -EPERM;
11277 goto reply;
11278 }
11279 } else if (rel == ceph_release_t::nautilus) {
11280 if (!mon->monmap->get_required_features().contains_all(
11281 ceph::features::mon::FEATURE_NAUTILUS)) {
11282 ss << "not all mons are nautilus";
11283 err = -EPERM;
11284 goto reply;
11285 }
11286 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
11287 && !sure) {
11288 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11289 err = -EPERM;
11290 goto reply;
11291 }
11292 } else if (rel == ceph_release_t::octopus) {
11293 if (!mon->monmap->get_required_features().contains_all(
11294 ceph::features::mon::FEATURE_OCTOPUS)) {
11295 ss << "not all mons are octopus";
11296 err = -EPERM;
11297 goto reply;
11298 }
11299 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11300 && !sure) {
11301 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11302 err = -EPERM;
11303 goto reply;
11304 }
11305 } else {
11306 ss << "not supported for this release yet";
11307 err = -EPERM;
11308 goto reply;
11309 }
11310 if (rel < osdmap.require_osd_release) {
11311 ss << "require_osd_release cannot be lowered once it has been set";
11312 err = -EPERM;
11313 goto reply;
11314 }
11315 pending_inc.new_require_osd_release = rel;
11316 goto update;
11317 } else if (prefix == "osd down" ||
11318 prefix == "osd out" ||
11319 prefix == "osd in" ||
11320 prefix == "osd rm" ||
11321 prefix == "osd stop") {
11322
11323 bool any = false;
11324 bool stop = false;
11325 bool verbose = true;
11326 bool definitely_dead = false;
11327
11328 vector<string> idvec;
11329 cmd_getval(cmdmap, "ids", idvec);
11330 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11331 derr << "definitely_dead " << (int)definitely_dead << dendl;
11332 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11333 set<int> osds;
11334
11335 // wildcard?
11336 if (j == 0 &&
11337 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11338 if (prefix == "osd in") {
11339 // touch out osds only
11340 osdmap.get_out_existing_osds(osds);
11341 } else {
11342 osdmap.get_all_osds(osds);
11343 }
11344 stop = true;
11345 verbose = false; // so the output is less noisy.
11346 } else {
11347 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11348 if (osd < 0) {
11349 ss << "invalid osd id" << osd;
11350 err = -EINVAL;
11351 continue;
11352 } else if (!osdmap.exists(osd)) {
11353 ss << "osd." << osd << " does not exist. ";
11354 continue;
11355 }
11356
11357 osds.insert(osd);
11358 }
11359
11360 for (auto &osd : osds) {
11361 if (prefix == "osd down") {
11362 if (osdmap.is_down(osd)) {
11363 if (verbose)
11364 ss << "osd." << osd << " is already down. ";
11365 } else {
11366 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11367 ss << "marked down osd." << osd << ". ";
11368 any = true;
11369 }
11370 if (definitely_dead) {
11371 if (!pending_inc.new_xinfo.count(osd)) {
11372 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11373 }
11374 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11375 any = true;
11376 }
11377 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11378 }
11379 } else if (prefix == "osd out") {
11380 if (osdmap.is_out(osd)) {
11381 if (verbose)
11382 ss << "osd." << osd << " is already out. ";
11383 } else {
11384 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11385 if (osdmap.osd_weight[osd]) {
11386 if (pending_inc.new_xinfo.count(osd) == 0) {
11387 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11388 }
11389 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11390 }
11391 ss << "marked out osd." << osd << ". ";
11392 std::ostringstream msg;
11393 msg << "Client " << op->get_session()->entity_name
11394 << " marked osd." << osd << " out";
11395 if (osdmap.is_up(osd)) {
11396 msg << ", while it was still marked up";
11397 } else {
11398 auto period = ceph_clock_now() - down_pending_out[osd];
11399 msg << ", after it was down for " << int(period.sec())
11400 << " seconds";
11401 }
11402
11403 mon->clog->info() << msg.str();
11404 any = true;
11405 }
11406 } else if (prefix == "osd in") {
11407 if (osdmap.is_in(osd)) {
11408 if (verbose)
11409 ss << "osd." << osd << " is already in. ";
11410 } else {
11411 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11412 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11413 if (pending_inc.new_xinfo.count(osd) == 0) {
11414 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11415 }
11416 pending_inc.new_xinfo[osd].old_weight = 0;
11417 } else {
11418 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11419 }
11420 ss << "marked in osd." << osd << ". ";
11421 any = true;
11422 }
11423 } else if (prefix == "osd rm") {
11424 err = prepare_command_osd_remove(osd);
11425
11426 if (err == -EBUSY) {
11427 if (any)
11428 ss << ", ";
11429 ss << "osd." << osd << " is still up; must be down before removal. ";
11430 } else {
11431 ceph_assert(err == 0);
11432 if (any) {
11433 ss << ", osd." << osd;
11434 } else {
11435 ss << "removed osd." << osd;
11436 }
11437 any = true;
11438 }
11439 } else if (prefix == "osd stop") {
11440 if (osdmap.is_stop(osd)) {
11441 if (verbose)
11442 ss << "osd." << osd << " is already stopped. ";
11443 } else if (osdmap.is_down(osd)) {
11444 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11445 ss << "stop down osd." << osd << ". ";
11446 any = true;
11447 } else {
11448 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11449 ss << "stop osd." << osd << ". ";
11450 any = true;
11451 }
11452 }
11453 }
11454 }
11455 if (any) {
11456 getline(ss, rs);
11457 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11458 get_last_committed() + 1));
11459 return true;
11460 }
11461 } else if (prefix == "osd set-group" ||
11462 prefix == "osd unset-group" ||
11463 prefix == "osd add-noup" ||
11464 prefix == "osd add-nodown" ||
11465 prefix == "osd add-noin" ||
11466 prefix == "osd add-noout" ||
11467 prefix == "osd rm-noup" ||
11468 prefix == "osd rm-nodown" ||
11469 prefix == "osd rm-noin" ||
11470 prefix == "osd rm-noout") {
11471 bool do_set = prefix == "osd set-group" ||
11472 prefix.find("add") != string::npos;
11473 string flag_str;
11474 unsigned flags = 0;
11475 vector<string> who;
11476 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11477 cmd_getval(cmdmap, "flags", flag_str);
11478 cmd_getval(cmdmap, "who", who);
11479 vector<string> raw_flags;
11480 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11481 for (auto& f : raw_flags) {
11482 if (f == "noup")
11483 flags |= CEPH_OSD_NOUP;
11484 else if (f == "nodown")
11485 flags |= CEPH_OSD_NODOWN;
11486 else if (f == "noin")
11487 flags |= CEPH_OSD_NOIN;
11488 else if (f == "noout")
11489 flags |= CEPH_OSD_NOOUT;
11490 else {
11491 ss << "unrecognized flag '" << f << "', must be one of "
11492 << "{noup,nodown,noin,noout}";
11493 err = -EINVAL;
11494 goto reply;
11495 }
11496 }
11497 } else {
11498 cmd_getval(cmdmap, "ids", who);
11499 if (prefix.find("noup") != string::npos)
11500 flags = CEPH_OSD_NOUP;
11501 else if (prefix.find("nodown") != string::npos)
11502 flags = CEPH_OSD_NODOWN;
11503 else if (prefix.find("noin") != string::npos)
11504 flags = CEPH_OSD_NOIN;
11505 else if (prefix.find("noout") != string::npos)
11506 flags = CEPH_OSD_NOOUT;
11507 else
11508 ceph_assert(0 == "Unreachable!");
11509 }
11510 if (flags == 0) {
11511 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11512 err = -EINVAL;
11513 goto reply;
11514 }
11515 if (who.empty()) {
11516 ss << "must specify at least one or more targets to set/unset";
11517 err = -EINVAL;
11518 goto reply;
11519 }
11520 set<int> osds;
11521 set<int> crush_nodes;
11522 set<int> device_classes;
11523 for (auto& w : who) {
11524 if (w == "any" || w == "all" || w == "*") {
11525 osdmap.get_all_osds(osds);
11526 break;
11527 }
11528 std::stringstream ts;
11529 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11530 osds.insert(osd);
11531 } else if (osdmap.crush->name_exists(w)) {
11532 crush_nodes.insert(osdmap.crush->get_item_id(w));
11533 } else if (osdmap.crush->class_exists(w)) {
11534 device_classes.insert(osdmap.crush->get_class_id(w));
11535 } else {
11536 ss << "unable to parse osd id or crush node or device class: "
11537 << "\"" << w << "\". ";
11538 }
11539 }
11540 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11541 // ss has reason for failure
11542 err = -EINVAL;
11543 goto reply;
11544 }
11545 bool any = false;
11546 for (auto osd : osds) {
11547 if (!osdmap.exists(osd)) {
11548 ss << "osd." << osd << " does not exist. ";
11549 continue;
11550 }
11551 if (do_set) {
11552 if (flags & CEPH_OSD_NOUP) {
11553 any |= osdmap.is_noup_by_osd(osd) ?
11554 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11555 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11556 }
11557 if (flags & CEPH_OSD_NODOWN) {
11558 any |= osdmap.is_nodown_by_osd(osd) ?
11559 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11560 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11561 }
11562 if (flags & CEPH_OSD_NOIN) {
11563 any |= osdmap.is_noin_by_osd(osd) ?
11564 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11565 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11566 }
11567 if (flags & CEPH_OSD_NOOUT) {
11568 any |= osdmap.is_noout_by_osd(osd) ?
11569 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11570 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11571 }
11572 } else {
11573 if (flags & CEPH_OSD_NOUP) {
11574 any |= osdmap.is_noup_by_osd(osd) ?
11575 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11576 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11577 }
11578 if (flags & CEPH_OSD_NODOWN) {
11579 any |= osdmap.is_nodown_by_osd(osd) ?
11580 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11581 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11582 }
11583 if (flags & CEPH_OSD_NOIN) {
11584 any |= osdmap.is_noin_by_osd(osd) ?
11585 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11586 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11587 }
11588 if (flags & CEPH_OSD_NOOUT) {
11589 any |= osdmap.is_noout_by_osd(osd) ?
11590 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11591 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11592 }
11593 }
11594 }
11595 for (auto& id : crush_nodes) {
11596 auto old_flags = osdmap.get_crush_node_flags(id);
11597 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11598 pending_flags |= old_flags; // adopt existing flags first!
11599 if (do_set) {
11600 pending_flags |= flags;
11601 } else {
11602 pending_flags &= ~flags;
11603 }
11604 any = true;
11605 }
11606 for (auto& id : device_classes) {
11607 auto old_flags = osdmap.get_device_class_flags(id);
11608 auto& pending_flags = pending_inc.new_device_class_flags[id];
11609 pending_flags |= old_flags;
11610 if (do_set) {
11611 pending_flags |= flags;
11612 } else {
11613 pending_flags &= ~flags;
11614 }
11615 any = true;
11616 }
11617 if (any) {
11618 getline(ss, rs);
11619 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11620 get_last_committed() + 1));
11621 return true;
11622 }
11623 } else if (prefix == "osd pg-temp") {
11624 string pgidstr;
11625 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11626 ss << "unable to parse 'pgid' value '"
11627 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11628 err = -EINVAL;
11629 goto reply;
11630 }
11631 pg_t pgid;
11632 if (!pgid.parse(pgidstr.c_str())) {
11633 ss << "invalid pgid '" << pgidstr << "'";
11634 err = -EINVAL;
11635 goto reply;
11636 }
11637 if (!osdmap.pg_exists(pgid)) {
11638 ss << "pg " << pgid << " does not exist";
11639 err = -ENOENT;
11640 goto reply;
11641 }
11642 if (pending_inc.new_pg_temp.count(pgid)) {
11643 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11644 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11645 return true;
11646 }
11647
11648 vector<int64_t> id_vec;
11649 vector<int32_t> new_pg_temp;
11650 cmd_getval(cmdmap, "id", id_vec);
11651 if (id_vec.empty()) {
11652 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11653 ss << "done cleaning up pg_temp of " << pgid;
11654 goto update;
11655 }
11656 for (auto osd : id_vec) {
11657 if (!osdmap.exists(osd)) {
11658 ss << "osd." << osd << " does not exist";
11659 err = -ENOENT;
11660 goto reply;
11661 }
11662 new_pg_temp.push_back(osd);
11663 }
11664
11665 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11666 if ((int)new_pg_temp.size() < pool_min_size) {
11667 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11668 << pool_min_size << ")";
11669 err = -EINVAL;
11670 goto reply;
11671 }
11672
11673 int pool_size = osdmap.get_pg_pool_size(pgid);
11674 if ((int)new_pg_temp.size() > pool_size) {
11675 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11676 << pool_size << ")";
11677 err = -EINVAL;
11678 goto reply;
11679 }
11680
11681 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11682 new_pg_temp.begin(), new_pg_temp.end());
11683 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11684 goto update;
11685 } else if (prefix == "osd primary-temp") {
11686 string pgidstr;
11687 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11688 ss << "unable to parse 'pgid' value '"
11689 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11690 err = -EINVAL;
11691 goto reply;
11692 }
11693 pg_t pgid;
11694 if (!pgid.parse(pgidstr.c_str())) {
11695 ss << "invalid pgid '" << pgidstr << "'";
11696 err = -EINVAL;
11697 goto reply;
11698 }
11699 if (!osdmap.pg_exists(pgid)) {
11700 ss << "pg " << pgid << " does not exist";
11701 err = -ENOENT;
11702 goto reply;
11703 }
11704
11705 int64_t osd;
11706 if (!cmd_getval(cmdmap, "id", osd)) {
11707 ss << "unable to parse 'id' value '"
11708 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11709 err = -EINVAL;
11710 goto reply;
11711 }
11712 if (osd != -1 && !osdmap.exists(osd)) {
11713 ss << "osd." << osd << " does not exist";
11714 err = -ENOENT;
11715 goto reply;
11716 }
11717
11718 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11719 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11720 ss << "require_min_compat_client "
11721 << osdmap.require_min_compat_client
11722 << " < firefly, which is required for primary-temp";
11723 err = -EPERM;
11724 goto reply;
11725 }
11726
11727 pending_inc.new_primary_temp[pgid] = osd;
11728 ss << "set " << pgid << " primary_temp mapping to " << osd;
11729 goto update;
11730 } else if (prefix == "pg repeer") {
11731 pg_t pgid;
11732 string pgidstr;
11733 cmd_getval(cmdmap, "pgid", pgidstr);
11734 if (!pgid.parse(pgidstr.c_str())) {
11735 ss << "invalid pgid '" << pgidstr << "'";
11736 err = -EINVAL;
11737 goto reply;
11738 }
11739 if (!osdmap.pg_exists(pgid)) {
11740 ss << "pg '" << pgidstr << "' does not exist";
11741 err = -ENOENT;
11742 goto reply;
11743 }
11744 vector<int> acting;
11745 int primary;
11746 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
11747 if (primary < 0) {
11748 err = -EAGAIN;
11749 ss << "pg currently has no primary";
11750 goto reply;
11751 }
11752 if (acting.size() > 1) {
11753 // map to just primary; it will map back to what it wants
11754 pending_inc.new_pg_temp[pgid] = { primary };
11755 } else {
11756 // hmm, pick another arbitrary osd to induce a change. Note
11757 // that this won't work if there is only one suitable OSD in the cluster.
11758 int i;
11759 bool done = false;
11760 for (i = 0; i < osdmap.get_max_osd(); ++i) {
11761 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
11762 continue;
11763 }
11764 pending_inc.new_pg_temp[pgid] = { primary, i };
11765 done = true;
11766 break;
11767 }
11768 if (!done) {
11769 err = -EAGAIN;
11770 ss << "not enough up OSDs in the cluster to force repeer";
11771 goto reply;
11772 }
11773 }
11774 goto update;
11775 } else if (prefix == "osd pg-upmap" ||
11776 prefix == "osd rm-pg-upmap" ||
11777 prefix == "osd pg-upmap-items" ||
11778 prefix == "osd rm-pg-upmap-items") {
11779 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11780 ss << "min_compat_client "
11781 << osdmap.require_min_compat_client
11782 << " < luminous, which is required for pg-upmap. "
11783 << "Try 'ceph osd set-require-min-compat-client luminous' "
11784 << "before using the new interface";
11785 err = -EPERM;
11786 goto reply;
11787 }
11788 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11789 if (err == -EAGAIN)
11790 goto wait;
11791 if (err < 0)
11792 goto reply;
11793 string pgidstr;
11794 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11795 ss << "unable to parse 'pgid' value '"
11796 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11797 err = -EINVAL;
11798 goto reply;
11799 }
11800 pg_t pgid;
11801 if (!pgid.parse(pgidstr.c_str())) {
11802 ss << "invalid pgid '" << pgidstr << "'";
11803 err = -EINVAL;
11804 goto reply;
11805 }
11806 if (!osdmap.pg_exists(pgid)) {
11807 ss << "pg " << pgid << " does not exist";
11808 err = -ENOENT;
11809 goto reply;
11810 }
11811 if (pending_inc.old_pools.count(pgid.pool())) {
11812 ss << "pool of " << pgid << " is pending removal";
11813 err = -ENOENT;
11814 getline(ss, rs);
11815 wait_for_finished_proposal(op,
11816 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11817 return true;
11818 }
11819
11820 enum {
11821 OP_PG_UPMAP,
11822 OP_RM_PG_UPMAP,
11823 OP_PG_UPMAP_ITEMS,
11824 OP_RM_PG_UPMAP_ITEMS,
11825 } option;
11826
11827 if (prefix == "osd pg-upmap") {
11828 option = OP_PG_UPMAP;
11829 } else if (prefix == "osd rm-pg-upmap") {
11830 option = OP_RM_PG_UPMAP;
11831 } else if (prefix == "osd pg-upmap-items") {
11832 option = OP_PG_UPMAP_ITEMS;
11833 } else {
11834 option = OP_RM_PG_UPMAP_ITEMS;
11835 }
11836
11837 // check pending upmap changes
11838 switch (option) {
11839 case OP_PG_UPMAP: // fall through
11840 case OP_RM_PG_UPMAP:
11841 if (pending_inc.new_pg_upmap.count(pgid) ||
11842 pending_inc.old_pg_upmap.count(pgid)) {
11843 dout(10) << __func__ << " waiting for pending update on "
11844 << pgid << dendl;
11845 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11846 return true;
11847 }
11848 break;
11849
11850 case OP_PG_UPMAP_ITEMS: // fall through
11851 case OP_RM_PG_UPMAP_ITEMS:
11852 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11853 pending_inc.old_pg_upmap_items.count(pgid)) {
11854 dout(10) << __func__ << " waiting for pending update on "
11855 << pgid << dendl;
11856 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11857 return true;
11858 }
11859 break;
11860
11861 default:
11862 ceph_abort_msg("invalid option");
11863 }
11864
11865 switch (option) {
11866 case OP_PG_UPMAP:
11867 {
11868 vector<int64_t> id_vec;
11869 if (!cmd_getval(cmdmap, "id", id_vec)) {
11870 ss << "unable to parse 'id' value(s) '"
11871 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11872 err = -EINVAL;
11873 goto reply;
11874 }
11875
11876 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11877 if ((int)id_vec.size() < pool_min_size) {
11878 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11879 << pool_min_size << ")";
11880 err = -EINVAL;
11881 goto reply;
11882 }
11883
11884 int pool_size = osdmap.get_pg_pool_size(pgid);
11885 if ((int)id_vec.size() > pool_size) {
11886 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11887 << pool_size << ")";
11888 err = -EINVAL;
11889 goto reply;
11890 }
11891
11892 vector<int32_t> new_pg_upmap;
11893 for (auto osd : id_vec) {
11894 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11895 ss << "osd." << osd << " does not exist";
11896 err = -ENOENT;
11897 goto reply;
11898 }
11899 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11900 if (it != new_pg_upmap.end()) {
11901 ss << "osd." << osd << " already exists, ";
11902 continue;
11903 }
11904 new_pg_upmap.push_back(osd);
11905 }
11906
11907 if (new_pg_upmap.empty()) {
11908 ss << "no valid upmap items(pairs) is specified";
11909 err = -EINVAL;
11910 goto reply;
11911 }
11912
11913 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11914 new_pg_upmap.begin(), new_pg_upmap.end());
11915 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
11916 }
11917 break;
11918
11919 case OP_RM_PG_UPMAP:
11920 {
11921 pending_inc.old_pg_upmap.insert(pgid);
11922 ss << "clear " << pgid << " pg_upmap mapping";
11923 }
11924 break;
11925
11926 case OP_PG_UPMAP_ITEMS:
11927 {
11928 vector<int64_t> id_vec;
11929 if (!cmd_getval(cmdmap, "id", id_vec)) {
11930 ss << "unable to parse 'id' value(s) '"
11931 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11932 err = -EINVAL;
11933 goto reply;
11934 }
11935
11936 if (id_vec.size() % 2) {
11937 ss << "you must specify pairs of osd ids to be remapped";
11938 err = -EINVAL;
11939 goto reply;
11940 }
11941
11942 int pool_size = osdmap.get_pg_pool_size(pgid);
11943 if ((int)(id_vec.size() / 2) > pool_size) {
11944 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11945 << pool_size << ")";
11946 err = -EINVAL;
11947 goto reply;
11948 }
11949
11950 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11951 ostringstream items;
11952 items << "[";
11953 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11954 int from = *p++;
11955 int to = *p;
11956 if (from == to) {
11957 ss << "from osd." << from << " == to osd." << to << ", ";
11958 continue;
11959 }
11960 if (!osdmap.exists(from)) {
11961 ss << "osd." << from << " does not exist";
11962 err = -ENOENT;
11963 goto reply;
11964 }
11965 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11966 ss << "osd." << to << " does not exist";
11967 err = -ENOENT;
11968 goto reply;
11969 }
11970 pair<int32_t,int32_t> entry = make_pair(from, to);
11971 auto it = std::find(new_pg_upmap_items.begin(),
11972 new_pg_upmap_items.end(), entry);
11973 if (it != new_pg_upmap_items.end()) {
11974 ss << "osd." << from << " -> osd." << to << " already exists, ";
11975 continue;
11976 }
11977 new_pg_upmap_items.push_back(entry);
11978 items << from << "->" << to << ",";
11979 }
11980 string out(items.str());
11981 out.resize(out.size() - 1); // drop last ','
11982 out += "]";
11983
11984 if (new_pg_upmap_items.empty()) {
11985 ss << "no valid upmap items(pairs) is specified";
11986 err = -EINVAL;
11987 goto reply;
11988 }
11989
11990 pending_inc.new_pg_upmap_items[pgid] =
11991 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11992 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11993 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11994 }
11995 break;
11996
11997 case OP_RM_PG_UPMAP_ITEMS:
11998 {
11999 pending_inc.old_pg_upmap_items.insert(pgid);
12000 ss << "clear " << pgid << " pg_upmap_items mapping";
12001 }
12002 break;
12003
12004 default:
12005 ceph_abort_msg("invalid option");
12006 }
12007
12008 goto update;
12009 } else if (prefix == "osd primary-affinity") {
12010 int64_t id;
12011 if (!cmd_getval(cmdmap, "id", id)) {
12012 ss << "invalid osd id value '"
12013 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12014 err = -EINVAL;
12015 goto reply;
12016 }
12017 double w;
12018 if (!cmd_getval(cmdmap, "weight", w)) {
12019 ss << "unable to parse 'weight' value '"
12020 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12021 err = -EINVAL;
12022 goto reply;
12023 }
12024 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12025 if (ww < 0L) {
12026 ss << "weight must be >= 0";
12027 err = -EINVAL;
12028 goto reply;
12029 }
12030 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12031 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12032 ss << "require_min_compat_client "
12033 << osdmap.require_min_compat_client
12034 << " < firefly, which is required for primary-affinity";
12035 err = -EPERM;
12036 goto reply;
12037 }
12038 if (osdmap.exists(id)) {
12039 pending_inc.new_primary_affinity[id] = ww;
12040 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
12041 getline(ss, rs);
12042 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12043 get_last_committed() + 1));
12044 return true;
12045 } else {
12046 ss << "osd." << id << " does not exist";
12047 err = -ENOENT;
12048 goto reply;
12049 }
12050 } else if (prefix == "osd reweight") {
12051 int64_t id;
12052 if (!cmd_getval(cmdmap, "id", id)) {
12053 ss << "unable to parse osd id value '"
12054 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12055 err = -EINVAL;
12056 goto reply;
12057 }
12058 double w;
12059 if (!cmd_getval(cmdmap, "weight", w)) {
12060 ss << "unable to parse weight value '"
12061 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12062 err = -EINVAL;
12063 goto reply;
12064 }
12065 long ww = (int)((double)CEPH_OSD_IN*w);
12066 if (ww < 0L) {
12067 ss << "weight must be >= 0";
12068 err = -EINVAL;
12069 goto reply;
12070 }
12071 if (osdmap.exists(id)) {
12072 pending_inc.new_weight[id] = ww;
12073 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12074 getline(ss, rs);
12075 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12076 get_last_committed() + 1));
12077 return true;
12078 } else {
12079 ss << "osd." << id << " does not exist";
12080 err = -ENOENT;
12081 goto reply;
12082 }
12083 } else if (prefix == "osd reweightn") {
12084 map<int32_t, uint32_t> weights;
12085 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12086 if (err) {
12087 ss << "unable to parse 'weights' value '"
12088 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12089 goto reply;
12090 }
12091 pending_inc.new_weight.insert(weights.begin(), weights.end());
12092 wait_for_finished_proposal(
12093 op,
12094 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12095 return true;
12096 } else if (prefix == "osd lost") {
12097 int64_t id;
12098 if (!cmd_getval(cmdmap, "id", id)) {
12099 ss << "unable to parse osd id value '"
12100 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12101 err = -EINVAL;
12102 goto reply;
12103 }
12104 bool sure = false;
12105 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12106 if (!sure) {
12107 ss << "are you SURE? this might mean real, permanent data loss. pass "
12108 "--yes-i-really-mean-it if you really do.";
12109 err = -EPERM;
12110 goto reply;
12111 } else if (!osdmap.exists(id)) {
12112 ss << "osd." << id << " does not exist";
12113 err = -ENOENT;
12114 goto reply;
12115 } else if (!osdmap.is_down(id)) {
12116 ss << "osd." << id << " is not down";
12117 err = -EBUSY;
12118 goto reply;
12119 } else {
12120 epoch_t e = osdmap.get_info(id).down_at;
12121 pending_inc.new_lost[id] = e;
12122 ss << "marked osd lost in epoch " << e;
12123 getline(ss, rs);
12124 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12125 get_last_committed() + 1));
12126 return true;
12127 }
12128
12129 } else if (prefix == "osd destroy-actual" ||
12130 prefix == "osd purge-actual" ||
12131 prefix == "osd purge-new") {
12132 /* Destroying an OSD means that we don't expect to further make use of
12133 * the OSDs data (which may even become unreadable after this operation),
12134 * and that we are okay with scrubbing all its cephx keys and config-key
12135 * data (which may include lockbox keys, thus rendering the osd's data
12136 * unreadable).
12137 *
12138 * The OSD will not be removed. Instead, we will mark it as destroyed,
12139 * such that a subsequent call to `create` will not reuse the osd id.
12140 * This will play into being able to recreate the OSD, at the same
12141 * crush location, with minimal data movement.
12142 */
12143
12144 // make sure authmon is writeable.
12145 if (!mon->authmon()->is_writeable()) {
12146 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12147 << "osd destroy" << dendl;
12148 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12149 return false;
12150 }
12151
12152 int64_t id;
12153 if (!cmd_getval(cmdmap, "id", id)) {
12154 auto p = cmdmap.find("id");
12155 if (p == cmdmap.end()) {
12156 ss << "no osd id specified";
12157 } else {
12158 ss << "unable to parse osd id value '"
12159 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12160 }
12161 err = -EINVAL;
12162 goto reply;
12163 }
12164
12165 bool is_destroy = (prefix == "osd destroy-actual");
12166 if (!is_destroy) {
12167 ceph_assert("osd purge-actual" == prefix ||
12168 "osd purge-new" == prefix);
12169 }
12170
12171 bool sure = false;
12172 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12173 if (!sure) {
12174 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12175 << "This will mean real, permanent data loss, as well "
12176 << "as deletion of cephx and lockbox keys. "
12177 << "Pass --yes-i-really-mean-it if you really do.";
12178 err = -EPERM;
12179 goto reply;
12180 } else if (!osdmap.exists(id)) {
12181 ss << "osd." << id << " does not exist";
12182 err = 0; // idempotent
12183 goto reply;
12184 } else if (osdmap.is_up(id)) {
12185 ss << "osd." << id << " is not `down`.";
12186 err = -EBUSY;
12187 goto reply;
12188 } else if (is_destroy && osdmap.is_destroyed(id)) {
12189 ss << "destroyed osd." << id;
12190 err = 0;
12191 goto reply;
12192 }
12193
12194 if (prefix == "osd purge-new" &&
12195 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12196 ss << "osd." << id << " is not new";
12197 err = -EPERM;
12198 goto reply;
12199 }
12200
12201 bool goto_reply = false;
12202
12203 paxos->plug();
12204 if (is_destroy) {
12205 err = prepare_command_osd_destroy(id, ss);
12206 // we checked above that it should exist.
12207 ceph_assert(err != -ENOENT);
12208 } else {
12209 err = prepare_command_osd_purge(id, ss);
12210 if (err == -ENOENT) {
12211 err = 0;
12212 ss << "osd." << id << " does not exist.";
12213 goto_reply = true;
12214 }
12215 }
12216 paxos->unplug();
12217
12218 if (err < 0 || goto_reply) {
12219 goto reply;
12220 }
12221
12222 if (is_destroy) {
12223 ss << "destroyed osd." << id;
12224 } else {
12225 ss << "purged osd." << id;
12226 }
12227
12228 getline(ss, rs);
12229 wait_for_finished_proposal(op,
12230 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12231 force_immediate_propose();
12232 return true;
12233
12234 } else if (prefix == "osd new") {
12235
12236 // make sure authmon is writeable.
12237 if (!mon->authmon()->is_writeable()) {
12238 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12239 << "osd new" << dendl;
12240 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12241 return false;
12242 }
12243
12244 map<string,string> param_map;
12245
12246 bufferlist bl = m->get_data();
12247 string param_json = bl.to_str();
12248 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12249
12250 err = get_json_str_map(param_json, ss, &param_map);
12251 if (err < 0)
12252 goto reply;
12253
12254 dout(20) << __func__ << " osd new params " << param_map << dendl;
12255
12256 paxos->plug();
12257 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12258 paxos->unplug();
12259
12260 if (err < 0) {
12261 goto reply;
12262 }
12263
12264 if (f) {
12265 f->flush(rdata);
12266 } else {
12267 rdata.append(ss);
12268 }
12269
12270 if (err == EEXIST) {
12271 // idempotent operation
12272 err = 0;
12273 goto reply;
12274 }
12275
12276 wait_for_finished_proposal(op,
12277 new Monitor::C_Command(mon, op, 0, rs, rdata,
12278 get_last_committed() + 1));
12279 force_immediate_propose();
12280 return true;
12281
12282 } else if (prefix == "osd create") {
12283
12284 // optional id provided?
12285 int64_t id = -1, cmd_id = -1;
12286 if (cmd_getval(cmdmap, "id", cmd_id)) {
12287 if (cmd_id < 0) {
12288 ss << "invalid osd id value '" << cmd_id << "'";
12289 err = -EINVAL;
12290 goto reply;
12291 }
12292 dout(10) << " osd create got id " << cmd_id << dendl;
12293 }
12294
12295 uuid_d uuid;
12296 string uuidstr;
12297 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12298 if (!uuid.parse(uuidstr.c_str())) {
12299 ss << "invalid uuid value '" << uuidstr << "'";
12300 err = -EINVAL;
12301 goto reply;
12302 }
12303 // we only care about the id if we also have the uuid, to
12304 // ensure the operation's idempotency.
12305 id = cmd_id;
12306 }
12307
12308 int32_t new_id = -1;
12309 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12310 if (err < 0) {
12311 if (err == -EAGAIN) {
12312 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12313 return true;
12314 }
12315 // a check has failed; reply to the user.
12316 goto reply;
12317
12318 } else if (err == EEXIST) {
12319 // this is an idempotent operation; we can go ahead and reply.
12320 if (f) {
12321 f->open_object_section("created_osd");
12322 f->dump_int("osdid", new_id);
12323 f->close_section();
12324 f->flush(rdata);
12325 } else {
12326 ss << new_id;
12327 rdata.append(ss);
12328 }
12329 err = 0;
12330 goto reply;
12331 }
12332
12333 string empty_device_class;
12334 do_osd_create(id, uuid, empty_device_class, &new_id);
12335
12336 if (f) {
12337 f->open_object_section("created_osd");
12338 f->dump_int("osdid", new_id);
12339 f->close_section();
12340 f->flush(rdata);
12341 } else {
12342 ss << new_id;
12343 rdata.append(ss);
12344 }
12345 wait_for_finished_proposal(op,
12346 new Monitor::C_Command(mon, op, 0, rs, rdata,
12347 get_last_committed() + 1));
12348 return true;
12349
12350 } else if (prefix == "osd blacklist clear") {
12351 pending_inc.new_blacklist.clear();
12352 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12353 osdmap.get_blacklist(&blacklist);
12354 for (const auto &entry : blacklist) {
12355 pending_inc.old_blacklist.push_back(entry.first);
12356 }
12357 ss << " removed all blacklist entries";
12358 getline(ss, rs);
12359 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12360 get_last_committed() + 1));
12361 return true;
12362 } else if (prefix == "osd blacklist") {
12363 string addrstr;
12364 cmd_getval(cmdmap, "addr", addrstr);
12365 entity_addr_t addr;
12366 if (!addr.parse(addrstr.c_str(), 0)) {
12367 ss << "unable to parse address " << addrstr;
12368 err = -EINVAL;
12369 goto reply;
12370 }
12371 else {
12372 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12373 // always blacklist type ANY
12374 addr.set_type(entity_addr_t::TYPE_ANY);
12375 } else {
12376 addr.set_type(entity_addr_t::TYPE_LEGACY);
12377 }
12378
12379 string blacklistop;
12380 cmd_getval(cmdmap, "blacklistop", blacklistop);
12381 if (blacklistop == "add") {
12382 utime_t expires = ceph_clock_now();
12383 double d;
12384 // default one hour
12385 cmd_getval(cmdmap, "expire", d,
12386 g_conf()->mon_osd_blacklist_default_expire);
12387 expires += d;
12388
12389 pending_inc.new_blacklist[addr] = expires;
12390
12391 {
12392 // cancel any pending un-blacklisting request too
12393 auto it = std::find(pending_inc.old_blacklist.begin(),
12394 pending_inc.old_blacklist.end(), addr);
12395 if (it != pending_inc.old_blacklist.end()) {
12396 pending_inc.old_blacklist.erase(it);
12397 }
12398 }
12399
12400 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12401 getline(ss, rs);
12402 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12403 get_last_committed() + 1));
12404 return true;
12405 } else if (blacklistop == "rm") {
12406 if (osdmap.is_blacklisted(addr) ||
12407 pending_inc.new_blacklist.count(addr)) {
12408 if (osdmap.is_blacklisted(addr))
12409 pending_inc.old_blacklist.push_back(addr);
12410 else
12411 pending_inc.new_blacklist.erase(addr);
12412 ss << "un-blacklisting " << addr;
12413 getline(ss, rs);
12414 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12415 get_last_committed() + 1));
12416 return true;
12417 }
12418 ss << addr << " isn't blacklisted";
12419 err = 0;
12420 goto reply;
12421 }
12422 }
12423 } else if (prefix == "osd pool mksnap") {
12424 string poolstr;
12425 cmd_getval(cmdmap, "pool", poolstr);
12426 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12427 if (pool < 0) {
12428 ss << "unrecognized pool '" << poolstr << "'";
12429 err = -ENOENT;
12430 goto reply;
12431 }
12432 string snapname;
12433 cmd_getval(cmdmap, "snap", snapname);
12434 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12435 if (p->is_unmanaged_snaps_mode()) {
12436 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12437 err = -EINVAL;
12438 goto reply;
12439 } else if (p->snap_exists(snapname.c_str())) {
12440 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12441 err = 0;
12442 goto reply;
12443 } else if (p->is_tier()) {
12444 ss << "pool " << poolstr << " is a cache tier";
12445 err = -EINVAL;
12446 goto reply;
12447 }
12448 pg_pool_t *pp = 0;
12449 if (pending_inc.new_pools.count(pool))
12450 pp = &pending_inc.new_pools[pool];
12451 if (!pp) {
12452 pp = &pending_inc.new_pools[pool];
12453 *pp = *p;
12454 }
12455 if (pp->snap_exists(snapname.c_str())) {
12456 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12457 } else {
12458 pp->add_snap(snapname.c_str(), ceph_clock_now());
12459 pp->set_snap_epoch(pending_inc.epoch);
12460 ss << "created pool " << poolstr << " snap " << snapname;
12461 }
12462 getline(ss, rs);
12463 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12464 get_last_committed() + 1));
12465 return true;
12466 } else if (prefix == "osd pool rmsnap") {
12467 string poolstr;
12468 cmd_getval(cmdmap, "pool", poolstr);
12469 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12470 if (pool < 0) {
12471 ss << "unrecognized pool '" << poolstr << "'";
12472 err = -ENOENT;
12473 goto reply;
12474 }
12475 string snapname;
12476 cmd_getval(cmdmap, "snap", snapname);
12477 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12478 if (p->is_unmanaged_snaps_mode()) {
12479 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12480 err = -EINVAL;
12481 goto reply;
12482 } else if (!p->snap_exists(snapname.c_str())) {
12483 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12484 err = 0;
12485 goto reply;
12486 }
12487 pg_pool_t *pp = 0;
12488 if (pending_inc.new_pools.count(pool))
12489 pp = &pending_inc.new_pools[pool];
12490 if (!pp) {
12491 pp = &pending_inc.new_pools[pool];
12492 *pp = *p;
12493 }
12494 snapid_t sn = pp->snap_exists(snapname.c_str());
12495 if (sn) {
12496 pp->remove_snap(sn);
12497 pp->set_snap_epoch(pending_inc.epoch);
12498 ss << "removed pool " << poolstr << " snap " << snapname;
12499 } else {
12500 ss << "already removed pool " << poolstr << " snap " << snapname;
12501 }
12502 getline(ss, rs);
12503 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12504 get_last_committed() + 1));
12505 return true;
12506 } else if (prefix == "osd pool create") {
12507 int64_t pg_num, pg_num_min;
12508 int64_t pgp_num;
12509 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12510 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12511 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12512
12513 string pool_type_str;
12514 cmd_getval(cmdmap, "pool_type", pool_type_str);
12515 if (pool_type_str.empty())
12516 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12517
12518 string poolstr;
12519 cmd_getval(cmdmap, "pool", poolstr);
12520 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12521 if (pool_id >= 0) {
12522 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12523 if (pool_type_str != p->get_type_name()) {
12524 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12525 err = -EINVAL;
12526 } else {
12527 ss << "pool '" << poolstr << "' already exists";
12528 err = 0;
12529 }
12530 goto reply;
12531 }
12532
12533 int pool_type;
12534 if (pool_type_str == "replicated") {
12535 pool_type = pg_pool_t::TYPE_REPLICATED;
12536 } else if (pool_type_str == "erasure") {
12537 pool_type = pg_pool_t::TYPE_ERASURE;
12538 } else {
12539 ss << "unknown pool type '" << pool_type_str << "'";
12540 err = -EINVAL;
12541 goto reply;
12542 }
12543
12544 bool implicit_rule_creation = false;
12545 int64_t expected_num_objects = 0;
12546 string rule_name;
12547 cmd_getval(cmdmap, "rule", rule_name);
12548 string erasure_code_profile;
12549 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12550
12551 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12552 if (erasure_code_profile == "")
12553 erasure_code_profile = "default";
12554 //handle the erasure code profile
12555 if (erasure_code_profile == "default") {
12556 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12557 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12558 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12559 goto wait;
12560 }
12561
12562 map<string,string> profile_map;
12563 err = osdmap.get_erasure_code_profile_default(cct,
12564 profile_map,
12565 &ss);
12566 if (err)
12567 goto reply;
12568 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12569 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12570 goto wait;
12571 }
12572 }
12573 if (rule_name == "") {
12574 implicit_rule_creation = true;
12575 if (erasure_code_profile == "default") {
12576 rule_name = "erasure-code";
12577 } else {
12578 dout(1) << "implicitly use rule named after the pool: "
12579 << poolstr << dendl;
12580 rule_name = poolstr;
12581 }
12582 }
12583 cmd_getval(cmdmap, "expected_num_objects",
12584 expected_num_objects, int64_t(0));
12585 } else {
12586 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12587 // and put expected_num_objects to rule field
12588 if (erasure_code_profile != "") { // cmd is from CLI
12589 if (rule_name != "") {
12590 string interr;
12591 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12592 if (interr.length()) {
12593 ss << "error parsing integer value '" << rule_name << "': " << interr;
12594 err = -EINVAL;
12595 goto reply;
12596 }
12597 }
12598 rule_name = erasure_code_profile;
12599 } else { // cmd is well-formed
12600 cmd_getval(cmdmap, "expected_num_objects",
12601 expected_num_objects, int64_t(0));
12602 }
12603 }
12604
12605 if (!implicit_rule_creation && rule_name != "") {
12606 int rule;
12607 err = get_crush_rule(rule_name, &rule, &ss);
12608 if (err == -EAGAIN) {
12609 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12610 return true;
12611 }
12612 if (err)
12613 goto reply;
12614 }
12615
12616 if (expected_num_objects < 0) {
12617 ss << "'expected_num_objects' must be non-negative";
12618 err = -EINVAL;
12619 goto reply;
12620 }
12621
12622 set<int32_t> osds;
12623 osdmap.get_all_osds(osds);
12624 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12625 string type;
12626 if (!get_osd_objectstore_type(osd, &type)) {
12627 return type == "filestore";
12628 } else {
12629 return false;
12630 }
12631 });
12632
12633 if (has_filestore_osd &&
12634 expected_num_objects > 0 &&
12635 cct->_conf->filestore_merge_threshold > 0) {
12636 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12637 err = -EINVAL;
12638 goto reply;
12639 }
12640
12641 if (has_filestore_osd &&
12642 expected_num_objects == 0 &&
12643 cct->_conf->filestore_merge_threshold < 0) {
12644 int osds = osdmap.get_num_osds();
12645 bool sure = false;
12646 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12647 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12648 ss << "For better initial performance on pools expected to store a "
12649 << "large number of objects, consider supplying the "
12650 << "expected_num_objects parameter when creating the pool."
12651 << " Pass --yes-i-really-mean-it to ignore it";
12652 err = -EPERM;
12653 goto reply;
12654 }
12655 }
12656
12657 int64_t fast_read_param;
12658 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12659 FastReadType fast_read = FAST_READ_DEFAULT;
12660 if (fast_read_param == 0)
12661 fast_read = FAST_READ_OFF;
12662 else if (fast_read_param > 0)
12663 fast_read = FAST_READ_ON;
12664
12665 int64_t repl_size = 0;
12666 cmd_getval(cmdmap, "size", repl_size);
12667 int64_t target_size_bytes = 0;
12668 double target_size_ratio = 0.0;
12669 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12670 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12671
12672 string pg_autoscale_mode;
12673 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12674
12675 err = prepare_new_pool(poolstr,
12676 -1, // default crush rule
12677 rule_name,
12678 pg_num, pgp_num, pg_num_min,
12679 repl_size, target_size_bytes, target_size_ratio,
12680 erasure_code_profile, pool_type,
12681 (uint64_t)expected_num_objects,
12682 fast_read,
12683 pg_autoscale_mode,
12684 &ss);
12685 if (err < 0) {
12686 switch(err) {
12687 case -EEXIST:
12688 ss << "pool '" << poolstr << "' already exists";
12689 break;
12690 case -EAGAIN:
12691 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12692 return true;
12693 case -ERANGE:
12694 goto reply;
12695 default:
12696 goto reply;
12697 break;
12698 }
12699 } else {
12700 ss << "pool '" << poolstr << "' created";
12701 }
12702 getline(ss, rs);
12703 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12704 get_last_committed() + 1));
12705 return true;
12706
12707 } else if (prefix == "osd pool delete" ||
12708 prefix == "osd pool rm") {
12709 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12710 string poolstr, poolstr2, sure;
12711 cmd_getval(cmdmap, "pool", poolstr);
12712 cmd_getval(cmdmap, "pool2", poolstr2);
12713 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12714 if (pool < 0) {
12715 ss << "pool '" << poolstr << "' does not exist";
12716 err = 0;
12717 goto reply;
12718 }
12719
12720 bool force_no_fake = false;
12721 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12722 bool force = false;
12723 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12724 if (poolstr2 != poolstr ||
12725 (!force && !force_no_fake)) {
12726 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12727 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12728 << "followed by --yes-i-really-really-mean-it.";
12729 err = -EPERM;
12730 goto reply;
12731 }
12732 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12733 if (err == -EAGAIN) {
12734 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12735 return true;
12736 }
12737 if (err < 0)
12738 goto reply;
12739 goto update;
12740 } else if (prefix == "osd pool rename") {
12741 string srcpoolstr, destpoolstr;
12742 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12743 cmd_getval(cmdmap, "destpool", destpoolstr);
12744 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12745 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12746
12747 if (pool_src < 0) {
12748 if (pool_dst >= 0) {
12749 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12750 // of operations, assume this rename succeeded, as it is not changing
12751 // the current state. Make sure we output something understandable
12752 // for whoever is issuing the command, if they are paying attention,
12753 // in case it was not intentional; or to avoid a "wtf?" and a bug
12754 // report in case it was intentional, while expecting a failure.
12755 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12756 << destpoolstr << "' does -- assuming successful rename";
12757 err = 0;
12758 } else {
12759 ss << "unrecognized pool '" << srcpoolstr << "'";
12760 err = -ENOENT;
12761 }
12762 goto reply;
12763 } else if (pool_dst >= 0) {
12764 // source pool exists and so does the destination pool
12765 ss << "pool '" << destpoolstr << "' already exists";
12766 err = -EEXIST;
12767 goto reply;
12768 }
12769
12770 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12771 if (ret == 0) {
12772 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12773 } else {
12774 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12775 << cpp_strerror(ret);
12776 }
12777 getline(ss, rs);
12778 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12779 get_last_committed() + 1));
12780 return true;
12781
12782 } else if (prefix == "osd pool set") {
12783 err = prepare_command_pool_set(cmdmap, ss);
12784 if (err == -EAGAIN)
12785 goto wait;
12786 if (err < 0)
12787 goto reply;
12788
12789 getline(ss, rs);
12790 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12791 get_last_committed() + 1));
12792 return true;
12793 } else if (prefix == "osd tier add") {
12794 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12795 if (err == -EAGAIN)
12796 goto wait;
12797 if (err)
12798 goto reply;
12799 string poolstr;
12800 cmd_getval(cmdmap, "pool", poolstr);
12801 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12802 if (pool_id < 0) {
12803 ss << "unrecognized pool '" << poolstr << "'";
12804 err = -ENOENT;
12805 goto reply;
12806 }
12807 string tierpoolstr;
12808 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12809 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12810 if (tierpool_id < 0) {
12811 ss << "unrecognized pool '" << tierpoolstr << "'";
12812 err = -ENOENT;
12813 goto reply;
12814 }
12815 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12816 ceph_assert(p);
12817 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12818 ceph_assert(tp);
12819
12820 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12821 goto reply;
12822 }
12823
12824 // make sure new tier is empty
12825 string force_nonempty;
12826 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12827 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12828 if (pstats && pstats->stats.sum.num_objects != 0 &&
12829 force_nonempty != "--force-nonempty") {
12830 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12831 err = -ENOTEMPTY;
12832 goto reply;
12833 }
12834 if (tp->is_erasure()) {
12835 ss << "tier pool '" << tierpoolstr
12836 << "' is an ec pool, which cannot be a tier";
12837 err = -ENOTSUP;
12838 goto reply;
12839 }
12840 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12841 ((force_nonempty != "--force-nonempty") ||
12842 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12843 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12844 err = -ENOTEMPTY;
12845 goto reply;
12846 }
12847 // go
12848 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12849 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12850 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12851 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12852 return true;
12853 }
12854 np->tiers.insert(tierpool_id);
12855 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12856 ntp->tier_of = pool_id;
12857 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12858 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12859 get_last_committed() + 1));
12860 return true;
12861 } else if (prefix == "osd tier remove" ||
12862 prefix == "osd tier rm") {
12863 string poolstr;
12864 cmd_getval(cmdmap, "pool", poolstr);
12865 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12866 if (pool_id < 0) {
12867 ss << "unrecognized pool '" << poolstr << "'";
12868 err = -ENOENT;
12869 goto reply;
12870 }
12871 string tierpoolstr;
12872 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12873 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12874 if (tierpool_id < 0) {
12875 ss << "unrecognized pool '" << tierpoolstr << "'";
12876 err = -ENOENT;
12877 goto reply;
12878 }
12879 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12880 ceph_assert(p);
12881 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12882 ceph_assert(tp);
12883
12884 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12885 goto reply;
12886 }
12887
12888 if (p->tiers.count(tierpool_id) == 0) {
12889 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12890 err = 0;
12891 goto reply;
12892 }
12893 if (tp->tier_of != pool_id) {
12894 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12895 << osdmap.get_pool_name(tp->tier_of) << "': "
12896 // be scary about it; this is an inconsistency and bells must go off
12897 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12898 err = -EINVAL;
12899 goto reply;
12900 }
12901 if (p->read_tier == tierpool_id) {
12902 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12903 err = -EBUSY;
12904 goto reply;
12905 }
12906 // go
12907 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12908 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12909 if (np->tiers.count(tierpool_id) == 0 ||
12910 ntp->tier_of != pool_id ||
12911 np->read_tier == tierpool_id) {
12912 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12913 return true;
12914 }
12915 np->tiers.erase(tierpool_id);
12916 ntp->clear_tier();
12917 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12918 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12919 get_last_committed() + 1));
12920 return true;
12921 } else if (prefix == "osd tier set-overlay") {
12922 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12923 if (err == -EAGAIN)
12924 goto wait;
12925 if (err)
12926 goto reply;
12927 string poolstr;
12928 cmd_getval(cmdmap, "pool", poolstr);
12929 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12930 if (pool_id < 0) {
12931 ss << "unrecognized pool '" << poolstr << "'";
12932 err = -ENOENT;
12933 goto reply;
12934 }
12935 string overlaypoolstr;
12936 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12937 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12938 if (overlaypool_id < 0) {
12939 ss << "unrecognized pool '" << overlaypoolstr << "'";
12940 err = -ENOENT;
12941 goto reply;
12942 }
12943 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12944 ceph_assert(p);
12945 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12946 ceph_assert(overlay_p);
12947 if (p->tiers.count(overlaypool_id) == 0) {
12948 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12949 err = -EINVAL;
12950 goto reply;
12951 }
12952 if (p->read_tier == overlaypool_id) {
12953 err = 0;
12954 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12955 goto reply;
12956 }
12957 if (p->has_read_tier()) {
12958 ss << "pool '" << poolstr << "' has overlay '"
12959 << osdmap.get_pool_name(p->read_tier)
12960 << "'; please remove-overlay first";
12961 err = -EINVAL;
12962 goto reply;
12963 }
12964
12965 // go
12966 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12967 np->read_tier = overlaypool_id;
12968 np->write_tier = overlaypool_id;
12969 np->set_last_force_op_resend(pending_inc.epoch);
12970 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12971 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12972 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12973 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12974 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12975 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12976 get_last_committed() + 1));
12977 return true;
12978 } else if (prefix == "osd tier remove-overlay" ||
12979 prefix == "osd tier rm-overlay") {
12980 string poolstr;
12981 cmd_getval(cmdmap, "pool", poolstr);
12982 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12983 if (pool_id < 0) {
12984 ss << "unrecognized pool '" << poolstr << "'";
12985 err = -ENOENT;
12986 goto reply;
12987 }
12988 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12989 ceph_assert(p);
12990 if (!p->has_read_tier()) {
12991 err = 0;
12992 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12993 goto reply;
12994 }
12995
12996 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12997 goto reply;
12998 }
12999
13000 // go
13001 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13002 if (np->has_read_tier()) {
13003 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13004 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13005 nop->set_last_force_op_resend(pending_inc.epoch);
13006 }
13007 if (np->has_write_tier()) {
13008 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13009 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13010 nop->set_last_force_op_resend(pending_inc.epoch);
13011 }
13012 np->clear_read_tier();
13013 np->clear_write_tier();
13014 np->set_last_force_op_resend(pending_inc.epoch);
13015 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13016 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13017 get_last_committed() + 1));
13018 return true;
13019 } else if (prefix == "osd tier cache-mode") {
13020 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13021 if (err == -EAGAIN)
13022 goto wait;
13023 if (err)
13024 goto reply;
13025 string poolstr;
13026 cmd_getval(cmdmap, "pool", poolstr);
13027 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13028 if (pool_id < 0) {
13029 ss << "unrecognized pool '" << poolstr << "'";
13030 err = -ENOENT;
13031 goto reply;
13032 }
13033 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13034 ceph_assert(p);
13035 if (!p->is_tier()) {
13036 ss << "pool '" << poolstr << "' is not a tier";
13037 err = -EINVAL;
13038 goto reply;
13039 }
13040 string modestr;
13041 cmd_getval(cmdmap, "mode", modestr);
13042 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13043 if (int(mode) < 0) {
13044 ss << "'" << modestr << "' is not a valid cache mode";
13045 err = -EINVAL;
13046 goto reply;
13047 }
13048
13049 bool sure = false;
13050 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13051
13052 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13053 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13054 ss << "'" << modestr << "' is no longer a supported cache mode";
13055 err = -EPERM;
13056 goto reply;
13057 }
13058 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13059 mode != pg_pool_t::CACHEMODE_NONE &&
13060 mode != pg_pool_t::CACHEMODE_PROXY &&
13061 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13062 !sure) {
13063 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13064 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13065 err = -EPERM;
13066 goto reply;
13067 }
13068
13069 // pool already has this cache-mode set and there are no pending changes
13070 if (p->cache_mode == mode &&
13071 (pending_inc.new_pools.count(pool_id) == 0 ||
13072 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13073 ss << "set cache-mode for pool '" << poolstr << "'"
13074 << " to " << pg_pool_t::get_cache_mode_name(mode);
13075 err = 0;
13076 goto reply;
13077 }
13078
13079 /* Mode description:
13080 *
13081 * none: No cache-mode defined
13082 * forward: Forward all reads and writes to base pool [removed]
13083 * writeback: Cache writes, promote reads from base pool
13084 * readonly: Forward writes to base pool
13085 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13086 * proxy: Proxy all reads and writes to base pool
13087 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13088 *
13089 * Hence, these are the allowed transitions:
13090 *
13091 * none -> any
13092 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13093 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13094 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13095 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13096 * writeback -> readproxy || proxy
13097 * readonly -> any
13098 */
13099
13100 // We check if the transition is valid against the current pool mode, as
13101 // it is the only committed state thus far. We will blantly squash
13102 // whatever mode is on the pending state.
13103
13104 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13105 (mode != pg_pool_t::CACHEMODE_PROXY &&
13106 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13107 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13108 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13109 << "' pool; only '"
13110 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13111 << "' allowed.";
13112 err = -EINVAL;
13113 goto reply;
13114 }
13115 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13116 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13117 mode != pg_pool_t::CACHEMODE_PROXY &&
13118 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13119
13120 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13121 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13122 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13123
13124 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13125 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13126 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13127
13128 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13129 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13130 mode != pg_pool_t::CACHEMODE_PROXY &&
13131 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13132
13133 const pool_stat_t* pstats =
13134 mon->mgrstatmon()->get_pool_stat(pool_id);
13135
13136 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13137 ss << "unable to set cache-mode '"
13138 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13139 << "': dirty objects found";
13140 err = -EBUSY;
13141 goto reply;
13142 }
13143 }
13144 // go
13145 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13146 np->cache_mode = mode;
13147 // set this both when moving to and from cache_mode NONE. this is to
13148 // capture legacy pools that were set up before this flag existed.
13149 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13150 ss << "set cache-mode for pool '" << poolstr
13151 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13152 if (mode == pg_pool_t::CACHEMODE_NONE) {
13153 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13154 ceph_assert(base_pool);
13155 if (base_pool->read_tier == pool_id ||
13156 base_pool->write_tier == pool_id)
13157 ss <<" (WARNING: pool is still configured as read or write tier)";
13158 }
13159 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13160 get_last_committed() + 1));
13161 return true;
13162 } else if (prefix == "osd tier add-cache") {
13163 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13164 if (err == -EAGAIN)
13165 goto wait;
13166 if (err)
13167 goto reply;
13168 string poolstr;
13169 cmd_getval(cmdmap, "pool", poolstr);
13170 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13171 if (pool_id < 0) {
13172 ss << "unrecognized pool '" << poolstr << "'";
13173 err = -ENOENT;
13174 goto reply;
13175 }
13176 string tierpoolstr;
13177 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13178 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13179 if (tierpool_id < 0) {
13180 ss << "unrecognized pool '" << tierpoolstr << "'";
13181 err = -ENOENT;
13182 goto reply;
13183 }
13184 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13185 ceph_assert(p);
13186 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13187 ceph_assert(tp);
13188
13189 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13190 goto reply;
13191 }
13192
13193 int64_t size = 0;
13194 if (!cmd_getval(cmdmap, "size", size)) {
13195 ss << "unable to parse 'size' value '"
13196 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13197 err = -EINVAL;
13198 goto reply;
13199 }
13200 // make sure new tier is empty
13201 const pool_stat_t *pstats =
13202 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13203 if (pstats && pstats->stats.sum.num_objects != 0) {
13204 ss << "tier pool '" << tierpoolstr << "' is not empty";
13205 err = -ENOTEMPTY;
13206 goto reply;
13207 }
13208 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13209 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13210 if (int(mode) < 0) {
13211 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13212 err = -EINVAL;
13213 goto reply;
13214 }
13215 HitSet::Params hsp;
13216 auto& cache_hit_set_type =
13217 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13218 if (cache_hit_set_type == "bloom") {
13219 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13220 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13221 hsp = HitSet::Params(bsp);
13222 } else if (cache_hit_set_type == "explicit_hash") {
13223 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13224 } else if (cache_hit_set_type == "explicit_object") {
13225 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13226 } else {
13227 ss << "osd tier cache default hit set type '"
13228 << cache_hit_set_type << "' is not a known type";
13229 err = -EINVAL;
13230 goto reply;
13231 }
13232 // go
13233 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13234 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13235 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13236 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13237 return true;
13238 }
13239 np->tiers.insert(tierpool_id);
13240 np->read_tier = np->write_tier = tierpool_id;
13241 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13242 np->set_last_force_op_resend(pending_inc.epoch);
13243 ntp->set_last_force_op_resend(pending_inc.epoch);
13244 ntp->tier_of = pool_id;
13245 ntp->cache_mode = mode;
13246 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13247 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13248 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13249 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13250 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13251 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13252 ntp->hit_set_params = hsp;
13253 ntp->target_max_bytes = size;
13254 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13255 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13256 get_last_committed() + 1));
13257 return true;
13258 } else if (prefix == "osd pool set-quota") {
13259 string poolstr;
13260 cmd_getval(cmdmap, "pool", poolstr);
13261 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13262 if (pool_id < 0) {
13263 ss << "unrecognized pool '" << poolstr << "'";
13264 err = -ENOENT;
13265 goto reply;
13266 }
13267
13268 string field;
13269 cmd_getval(cmdmap, "field", field);
13270 if (field != "max_objects" && field != "max_bytes") {
13271 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13272 err = -EINVAL;
13273 goto reply;
13274 }
13275
13276 // val could contain unit designations, so we treat as a string
13277 string val;
13278 cmd_getval(cmdmap, "val", val);
13279 string tss;
13280 int64_t value;
13281 if (field == "max_objects") {
13282 value = strict_sistrtoll(val.c_str(), &tss);
13283 } else if (field == "max_bytes") {
13284 value = strict_iecstrtoll(val.c_str(), &tss);
13285 } else {
13286 ceph_abort_msg("unrecognized option");
13287 }
13288 if (!tss.empty()) {
13289 ss << "error parsing value '" << val << "': " << tss;
13290 err = -EINVAL;
13291 goto reply;
13292 }
13293
13294 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13295 if (field == "max_objects") {
13296 pi->quota_max_objects = value;
13297 } else if (field == "max_bytes") {
13298 pi->quota_max_bytes = value;
13299 } else {
13300 ceph_abort_msg("unrecognized option");
13301 }
13302 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13303 rs = ss.str();
13304 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13305 get_last_committed() + 1));
13306 return true;
13307 } else if (prefix == "osd pool application enable" ||
13308 prefix == "osd pool application disable" ||
13309 prefix == "osd pool application set" ||
13310 prefix == "osd pool application rm") {
13311 err = prepare_command_pool_application(prefix, cmdmap, ss);
13312 if (err == -EAGAIN) {
13313 goto wait;
13314 } else if (err < 0) {
13315 goto reply;
13316 } else {
13317 goto update;
13318 }
13319 } else if (prefix == "osd force-create-pg") {
13320 pg_t pgid;
13321 string pgidstr;
13322 cmd_getval(cmdmap, "pgid", pgidstr);
13323 if (!pgid.parse(pgidstr.c_str())) {
13324 ss << "invalid pgid '" << pgidstr << "'";
13325 err = -EINVAL;
13326 goto reply;
13327 }
13328 if (!osdmap.pg_exists(pgid)) {
13329 ss << "pg " << pgid << " should not exist";
13330 err = -ENOENT;
13331 goto reply;
13332 }
13333 bool sure = false;
13334 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13335 if (!sure) {
13336 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13337 << "that the cluster will give up ever trying to recover the lost data. Do this "
13338 << "only if you are certain that all copies of the PG are in fact lost and you are "
13339 << "willing to accept that the data is permanently destroyed. Pass "
13340 << "--yes-i-really-mean-it to proceed.";
13341 err = -EPERM;
13342 goto reply;
13343 }
13344 bool creating_now;
13345 {
13346 std::lock_guard<std::mutex> l(creating_pgs_lock);
13347 auto emplaced = creating_pgs.pgs.emplace(
13348 pgid,
13349 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13350 ceph_clock_now()));
13351 creating_now = emplaced.second;
13352 }
13353 if (creating_now) {
13354 ss << "pg " << pgidstr << " now creating, ok";
13355 // set the pool's CREATING flag so that (1) the osd won't ignore our
13356 // create message and (2) we won't propose any future pg_num changes
13357 // until after the PG has been instantiated.
13358 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13359 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13360 }
13361 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13362 err = 0;
13363 goto update;
13364 } else {
13365 ss << "pg " << pgid << " already creating";
13366 err = 0;
13367 goto reply;
13368 }
13369 } else {
13370 err = -EINVAL;
13371 }
13372
13373 reply:
13374 getline(ss, rs);
13375 if (err < 0 && rs.length() == 0)
13376 rs = cpp_strerror(err);
13377 mon->reply_command(op, err, rs, rdata, get_last_committed());
13378 return ret;
13379
13380 update:
13381 getline(ss, rs);
13382 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13383 get_last_committed() + 1));
13384 return true;
13385
13386 wait:
13387 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13388 return true;
13389 }
13390
13391 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13392 {
13393 op->mark_osdmon_event(__func__);
13394
13395 auto m = op->get_req<MPoolOp>();
13396 MonSession *session = op->get_session();
13397 if (!session) {
13398 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13399 return true;
13400 }
13401
13402 switch (m->op) {
13403 case POOL_OP_CREATE_UNMANAGED_SNAP:
13404 case POOL_OP_DELETE_UNMANAGED_SNAP:
13405 {
13406 const std::string* pool_name = nullptr;
13407 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13408 if (pg_pool != nullptr) {
13409 pool_name = &osdmap.get_pool_name(m->pool);
13410 }
13411
13412 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13413 session->entity_name, session->caps,
13414 session->get_peer_socket_addr(),
13415 pool_name)) {
13416 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13417 << "privileges. message: " << *m << std::endl
13418 << "caps: " << session->caps << dendl;
13419 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13420 return true;
13421 }
13422 }
13423 break;
13424 default:
13425 if (!session->is_capable("osd", MON_CAP_W)) {
13426 dout(0) << "got pool op from entity with insufficient privileges. "
13427 << "message: " << *m << std::endl
13428 << "caps: " << session->caps << dendl;
13429 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13430 return true;
13431 }
13432 break;
13433 }
13434
13435 return false;
13436 }
13437
13438 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
13439 {
13440 op->mark_osdmon_event(__func__);
13441 auto m = op->get_req<MPoolOp>();
13442
13443 if (enforce_pool_op_caps(op)) {
13444 return true;
13445 }
13446
13447 if (m->fsid != mon->monmap->fsid) {
13448 dout(0) << __func__ << " drop message on fsid " << m->fsid
13449 << " != " << mon->monmap->fsid << " for " << *m << dendl;
13450 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13451 return true;
13452 }
13453
13454 if (m->op == POOL_OP_CREATE)
13455 return preprocess_pool_op_create(op);
13456
13457 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
13458 if (p == nullptr) {
13459 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
13460 if (m->op == POOL_OP_DELETE) {
13461 _pool_op_reply(op, 0, osdmap.get_epoch());
13462 } else {
13463 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13464 }
13465 return true;
13466 }
13467
13468 // check if the snap and snapname exist
13469 bool snap_exists = false;
13470 if (p->snap_exists(m->name.c_str()))
13471 snap_exists = true;
13472
13473 switch (m->op) {
13474 case POOL_OP_CREATE_SNAP:
13475 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
13476 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13477 return true;
13478 }
13479 if (snap_exists) {
13480 _pool_op_reply(op, 0, osdmap.get_epoch());
13481 return true;
13482 }
13483 return false;
13484 case POOL_OP_CREATE_UNMANAGED_SNAP:
13485 if (p->is_pool_snaps_mode()) {
13486 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13487 return true;
13488 }
13489 return false;
13490 case POOL_OP_DELETE_SNAP:
13491 if (p->is_unmanaged_snaps_mode()) {
13492 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13493 return true;
13494 }
13495 if (!snap_exists) {
13496 _pool_op_reply(op, 0, osdmap.get_epoch());
13497 return true;
13498 }
13499 return false;
13500 case POOL_OP_DELETE_UNMANAGED_SNAP:
13501 if (p->is_pool_snaps_mode()) {
13502 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13503 return true;
13504 }
13505 if (_is_removed_snap(m->pool, m->snapid)) {
13506 _pool_op_reply(op, 0, osdmap.get_epoch());
13507 return true;
13508 }
13509 return false;
13510 case POOL_OP_DELETE:
13511 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
13512 _pool_op_reply(op, 0, osdmap.get_epoch());
13513 return true;
13514 }
13515 return false;
13516 case POOL_OP_AUID_CHANGE:
13517 return false;
13518 default:
13519 ceph_abort();
13520 break;
13521 }
13522
13523 return false;
13524 }
13525
13526 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13527 {
13528 if (!osdmap.have_pg_pool(pool)) {
13529 dout(10) << __func__ << " pool " << pool << " snap " << snap
13530 << " - pool dne" << dendl;
13531 return true;
13532 }
13533 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13534 dout(10) << __func__ << " pool " << pool << " snap " << snap
13535 << " - in osdmap removed_snaps_queue" << dendl;
13536 return true;
13537 }
13538 snapid_t begin, end;
13539 int r = lookup_purged_snap(pool, snap, &begin, &end);
13540 if (r == 0) {
13541 dout(10) << __func__ << " pool " << pool << " snap " << snap
13542 << " - purged, [" << begin << "," << end << ")" << dendl;
13543 return true;
13544 }
13545 return false;
13546 }
13547
13548 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13549 {
13550 if (pending_inc.old_pools.count(pool)) {
13551 dout(10) << __func__ << " pool " << pool << " snap " << snap
13552 << " - pool pending deletion" << dendl;
13553 return true;
13554 }
13555 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13556 dout(10) << __func__ << " pool " << pool << " snap " << snap
13557 << " - in pending new_removed_snaps" << dendl;
13558 return true;
13559 }
13560 return false;
13561 }
13562
13563 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13564 {
13565 op->mark_osdmon_event(__func__);
13566 auto m = op->get_req<MPoolOp>();
13567 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13568 if (pool >= 0) {
13569 _pool_op_reply(op, 0, osdmap.get_epoch());
13570 return true;
13571 }
13572
13573 return false;
13574 }
13575
// Apply an MPoolOp to the pending osdmap increment: pool create/delete
// dispatch, plus managed ("pool") and unmanaged snapshot create/delete.
// Returns true when a proposal was queued (the client reply is sent after
// the paxos round commits), false when we replied immediately (error or
// idempotent no-op against the committed map).
13576 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
13577 {
13578 op->mark_osdmon_event(__func__);
13579 auto m = op->get_req<MPoolOp>();
13580 dout(10) << "prepare_pool_op " << *m << dendl;
// Pool create/delete have dedicated handlers; everything below operates
// on the snapshots of an existing pool.
13581 if (m->op == POOL_OP_CREATE) {
13582 return prepare_pool_op_create(op);
13583 } else if (m->op == POOL_OP_DELETE) {
13584 return prepare_pool_op_delete(op);
13585 }
13586
13587 int ret = 0;
13588 bool changed = false;
13589
13590 if (!osdmap.have_pg_pool(m->pool)) {
13591 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13592 return false;
13593 }
13594
13595 const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
13596
// Preliminary checks against the *committed* map.  Replies issued here do
// not touch pending_inc: they cover invalid-mode errors and idempotent
// requests (snap already exists / already gone).
13597 switch (m->op) {
13598 case POOL_OP_CREATE_SNAP:
// Cache-tier pools cannot take pool snapshots.
13599 if (pool->is_tier()) {
13600 ret = -EINVAL;
13601 _pool_op_reply(op, ret, osdmap.get_epoch());
13602 return false;
13603 } // else, fall through
13604 case POOL_OP_DELETE_SNAP:
13605 if (!pool->is_unmanaged_snaps_mode()) {
13606 bool snap_exists = pool->snap_exists(m->name.c_str());
// Idempotent cases reply 0 below; otherwise break out and do real work.
13607 if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
13608 || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
13609 ret = 0;
13610 } else {
13611 break;
13612 }
13613 } else {
13614 ret = -EINVAL;
13615 }
13616 _pool_op_reply(op, ret, osdmap.get_epoch());
13617 return false;
13618
13619 case POOL_OP_DELETE_UNMANAGED_SNAP:
13620 // we won't allow removal of an unmanaged snapshot from a pool
13621 // not in unmanaged snaps mode.
13622 if (!pool->is_unmanaged_snaps_mode()) {
13623 _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
13624 return false;
13625 }
13626 /* fall-thru */
13627 case POOL_OP_CREATE_UNMANAGED_SNAP:
13628 // but we will allow creating an unmanaged snapshot on any pool
13629 // as long as it is not in 'pool' snaps mode.
13630 if (pool->is_pool_snaps_mode()) {
13631 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13632 return false;
13633 }
13634 }
13635
13636 // projected pool info
// Start from the pending copy of the pool if one is already staged this
// epoch, so successive ops in the same proposal compose correctly.
13637 pg_pool_t pp;
13638 if (pending_inc.new_pools.count(m->pool))
13639 pp = pending_inc.new_pools[m->pool];
13640 else
13641 pp = *osdmap.get_pg_pool(m->pool);
13642
13643 bufferlist reply_data;
13644
13645 // pool snaps vs unmanaged snaps are mutually exclusive
// Re-check against the *projected* pool: the pending increment may have
// flipped the snap mode since the committed-map checks above.
13646 switch (m->op) {
13647 case POOL_OP_CREATE_SNAP:
13648 case POOL_OP_DELETE_SNAP:
13649 if (pp.is_unmanaged_snaps_mode()) {
13650 ret = -EINVAL;
13651 goto out;
13652 }
13653 break;
13654
13655 case POOL_OP_CREATE_UNMANAGED_SNAP:
13656 case POOL_OP_DELETE_UNMANAGED_SNAP:
13657 if (pp.is_pool_snaps_mode()) {
13658 ret = -EINVAL;
13659 goto out;
13660 }
13661 }
13662
// Apply the requested change to the projected pool.
13663 switch (m->op) {
13664 case POOL_OP_CREATE_SNAP:
13665 if (!pp.snap_exists(m->name.c_str())) {
13666 pp.add_snap(m->name.c_str(), ceph_clock_now());
13667 dout(10) << "create snap in pool " << m->pool << " " << m->name
13668 << " seq " << pp.get_snap_epoch() << dendl;
13669 changed = true;
13670 }
13671 break;
13672
13673 case POOL_OP_DELETE_SNAP:
13674 {
13675 snapid_t s = pp.snap_exists(m->name.c_str());
13676 if (s) {
13677 pp.remove_snap(s);
13678 pending_inc.new_removed_snaps[m->pool].insert(s);
13679 changed = true;
13680 }
13681 }
13682 break;
13683
13684 case POOL_OP_CREATE_UNMANAGED_SNAP:
13685 {
// Reply payload carries the newly allocated snapid back to the client.
13686 uint64_t snapid = pp.add_unmanaged_snap(
13687 osdmap.require_osd_release < ceph_release_t::octopus);
13688 encode(snapid, reply_data);
13689 changed = true;
13690 }
13691 break;
13692
13693 case POOL_OP_DELETE_UNMANAGED_SNAP:
// Skip snaps already removed (committed or pending): idempotent success.
13694 if (!_is_removed_snap(m->pool, m->snapid) &&
13695 !_is_pending_removed_snap(m->pool, m->snapid)) {
// A snapid beyond the pool's current seq was never allocated.
13696 if (m->snapid > pp.get_snap_seq()) {
13697 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13698 return false;
13699 }
13700 pp.remove_unmanaged_snap(
13701 m->snapid,
13702 osdmap.require_osd_release < ceph_release_t::octopus);
13703 pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
13704 // also record the new seq as purged: this avoids a discontinuity
13705 // after all of the snaps have been purged, since the seq assigned
13706 // during removal lives in the same namespace as the actual snaps.
13707 pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
13708 changed = true;
13709 }
13710 break;
13711
13712 case POOL_OP_AUID_CHANGE:
// auid support was removed; reject explicitly.
13713 _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
13714 return false;
13715
13716 default:
13717 ceph_abort();
13718 break;
13719 }
13720
// Stage the modified pool into the pending increment.
13721 if (changed) {
13722 pp.set_snap_epoch(pending_inc.epoch);
13723 pending_inc.new_pools[m->pool] = pp;
13724 }
13725
13726 out:
// Reply (with ret and any reply_data) once the proposal commits.
13727 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
13728 return true;
13729 }
13730
13731 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13732 {
13733 op->mark_osdmon_event(__func__);
13734 int err = prepare_new_pool(op);
13735 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13736 return true;
13737 }
13738
13739 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13740 ostream *ss)
13741 {
13742 const string& poolstr = osdmap.get_pool_name(pool_id);
13743
13744 // If the Pool is in use by CephFS, refuse to delete it
13745 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13746 if (pending_fsmap.pool_in_use(pool_id)) {
13747 *ss << "pool '" << poolstr << "' is in use by CephFS";
13748 return -EBUSY;
13749 }
13750
13751 if (pool.tier_of >= 0) {
13752 *ss << "pool '" << poolstr << "' is a tier of '"
13753 << osdmap.get_pool_name(pool.tier_of) << "'";
13754 return -EBUSY;
13755 }
13756 if (!pool.tiers.empty()) {
13757 *ss << "pool '" << poolstr << "' has tiers";
13758 for(auto tier : pool.tiers) {
13759 *ss << " " << osdmap.get_pool_name(tier);
13760 }
13761 return -EBUSY;
13762 }
13763
13764 if (!g_conf()->mon_allow_pool_delete) {
13765 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13766 return -EPERM;
13767 }
13768
13769 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13770 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13771 return -EPERM;
13772 }
13773
13774 *ss << "pool '" << poolstr << "' removed";
13775 return 0;
13776 }
13777
13778 /**
13779 * Check if it is safe to add a tier to a base pool
13780 *
13781 * @return
13782 * True if the operation should proceed, false if we should abort here
13783 * (abort doesn't necessarily mean error, could be idempotency)
13784 */
13785 bool OSDMonitor::_check_become_tier(
13786 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13787 const int64_t base_pool_id, const pg_pool_t *base_pool,
13788 int *err,
13789 ostream *ss) const
13790 {
13791 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13792 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13793
13794 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13795 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13796 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13797 *err = -EBUSY;
13798 return false;
13799 }
13800
13801 if (base_pool->tiers.count(tier_pool_id)) {
13802 ceph_assert(tier_pool->tier_of == base_pool_id);
13803 *err = 0;
13804 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13805 << base_pool_name << "'";
13806 return false;
13807 }
13808
13809 if (base_pool->is_tier()) {
13810 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13811 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13812 << "multiple tiers are not yet supported.";
13813 *err = -EINVAL;
13814 return false;
13815 }
13816
13817 if (tier_pool->has_tiers()) {
13818 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13819 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13820 it != tier_pool->tiers.end(); ++it)
13821 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13822 *ss << " multiple tiers are not yet supported.";
13823 *err = -EINVAL;
13824 return false;
13825 }
13826
13827 if (tier_pool->is_tier()) {
13828 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13829 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13830 *err = -EINVAL;
13831 return false;
13832 }
13833
13834 *err = 0;
13835 return true;
13836 }
13837
13838
13839 /**
13840 * Check if it is safe to remove a tier from this base pool
13841 *
13842 * @return
13843 * True if the operation should proceed, false if we should abort here
13844 * (abort doesn't necessarily mean error, could be idempotency)
13845 */
13846 bool OSDMonitor::_check_remove_tier(
13847 const int64_t base_pool_id, const pg_pool_t *base_pool,
13848 const pg_pool_t *tier_pool,
13849 int *err, ostream *ss) const
13850 {
13851 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13852
13853 // Apply CephFS-specific checks
13854 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13855 if (pending_fsmap.pool_in_use(base_pool_id)) {
13856 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13857 // If the underlying pool is erasure coded and does not allow EC
13858 // overwrites, we can't permit the removal of the replicated tier that
13859 // CephFS relies on to access it
13860 *ss << "pool '" << base_pool_name <<
13861 "' does not allow EC overwrites and is in use by CephFS"
13862 " via its tier";
13863 *err = -EBUSY;
13864 return false;
13865 }
13866
13867 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13868 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13869 "tier is still in use as a writeback cache. Change the cache "
13870 "mode and flush the cache before removing it";
13871 *err = -EBUSY;
13872 return false;
13873 }
13874 }
13875
13876 *err = 0;
13877 return true;
13878 }
13879
// Stage removal of `pool` in the pending increment, after validating with
// _check_remove_pool against both the committed and any pending pool info.
// When mon_fake_pool_delete is set (and !no_fake) the pool is renamed to
// "<name>.<id>.DELETED" instead of being removed.  Also scrubs all
// pg_temp / primary_temp / pg_upmap / pg_upmap_items state referencing the
// pool, plus its crush choose_args.  Returns 0, a negative errno from the
// checks, or -EAGAIN if pending info looks inconsistent and the op should
// be retried after the current proposal commits.
13880 int OSDMonitor::_prepare_remove_pool(
13881 int64_t pool, ostream *ss, bool no_fake)
13882 {
13883 dout(10) << __func__ << " " << pool << dendl;
13884 const pg_pool_t *p = osdmap.get_pg_pool(pool);
// Validate against the committed pool info first.
13885 int r = _check_remove_pool(pool, *p, ss);
13886 if (r < 0)
13887 return r;
13888
13889 auto new_pool = pending_inc.new_pools.find(pool);
13890 if (new_pool != pending_inc.new_pools.end()) {
13891 // if there is a problem with the pending info, wait and retry
13892 // this op.
13893 const auto& p = new_pool->second;
13894 int r = _check_remove_pool(pool, p, ss);
13895 if (r < 0)
13896 return -EAGAIN;
13897 }
13898
// Idempotency: already queued for removal in this epoch.
13899 if (pending_inc.old_pools.count(pool)) {
13900 dout(10) << __func__ << " " << pool << " already pending removal"
13901 << dendl;
13902 return 0;
13903 }
13904
// Fake deletion: rename the pool out of the way instead of destroying data.
13905 if (g_conf()->mon_fake_pool_delete && !no_fake) {
13906 string old_name = osdmap.get_pool_name(pool);
13907 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13908 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13909 << old_name << " -> " << new_name << dendl;
13910 pending_inc.new_pool_names[pool] = new_name;
13911 return 0;
13912 }
13913
13914 // remove
13915 pending_inc.old_pools.insert(pool);
13916
13917 // remove any pg_temp mappings for this pool
13918 for (auto p = osdmap.pg_temp->begin();
13919 p != osdmap.pg_temp->end();
13920 ++p) {
13921 if (p->first.pool() == pool) {
13922 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
13923 << p->first << dendl;
// An empty pg_temp vector in the increment clears the mapping.
13924 pending_inc.new_pg_temp[p->first].clear();
13925 }
13926 }
13927 // remove any primary_temp mappings for this pool
13928 for (auto p = osdmap.primary_temp->begin();
13929 p != osdmap.primary_temp->end();
13930 ++p) {
13931 if (p->first.pool() == pool) {
13932 dout(10) << __func__ << " " << pool
13933 << " removing obsolete primary_temp" << p->first << dendl;
// -1 in the increment clears the primary_temp mapping.
13934 pending_inc.new_primary_temp[p->first] = -1;
13935 }
13936 }
13937 // remove any pg_upmap mappings for this pool
13938 for (auto& p : osdmap.pg_upmap) {
13939 if (p.first.pool() == pool) {
13940 dout(10) << __func__ << " " << pool
13941 << " removing obsolete pg_upmap "
13942 << p.first << dendl;
13943 pending_inc.old_pg_upmap.insert(p.first);
13944 }
13945 }
13946 // remove any pending pg_upmap mappings for this pool
13947 {
13948 auto it = pending_inc.new_pg_upmap.begin();
13949 while (it != pending_inc.new_pg_upmap.end()) {
13950 if (it->first.pool() == pool) {
13951 dout(10) << __func__ << " " << pool
13952 << " removing pending pg_upmap "
13953 << it->first << dendl;
13954 it = pending_inc.new_pg_upmap.erase(it);
13955 } else {
13956 it++;
13957 }
13958 }
13959 }
13960 // remove any pg_upmap_items mappings for this pool
13961 for (auto& p : osdmap.pg_upmap_items) {
13962 if (p.first.pool() == pool) {
13963 dout(10) << __func__ << " " << pool
13964 << " removing obsolete pg_upmap_items " << p.first
13965 << dendl;
13966 pending_inc.old_pg_upmap_items.insert(p.first);
13967 }
13968 }
13969 // remove any pending pg_upmap_items mappings for this pool
13970 {
13971 auto it = pending_inc.new_pg_upmap_items.begin();
13972 while (it != pending_inc.new_pg_upmap_items.end()) {
13973 if (it->first.pool() == pool) {
13974 dout(10) << __func__ << " " << pool
13975 << " removing pending pg_upmap_items "
13976 << it->first << dendl;
13977 it = pending_inc.new_pg_upmap_items.erase(it);
13978 } else {
13979 it++;
13980 }
13981 }
13982 }
13983
13984 // remove any choose_args for this pool
13985 CrushWrapper newcrush;
13986 _get_pending_crush(newcrush);
13987 if (newcrush.have_choose_args(pool)) {
13988 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13989 newcrush.rm_choose_args(pool);
// Re-encode the whole crush map into the increment after the edit.
13990 pending_inc.crush.clear();
13991 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13992 }
13993 return 0;
13994 }
13995
13996 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13997 {
13998 dout(10) << "_prepare_rename_pool " << pool << dendl;
13999 if (pending_inc.old_pools.count(pool)) {
14000 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14001 return -ENOENT;
14002 }
14003 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14004 p != pending_inc.new_pool_names.end();
14005 ++p) {
14006 if (p->second == newname && p->first != pool) {
14007 return -EEXIST;
14008 }
14009 }
14010
14011 pending_inc.new_pool_names[pool] = newname;
14012 return 0;
14013 }
14014
14015 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14016 {
14017 op->mark_osdmon_event(__func__);
14018 auto m = op->get_req<MPoolOp>();
14019 ostringstream ss;
14020 int ret = _prepare_remove_pool(m->pool, &ss, false);
14021 if (ret == -EAGAIN) {
14022 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14023 return true;
14024 }
14025 if (ret < 0)
14026 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14027 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14028 pending_inc.epoch));
14029 return true;
14030 }
14031
14032 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14033 int ret, epoch_t epoch, bufferlist *blp)
14034 {
14035 op->mark_osdmon_event(__func__);
14036 auto m = op->get_req<MPoolOp>();
14037 dout(20) << "_pool_op_reply " << ret << dendl;
14038 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14039 ret, epoch, get_last_committed(), blp);
14040 mon->send_reply(op, reply);
14041 }
14042
14043 void OSDMonitor::convert_pool_priorities(void)
14044 {
14045 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14046 int64_t max_prio = 0;
14047 int64_t min_prio = 0;
14048 for (const auto &i : osdmap.get_pools()) {
14049 const auto &pool = i.second;
14050
14051 if (pool.opts.is_set(key)) {
14052 int64_t prio = 0;
14053 pool.opts.get(key, &prio);
14054 if (prio > max_prio)
14055 max_prio = prio;
14056 if (prio < min_prio)
14057 min_prio = prio;
14058 }
14059 }
14060 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14061 dout(20) << __func__ << " nothing to fix" << dendl;
14062 return;
14063 }
14064 // Current pool priorities exceeds new maximum
14065 for (const auto &i : osdmap.get_pools()) {
14066 const auto pool_id = i.first;
14067 pg_pool_t pool = i.second;
14068
14069 int64_t prio = 0;
14070 pool.opts.get(key, &prio);
14071 int64_t n;
14072
14073 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14074 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14075 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14076 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14077 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14078 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14079 } else {
14080 continue;
14081 }
14082 if (n == 0) {
14083 pool.opts.unset(key);
14084 } else {
14085 pool.opts.set(key, static_cast<int64_t>(n));
14086 }
14087 dout(10) << __func__ << " pool " << pool_id
14088 << " recovery_priority adjusted "
14089 << prio << " to " << n << dendl;
14090 pool.last_change = pending_inc.epoch;
14091 pending_inc.new_pools[pool_id] = pool;
14092 }
14093 }