]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
892580c3ff6e75e3d357d98a30ec5c5aaa4b9cff
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
#define dout_subsys ceph_subsys_mon
// Key/value store prefixes owned by this service (see get_store_prefixes()).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating"); // in-progress pg creations
static const string OSD_METADATA_PREFIX("osd_metadata");       // per-osd metadata blobs
static const string OSD_SNAP_PREFIX("osd_snap");               // snap removal/purge records
98
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
118   - note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
// Adapter exposing an OSDMonitor-owned osdmap cache to the PriorityCache
// manager (pcm).  Derived classes (IncCache/FullCache) report the bytes
// actually used by a concrete underlying cache.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;  // back-pointer to the owning monitor (non-owning)
  // bytes assigned per priority level by the balancer
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // size granted by the last commit_cache_size()
  double cache_ratio = 0;       // our share of the total cache budget

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently consumed by the underlying cache.
  virtual uint64_t _get_used_bytes() const = 0;

  // Ask for more bytes at priority 'pri': the shortfall between what we
  // actually use and what has already been assigned at that priority.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    // other priorities are not used by the osdmon caches
    return -EOPNOTSUPP;
  }

  // Bytes assigned at a single priority level.
  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of bytes assigned across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the assigned total to a chunk of 'total_cache' and remember it.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Human-readable name used in pcm log/debug output.
  virtual string get_cache_name() const = 0;
};
196
// Priority-cache shim over the monitor's incremental-osdmap LRU cache.
struct IncCache : public OSDMemCache {
  IncCache(OSDMonitor *m) : OSDMemCache(m) {};

  virtual uint64_t _get_used_bytes() const {
    return osdmon->inc_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Inc Cache";
  }

  // Number of cached incremental maps (not part of the PriCache interface).
  uint64_t _get_num_osdmaps() const {
    return osdmon->inc_osd_cache.get_size();
  }
};
212
// Priority-cache shim over the monitor's full-osdmap LRU cache.
struct FullCache : public OSDMemCache {
  FullCache(OSDMonitor *m) : OSDMemCache(m) {};

  virtual uint64_t _get_used_bytes() const {
    return osdmon->full_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Full Cache";
  }

  // Number of cached full maps (not part of the PriCache interface).
  uint64_t _get_num_osdmaps() const {
    return osdmon->full_osd_cache.get_size();
  }
};
228
// Shared handles registered with the PriorityCache manager; created in
// the OSDMonitor constructor.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits for per-pool application metadata ('osd pool application ...').
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
// Check whether 'entity_name' may issue unmanaged-snapshot pool ops.
// Permitted when either (a) its mon caps allow the
// "osd pool op unmanaged-snap" command (pool-restricted when 'pool_name'
// is non-null), or (b) its OSD caps from the auth db grant write access
// to the pool (or to all pools).
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // First try the mon-cap command check.
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // Otherwise fall back to the entity's OSD caps stored in the auth db.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand to a list of basic grants; check each one
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
319 } // anonymous namespace
320
// Record that PG 'ps' of this pool reported 'last_epoch_clean', and keep
// the derived values consistent:
//  - floor: minimum last-epoch-clean across all PGs reported so far
//  - next_missing: first PG id that has never reported clean
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    epoch_by_pg.resize(ps + 1, 0);  // 0 == "never reported"
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this pg may have been the one holding the floor down, so
      // recompute the minimum over all pgs
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the run of pgs that have now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean state for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
357 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
358 {
359 auto& lec = report_by_pool[pg.pool()];
360 return lec.report(pg.ps(), last_epoch_clean);
361 }
362
363 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364 {
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379 }
380
381 void LastEpochClean::dump(Formatter *f) const
382 {
383 f->open_array_section("per_pool");
384
385 for (auto& it : report_by_pool) {
386 f->open_object_section("pool");
387 f->dump_unsigned("poolid", it.first);
388 f->dump_unsigned("floor", it.second.floor);
389 f->close_section();
390 }
391
392 f->close_section();
393 }
394
// Completion context for an OSDMap mapping job: when the parallel
// mapping finishes successfully, refresh the creating-pgs state and
// wake any subscribers waiting on pg creations.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;  // owning monitor (non-owning pointer)
  utime_t start;       // when the mapping job was kicked off (for timing)
  epoch_t epoch;       // epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted/canceled; do nothing in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
412
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Debug-log prefix: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
420
// Construct the OSD paxos service: size the inc/full osdmap LRU caches,
// create the priority-cache shims, and subscribe to config changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // non-fatal: fall back to fixed-size caches without pcm autotuning
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
444
445 const char **OSDMonitor::get_tracked_conf_keys() const
446 {
447 static const char* KEYS[] = {
448 "mon_memory_target",
449 "mon_memory_autotune",
450 "rocksdb_cache_size",
451 NULL
452 };
453 return KEYS;
454 }
455
456 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
457 const std::set<std::string> &changed)
458 {
459 dout(10) << __func__ << " " << changed << dendl;
460
461 if (changed.count("mon_memory_autotune")) {
462 _set_cache_autotuning();
463 }
464 if (changed.count("mon_memory_target") ||
465 changed.count("rocksdb_cache_size")) {
466 int r = _update_mon_cache_settings();
467 if (r < 0) {
468 derr << __func__ << " mon_memory_target:"
469 << g_conf()->mon_memory_target
470 << " rocksdb_cache_size:"
471 << g_conf()->rocksdb_cache_size
472 << ". Unable to update cache size."
473 << dendl;
474 }
475 }
476 }
477
// Bring priority-cache autotuning in line with the current value of
// mon_memory_autotune: drop the pcm when disabled, (re)register the
// caches when enabled.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      // keep running without autotuning on registration failure
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
499
// Apply runtime changes to mon_memory_target / rocksdb_cache_size:
// recompute the cache ratios and, when autotuning is active, re-tune the
// priority cache manager.  Returns 0 on success, -EINVAL on bad sizes or
// when neither pcm nor the rocksdb binned cache is in use.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the old values so we can roll back if the ratios fail
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation when computing the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    // roll back to the previous sizes
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
559
// Read the initial cache-related config values and size the inc/full
// osdmap LRU caches.  Returns -EINVAL when the configured targets are
// unusable (the constructor then falls back to fixed-size caches).
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the OSD-side base/fragmentation estimates for the mon caches
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
584
585 bool OSDMonitor::_have_pending_crush()
586 {
587 return pending_inc.crush.length() > 0;
588 }
589
// CRUSH map of the committed osdmap (ignores any pending changes).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
594
595 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
596 {
597 bufferlist bl;
598 if (pending_inc.crush.length())
599 bl = pending_inc.crush;
600 else
601 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
602
603 auto p = bl.cbegin();
604 newcrush.decode(p);
605 }
606
// Build the very first OSDMap (epoch 1) for a fresh cluster — either
// from an mkfs-provided seed map or a simple generated one — and stash
// its full encoding in the pending incremental.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  // an osdmap may have been primed into the store during mkfs
  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
665
666 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
667 {
668 s.insert(service_name);
669 s.insert(OSD_PG_CREATING_PREFIX);
670 s.insert(OSD_METADATA_PREFIX);
671 s.insert(OSD_SNAP_PREFIX);
672 }
673
// Bring the in-memory osdmap up to date with the state committed via
// paxos: possibly fast-forward to the newest stored full map, then
// replay each remaining incremental, persisting a full map per epoch,
// and finally refresh everything derived from the map (subs, logger,
// msgr features, down->out tracking, mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a still-running mapping job is computing against a stale epoch;
  // cancel it before advancing
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for a stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // jump straight to the newest stored full map if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // reload the persisted creating-pgs state (get() returns 0 when found)
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NOTE: everything between a dout() and its dendl lives inside
        // the block opened by the dout macro, so each JSONFormatter
        // below is in its own scope.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs seed map is no longer needed once epoch 1 is applied
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a single transaction doesn't grow unbounded
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out pending map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
911
// Register the rocksdb kv cache and the inc/full osdmap caches with a
// new PriorityCache manager so their sizes are balanced automatically.
// Returns 0 on success, -EINVAL on bad config or when rocksdb does not
// expose a priority cache.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
961
// Split the memory target between the kv (rocksdb) cache and the two
// osdmap caches: kv gets rocksdb_cache_size/mon_memory_target, and the
// inc/full caches share the remainder equally.  Returns -EINVAL when the
// kv share would consume the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;  // roll back the invalid ratio
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
986
987 void OSDMonitor::start_mapping()
988 {
989 // initiate mapping job
990 if (mapping_job) {
991 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
992 << dendl;
993 mapping_job->abort();
994 }
995 if (!osdmap.get_pools().empty()) {
996 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
997 mapping_job = mapping.start_update(osdmap, mapper,
998 g_conf()->mon_osd_mapping_pgs_per_chunk);
999 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1000 << " at " << fin->start << dendl;
1001 mapping_job->set_finish_event(fin);
1002 } else {
1003 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1004 mapping_job = nullptr;
1005 }
1006 }
1007
1008 void OSDMonitor::update_msgr_features()
1009 {
1010 set<int> types;
1011 types.insert((int)entity_name_t::TYPE_OSD);
1012 types.insert((int)entity_name_t::TYPE_CLIENT);
1013 types.insert((int)entity_name_t::TYPE_MDS);
1014 types.insert((int)entity_name_t::TYPE_MON);
1015 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1016 uint64_t mask;
1017 uint64_t features = osdmap.get_features(*q, &mask);
1018 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1019 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1020 ceph::net::Policy p = mon->messenger->get_policy(*q);
1021 p.features_required = (p.features_required & ~mask) | features;
1022 mon->messenger->set_policy(*q, p);
1023 }
1024 }
1025 }
1026
1027 void OSDMonitor::on_active()
1028 {
1029 update_logger();
1030
1031 if (mon->is_leader()) {
1032 mon->clog->debug() << "osdmap " << osdmap;
1033 if (!priority_convert) {
1034 // Only do this once at start-up
1035 convert_pool_priorities();
1036 priority_convert = true;
1037 }
1038 } else {
1039 list<MonOpRequestRef> ls;
1040 take_all_failures(ls);
1041 while (!ls.empty()) {
1042 MonOpRequestRef op = ls.front();
1043 op->mark_osdmon_event(__func__);
1044 dispatch(op);
1045 ls.pop_front();
1046 }
1047 }
1048 start_mapping();
1049 }
1050
void OSDMonitor::on_restart()
{
  // Forget per-osd report times; they repopulate as osds report in again.
  last_osd_report.clear();
}
1055
1056 void OSDMonitor::on_shutdown()
1057 {
1058 dout(10) << __func__ << dendl;
1059 if (mapping_job) {
1060 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1061 << dendl;
1062 mapping_job->abort();
1063 }
1064
1065 // discard failure info, waiters
1066 list<MonOpRequestRef> ls;
1067 take_all_failures(ls);
1068 ls.clear();
1069 }
1070
1071 void OSDMonitor::update_logger()
1072 {
1073 dout(10) << "update_logger" << dendl;
1074
1075 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1076 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1077 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1078 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1079 }
1080
void OSDMonitor::create_pending()
{
  // Begin a fresh pending incremental for the next osdmap epoch and reset
  // all per-proposal scratch state.
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // If any fullness ratio is unset/invalid in the current map, seed it from
    // config.  Values > 1.0 are taken to be percentages and scaled to [0,1].
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        // load the pool's current state before modifying it
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // replace the pending crush blob with the renumbered map
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1150
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  // Compute the creating-pg state to persist with the pending epoch:
  //  - scan for pools created/deleted by this increment
  //  - drop pgs already reported created, or that no longer exist in nextmap
  //  - admit queued pgs up to mon_osd_max_creating_pgs
  //  - (octopus+ quorum) advance each creating pg's history/past_intervals
  // Works on a copy of creating_pgs and returns it; the shared state itself
  // is not modified here.
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // snapshot the shared state under its lock
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools added by this increment
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // admit pgs from the per-pool queue until we hit mon_osd_max_creating_pgs
  // in-flight creations (floored at 1) or the queue drains
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // take as many placement seeds as fit under the cap
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      // mapping of this pg under the next map
      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        // existing entry: detect whether osdmap -> nextmap opens a new
        // peering interval, and if so record it in the pg's history
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1337
void OSDMonitor::maybe_prime_pg_temp()
{
  // Pre-populate pg_temp entries in pending_inc so acting sets stay stable
  // across the pending map change.  Either prime every pg ("all") or only the
  // pgs mapped to osds affected by this increment, whichever is estimated to
  // be cheaper; both modes are bounded by a wall-clock budget.
  bool all = false;
  if (pending_inc.crush.length()) {
    // crush change can move anything; consider every pg
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // an UP state-bit change for an osd that is currently up, i.e. it is
    // being marked down in this increment
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the cost of the targeted path (pgs on first osd * osd count);
    // if that would touch a large fraction of all pgs, just do everything
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the map as it will look with the pending increment applied
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, within the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // prime only the pgs mapped to the affected osds, checking the time
    // budget every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          // already primed via another affected osd
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1440
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // Record the pg's *current* acting set as a pg_temp entry in pending_inc
  // when the pending map would change its mapping, so clients keep a viable
  // acting set while the new mapping settles.  Bails out whenever priming
  // could not make things better than doing nothing.
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // still being created; nothing to preserve
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, taken from the background mapper's results
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the next (pending) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // prime an empty acting set instead — per the log message this clears
    // the pg_temp mapping rather than pinning one
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (map::emplace never overwrites an existing new_pg_temp entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1488
1489 /**
1490 * @note receiving a transaction in this function gives a fair amount of
1491 * freedom to the service implementation if it does need it. It shouldn't.
1492 */
1493 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1494 {
1495 dout(10) << "encode_pending e " << pending_inc.epoch
1496 << dendl;
1497
1498 if (do_prune(t)) {
1499 dout(1) << __func__ << " osdmap full prune encoded e"
1500 << pending_inc.epoch << dendl;
1501 }
1502
1503 // finalize up pending_inc
1504 pending_inc.modified = ceph_clock_now();
1505
1506 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1507 ceph_assert(r == 0);
1508
1509 if (mapping_job) {
1510 if (!mapping_job->is_done()) {
1511 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1512 << mapping_job.get() << " did not complete, "
1513 << mapping_job->shards << " left" << dendl;
1514 mapping_job->abort();
1515 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1516 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1517 << mapping_job.get() << " is prior epoch "
1518 << mapping.get_epoch() << dendl;
1519 } else {
1520 if (g_conf()->mon_osd_prime_pg_temp) {
1521 maybe_prime_pg_temp();
1522 }
1523 }
1524 } else if (g_conf()->mon_osd_prime_pg_temp) {
1525 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1526 << dendl;
1527 }
1528 mapping_job.reset();
1529
  // ensure we don't have blank new_state updates. these are interpreted as
1531 // CEPH_OSD_UP (and almost certainly not what we want!).
1532 auto p = pending_inc.new_state.begin();
1533 while (p != pending_inc.new_state.end()) {
1534 if (p->second == 0) {
1535 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1536 p = pending_inc.new_state.erase(p);
1537 } else {
1538 if (p->second & CEPH_OSD_UP) {
1539 pending_inc.new_last_up_change = pending_inc.modified;
1540 }
1541 ++p;
1542 }
1543 }
1544 if (!pending_inc.new_up_client.empty()) {
1545 pending_inc.new_last_up_change = pending_inc.modified;
1546 }
1547 for (auto& i : pending_inc.new_weight) {
1548 if (i.first >= osdmap.max_osd) {
1549 if (i.second) {
1550 // new osd is already marked in
1551 pending_inc.new_last_in_change = pending_inc.modified;
1552 break;
1553 }
1554 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1555 // existing osd marked in or out
1556 pending_inc.new_last_in_change = pending_inc.modified;
1557 break;
1558 }
1559 }
1560
1561 {
1562 OSDMap tmp;
1563 tmp.deepish_copy_from(osdmap);
1564 tmp.apply_incremental(pending_inc);
1565
1566 // clean pg_temp mappings
1567 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1568
1569 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1570 {
1571 // check every upmapped pg for now
1572 // until we could reliably identify certain cases to ignore,
1573 // which is obviously the hard part TBD..
1574 vector<pg_t> pgs_to_check;
1575 tmp.get_upmap_pgs(&pgs_to_check);
1576 if (pgs_to_check.size() <
1577 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1578 // not enough pgs, do it inline
1579 tmp.clean_pg_upmaps(cct, &pending_inc);
1580 } else {
1581 CleanUpmapJob job(cct, tmp, pending_inc);
1582 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1583 job.wait();
1584 }
1585 }
1586
1587 // update creating pgs first so that we can remove the created pgid and
1588 // process the pool flag removal below in the same osdmap epoch.
1589 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1590 bufferlist creatings_bl;
1591 uint64_t features = CEPH_FEATURES_ALL;
1592 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1593 dout(20) << __func__ << " encoding pending pgs without octopus features"
1594 << dendl;
1595 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1596 }
1597 encode(pending_creatings, creatings_bl, features);
1598 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1599
1600 // remove any old (or incompat) POOL_CREATING flags
1601 for (auto& i : tmp.get_pools()) {
1602 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1603 // pre-nautilus OSDMaps shouldn't get this flag.
1604 if (pending_inc.new_pools.count(i.first)) {
1605 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1606 }
1607 }
1608 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1609 !pending_creatings.still_creating_pool(i.first)) {
1610 dout(10) << __func__ << " done creating pool " << i.first
1611 << ", clearing CREATING flag" << dendl;
1612 if (pending_inc.new_pools.count(i.first) == 0) {
1613 pending_inc.new_pools[i.first] = i.second;
1614 }
1615 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1616 }
1617 }
1618
1619 // collect which pools are currently affected by
1620 // the near/backfill/full osd(s),
1621 // and set per-pool near/backfill/full flag instead
1622 set<int64_t> full_pool_ids;
1623 set<int64_t> backfillfull_pool_ids;
1624 set<int64_t> nearfull_pool_ids;
1625 tmp.get_full_pools(cct,
1626 &full_pool_ids,
1627 &backfillfull_pool_ids,
1628 &nearfull_pool_ids);
1629 if (full_pool_ids.empty() ||
1630 backfillfull_pool_ids.empty() ||
1631 nearfull_pool_ids.empty()) {
1632 // normal case - no nearfull, backfillfull or full osds
1633 // try cancel any improper nearfull/backfillfull/full pool
1634 // flags first
1635 for (auto &pool: tmp.get_pools()) {
1636 auto p = pool.first;
1637 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1638 nearfull_pool_ids.empty()) {
1639 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1640 << "'s nearfull flag" << dendl;
1641 if (pending_inc.new_pools.count(p) == 0) {
1642 // load original pool info first!
1643 pending_inc.new_pools[p] = pool.second;
1644 }
1645 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1646 }
1647 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1648 backfillfull_pool_ids.empty()) {
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s backfillfull flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1655 }
1656 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1657 full_pool_ids.empty()) {
1658 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1659 // set by EQUOTA, skipping
1660 continue;
1661 }
1662 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1663 << "'s full flag" << dendl;
1664 if (pending_inc.new_pools.count(p) == 0) {
1665 pending_inc.new_pools[p] = pool.second;
1666 }
1667 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1668 }
1669 }
1670 }
1671 if (!full_pool_ids.empty()) {
1672 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1673 << " as full" << dendl;
1674 for (auto &p: full_pool_ids) {
1675 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1676 continue;
1677 }
1678 if (pending_inc.new_pools.count(p) == 0) {
1679 pending_inc.new_pools[p] = tmp.pools[p];
1680 }
1681 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1682 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1683 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1684 }
1685 // cancel FLAG_FULL for pools which are no longer full too
1686 for (auto &pool: tmp.get_pools()) {
1687 auto p = pool.first;
1688 if (full_pool_ids.count(p)) {
1689 // skip pools we have just marked as full above
1690 continue;
1691 }
1692 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1693 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1694 // don't touch if currently is not full
1695 // or is running out of quota (and hence considered as full)
1696 continue;
1697 }
1698 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1699 << "'s full flag" << dendl;
1700 if (pending_inc.new_pools.count(p) == 0) {
1701 pending_inc.new_pools[p] = pool.second;
1702 }
1703 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1704 }
1705 }
1706 if (!backfillfull_pool_ids.empty()) {
1707 for (auto &p: backfillfull_pool_ids) {
1708 if (full_pool_ids.count(p)) {
1709 // skip pools we have already considered as full above
1710 continue;
1711 }
1712 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1713 // make sure FLAG_FULL is truly set, so we are safe not
      // to set an extra (redundant) FLAG_BACKFILLFULL flag
1715 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1716 continue;
1717 }
1718 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1719 // don't bother if pool is already marked as backfillfull
1720 continue;
1721 }
1722 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1723 << "'s as backfillfull" << dendl;
1724 if (pending_inc.new_pools.count(p) == 0) {
1725 pending_inc.new_pools[p] = tmp.pools[p];
1726 }
1727 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1728 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1729 }
1730 // cancel FLAG_BACKFILLFULL for pools
1731 // which are no longer backfillfull too
1732 for (auto &pool: tmp.get_pools()) {
1733 auto p = pool.first;
1734 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1735 // skip pools we have just marked as backfillfull/full above
1736 continue;
1737 }
1738 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1739 // and don't touch if currently is not backfillfull
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s backfillfull flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1748 }
1749 }
1750 if (!nearfull_pool_ids.empty()) {
1751 for (auto &p: nearfull_pool_ids) {
1752 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1753 continue;
1754 }
1755 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1756 // make sure FLAG_FULL is truly set, so we are safe not
      // to set an extra (redundant) FLAG_NEARFULL flag
1758 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1759 continue;
1760 }
1761 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1762 // don't bother if pool is already marked as nearfull
1763 continue;
1764 }
1765 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1766 << "'s as nearfull" << dendl;
1767 if (pending_inc.new_pools.count(p) == 0) {
1768 pending_inc.new_pools[p] = tmp.pools[p];
1769 }
1770 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1771 }
1772 // cancel FLAG_NEARFULL for pools
1773 // which are no longer nearfull too
1774 for (auto &pool: tmp.get_pools()) {
1775 auto p = pool.first;
1776 if (full_pool_ids.count(p) ||
1777 backfillfull_pool_ids.count(p) ||
1778 nearfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as
1780 // nearfull/backfillfull/full above
1781 continue;
1782 }
1783 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1784 // and don't touch if currently is not nearfull
1785 continue;
1786 }
1787 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1788 << "'s nearfull flag" << dendl;
1789 if (pending_inc.new_pools.count(p) == 0) {
1790 pending_inc.new_pools[p] = pool.second;
1791 }
1792 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1793 }
1794 }
1795
1796 // min_compat_client?
1797 if (!tmp.require_min_compat_client) {
1798 auto mv = tmp.get_min_compat_client();
1799 dout(1) << __func__ << " setting require_min_compat_client to currently "
1800 << "required " << mv << dendl;
1801 mon->clog->info() << "setting require_min_compat_client to currently "
1802 << "required " << mv;
1803 pending_inc.new_require_min_compat_client = mv;
1804 }
1805
1806 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1807 tmp.require_osd_release >= ceph_release_t::nautilus) {
1808 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1809 // add creating flags?
1810 for (auto& i : tmp.get_pools()) {
1811 if (pending_creatings.still_creating_pool(i.first)) {
1812 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1813 << dendl;
1814 if (pending_inc.new_pools.count(i.first) == 0) {
1815 pending_inc.new_pools[i.first] = i.second;
1816 }
1817 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1818 }
1819 }
1820 // adjust blacklist items to all be TYPE_ANY
1821 for (auto& i : tmp.blacklist) {
1822 auto a = i.first;
1823 a.set_type(entity_addr_t::TYPE_ANY);
1824 pending_inc.new_blacklist[a] = i.second;
1825 pending_inc.old_blacklist.push_back(i.first);
1826 }
1827 }
1828
1829 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1830 tmp.require_osd_release >= ceph_release_t::octopus) {
1831 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1832
1833 // adjust obsoleted cache modes
1834 for (auto& [poolid, pi] : tmp.pools) {
1835 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1836 if (pending_inc.new_pools.count(poolid) == 0) {
1837 pending_inc.new_pools[poolid] = pi;
1838 }
1839 dout(10) << __func__ << " switching pool " << poolid
1840 << " cachemode from forward -> proxy" << dendl;
1841 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1842 }
1843 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1844 if (pending_inc.new_pools.count(poolid) == 0) {
1845 pending_inc.new_pools[poolid] = pi;
1846 }
1847 dout(10) << __func__ << " switching pool " << poolid
1848 << " cachemode from readforward -> readproxy" << dendl;
1849 pending_inc.new_pools[poolid].cache_mode =
1850 pg_pool_t::CACHEMODE_READPROXY;
1851 }
1852 }
1853
1854 // clear removed_snaps for every pool
1855 for (auto& [poolid, pi] : tmp.pools) {
1856 if (pi.removed_snaps.empty()) {
1857 continue;
1858 }
1859 if (pending_inc.new_pools.count(poolid) == 0) {
1860 pending_inc.new_pools[poolid] = pi;
1861 }
1862 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1863 << dendl;
1864 pending_inc.new_pools[poolid].removed_snaps.clear();
1865 }
1866
1867 // create a combined purged snap epoch key for all purged snaps
1868 // prior to this epoch, and store it in the current epoch (i.e.,
1869 // the last pre-octopus epoch, just prior to the one we're
1870 // encoding now).
1871 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1872 it->lower_bound("purged_snap_");
1873 map<int64_t,snap_interval_set_t> combined;
1874 while (it->valid()) {
1875 if (it->key().find("purged_snap_") != 0) {
1876 break;
1877 }
1878 string k = it->key();
1879 long long unsigned pool;
1880 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1881 if (n != 1) {
1882 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1883 } else {
1884 bufferlist v = it->value();
1885 auto p = v.cbegin();
1886 snapid_t begin, end;
1887 ceph::decode(begin, p);
1888 ceph::decode(end, p);
1889 combined[pool].insert(begin, end - begin);
1890 }
1891 it->next();
1892 }
1893 if (!combined.empty()) {
1894 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1895 bufferlist v;
1896 ceph::encode(combined, v);
1897 t->put(OSD_SNAP_PREFIX, k, v);
1898 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1899 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1900 << dendl;
1901 } else {
1902 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1903 << dendl;
1904 }
1905
1906 // clean out the old removed_snap_ and removed_epoch keys
1907 // ('`' is ASCII '_' + 1)
1908 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1909 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1910 }
1911 }
1912
1913 // tell me about it
1914 for (auto i = pending_inc.new_state.begin();
1915 i != pending_inc.new_state.end();
1916 ++i) {
1917 int s = i->second ? i->second : CEPH_OSD_UP;
1918 if (s & CEPH_OSD_UP)
1919 dout(2) << " osd." << i->first << " DOWN" << dendl;
1920 if (s & CEPH_OSD_EXISTS)
1921 dout(2) << " osd." << i->first << " DNE" << dendl;
1922 }
1923 for (auto i = pending_inc.new_up_client.begin();
1924 i != pending_inc.new_up_client.end();
1925 ++i) {
1926 //FIXME: insert cluster addresses too
1927 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1928 }
1929 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1930 i != pending_inc.new_weight.end();
1931 ++i) {
1932 if (i->second == CEPH_OSD_OUT) {
1933 dout(2) << " osd." << i->first << " OUT" << dendl;
1934 } else if (i->second == CEPH_OSD_IN) {
1935 dout(2) << " osd." << i->first << " IN" << dendl;
1936 } else {
1937 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1938 }
1939 }
1940
1941 // features for osdmap and its incremental
1942 uint64_t features;
1943
1944 // encode full map and determine its crc
1945 OSDMap tmp;
1946 {
1947 tmp.deepish_copy_from(osdmap);
1948 tmp.apply_incremental(pending_inc);
1949
1950 // determine appropriate features
1951 features = tmp.get_encoding_features();
1952 dout(10) << __func__ << " encoding full map with "
1953 << tmp.require_osd_release
1954 << " features " << features << dendl;
1955
1956 // the features should be a subset of the mon quorum's features!
1957 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1958
1959 bufferlist fullbl;
1960 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1961 pending_inc.full_crc = tmp.get_crc();
1962
1963 // include full map in the txn. note that old monitors will
1964 // overwrite this. new ones will now skip the local full map
1965 // encode and reload from this.
1966 put_version_full(t, pending_inc.epoch, fullbl);
1967 }
1968
1969 // encode
1970 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1971 bufferlist bl;
1972 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1973
1974 dout(20) << " full_crc " << tmp.get_crc()
1975 << " inc_crc " << pending_inc.inc_crc << dendl;
1976
1977 /* put everything in the transaction */
1978 put_version(t, pending_inc.epoch, bl);
1979 put_last_committed(t, pending_inc.epoch);
1980
1981 // metadata, too!
1982 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1983 p != pending_metadata.end();
1984 ++p)
1985 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1986 for (set<int>::iterator p = pending_metadata_rm.begin();
1987 p != pending_metadata_rm.end();
1988 ++p)
1989 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1990 pending_metadata.clear();
1991 pending_metadata_rm.clear();
1992
1993 // purged_snaps
1994 if (tmp.require_osd_release >= ceph_release_t::octopus &&
1995 !pending_inc.new_purged_snaps.empty()) {
1996 // all snaps purged this epoch (across all pools)
1997 string k = make_purged_snap_epoch_key(pending_inc.epoch);
1998 bufferlist v;
1999 encode(pending_inc.new_purged_snaps, v);
2000 t->put(OSD_SNAP_PREFIX, k, v);
2001 }
2002 for (auto& i : pending_inc.new_purged_snaps) {
2003 for (auto q = i.second.begin();
2004 q != i.second.end();
2005 ++q) {
2006 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2007 pending_inc.epoch,
2008 t);
2009 }
2010 }
2011 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2012 for (auto snap : snaps) {
2013 insert_purged_snap_update(pool, snap, snap + 1,
2014 pending_inc.epoch,
2015 t);
2016 }
2017 }
2018
2019 // health
2020 health_check_map_t next;
2021 tmp.check_health(cct, &next);
2022 encode_health(next, t);
2023 }
2024
2025 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2026 {
2027 bufferlist bl;
2028 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2029 if (r < 0)
2030 return r;
2031 try {
2032 auto p = bl.cbegin();
2033 decode(m, p);
2034 }
2035 catch (buffer::error& e) {
2036 if (err)
2037 *err << "osd." << osd << " metadata is corrupt";
2038 return -EIO;
2039 }
2040 return 0;
2041 }
2042
2043 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2044 {
2045 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2046 if (osdmap.is_up(osd)) {
2047 map<string,string> meta;
2048 load_metadata(osd, meta, nullptr);
2049 auto p = meta.find(field);
2050 if (p == meta.end()) {
2051 (*out)["unknown"]++;
2052 } else {
2053 (*out)[p->second]++;
2054 }
2055 }
2056 }
2057 }
2058
2059 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2060 {
2061 map<string,int> by_val;
2062 count_metadata(field, &by_val);
2063 f->open_object_section(field.c_str());
2064 for (auto& p : by_val) {
2065 f->dump_int(p.first.c_str(), p.second);
2066 }
2067 f->close_section();
2068 }
2069
2070 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2071 {
2072 map<string, string> metadata;
2073 int r = load_metadata(osd, metadata, nullptr);
2074 if (r < 0)
2075 return r;
2076
2077 auto it = metadata.find("osd_objectstore");
2078 if (it == metadata.end())
2079 return -ENOENT;
2080 *type = it->second;
2081 return 0;
2082 }
2083
2084 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2085 const pg_pool_t &pool,
2086 ostream *err)
2087 {
2088 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2089 // since filestore osds could always join the pool later
2090 set<int> checked_osds;
2091 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2092 vector<int> up, acting;
2093 pg_t pgid(ps, pool_id);
2094 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2095 for (int osd : up) {
2096 if (checked_osds.find(osd) != checked_osds.end())
2097 continue;
2098 string objectstore_type;
2099 int r = get_osd_objectstore_type(osd, &objectstore_type);
2100 // allow with missing metadata, e.g. due to an osd never booting yet
2101 if (r < 0 || objectstore_type == "bluestore") {
2102 checked_osds.insert(osd);
2103 continue;
2104 }
2105 *err << "osd." << osd << " uses " << objectstore_type;
2106 return false;
2107 }
2108 }
2109 return true;
2110 }
2111
2112 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2113 {
2114 map<string,string> m;
2115 if (int r = load_metadata(osd, m, err))
2116 return r;
2117 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2118 f->dump_string(p->first.c_str(), p->second);
2119 return 0;
2120 }
2121
2122 void OSDMonitor::print_nodes(Formatter *f)
2123 {
2124 // group OSDs by their hosts
2125 map<string, list<int> > osds; // hostname => osd
2126 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2127 map<string, string> m;
2128 if (load_metadata(osd, m, NULL)) {
2129 continue;
2130 }
2131 map<string, string>::iterator hostname = m.find("hostname");
2132 if (hostname == m.end()) {
2133 // not likely though
2134 continue;
2135 }
2136 osds[hostname->second].push_back(osd);
2137 }
2138
2139 dump_services(f, osds, "osd");
2140 }
2141
2142 void OSDMonitor::share_map_with_random_osd()
2143 {
2144 if (osdmap.get_num_up_osds() == 0) {
2145 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2146 return;
2147 }
2148
2149 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2150 if (!s) {
2151 dout(10) << __func__ << " no up osd on our session map" << dendl;
2152 return;
2153 }
2154
2155 dout(10) << "committed, telling random " << s->name
2156 << " all about it" << dendl;
2157
2158 // get feature of the peer
2159 // use quorum_con_features, if it's an anonymous connection.
2160 uint64_t features = s->con_features ? s->con_features :
2161 mon->get_quorum_con_features();
2162 // whatev, they'll request more if they need it
2163 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2164 s->con->send_message(m);
2165 // NOTE: do *not* record osd has up to this epoch (as we do
2166 // elsewhere) as they may still need to request older values.
2167 }
2168
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version we may trim up to, or 0 to indicate
  // that no trimming should happen right now.
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // do not trim while PGs are still being created; the creating osds may
    // still need the older maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    // debugging hook: operator explicitly disabled osdmap trimming
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // never trim past the min last-epoch-clean across osds...
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // ...unless an explicit trim point was forced via config
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps: pull the floor
    // down if it would leave fewer than that in the store
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;  // fewer than 'min' maps exist at all; nothing to trim
    }
    // only report a trim point if it is past what is already trimmed
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2211
2212 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2213 {
2214 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2215 // also scan osd epochs
2216 // don't trim past the oldest reported osd epoch
2217 for (auto& osd_epoch : osd_epochs) {
2218 if (osd_epoch.second < floor &&
2219 osdmap.is_out(osd_epoch.first)) {
2220 floor = osd_epoch.second;
2221 }
2222 }
2223 return floor;
2224 }
2225
2226 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2227 version_t first)
2228 {
2229 dout(10) << __func__ << " including full map for e " << first << dendl;
2230 bufferlist bl;
2231 get_version_full(first, bl);
2232 put_version_full(tx, first, bl);
2233
2234 if (has_osdmap_manifest &&
2235 first > osdmap_manifest.get_first_pinned()) {
2236 _prune_update_trimmed(tx, first);
2237 }
2238 }
2239
2240
2241 /* full osdmap prune
2242 *
2243 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2244 */
2245
2246 void OSDMonitor::load_osdmap_manifest()
2247 {
2248 bool store_has_manifest =
2249 mon->store->exists(get_service_name(), "osdmap_manifest");
2250
2251 if (!store_has_manifest) {
2252 if (!has_osdmap_manifest) {
2253 return;
2254 }
2255
2256 dout(20) << __func__
2257 << " dropping osdmap manifest from memory." << dendl;
2258 osdmap_manifest = osdmap_manifest_t();
2259 has_osdmap_manifest = false;
2260 return;
2261 }
2262
2263 dout(20) << __func__
2264 << " osdmap manifest detected in store; reload." << dendl;
2265
2266 bufferlist manifest_bl;
2267 int r = get_value("osdmap_manifest", manifest_bl);
2268 if (r < 0) {
2269 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2270 ceph_abort_msg("error reading manifest");
2271 }
2272 osdmap_manifest.decode(manifest_bl);
2273 has_osdmap_manifest = true;
2274
2275 dout(10) << __func__ << " store osdmap manifest pinned ("
2276 << osdmap_manifest.get_first_pinned()
2277 << " .. "
2278 << osdmap_manifest.get_last_pinned()
2279 << ")"
2280 << dendl;
2281 }
2282
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass should run now, based on how
  // many committed epochs exist versus the configured minimums.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  // number of most-recent epochs that must always be kept
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  // minimum number of prunable epochs before pruning is worthwhile
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  // distance between consecutive pinned full maps
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous pass already pinned everything up to the pinnable limit
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits before the pinnable limit
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2342
2343 void OSDMonitor::_prune_update_trimmed(
2344 MonitorDBStore::TransactionRef tx,
2345 version_t first)
2346 {
2347 dout(10) << __func__
2348 << " first " << first
2349 << " last_pinned " << osdmap_manifest.get_last_pinned()
2350 << " last_pinned " << osdmap_manifest.get_last_pinned()
2351 << dendl;
2352
2353 osdmap_manifest_t manifest = osdmap_manifest;
2354
2355 if (!manifest.is_pinned(first)) {
2356 manifest.pin(first);
2357 }
2358
2359 set<version_t>::iterator p_end = manifest.pinned.find(first);
2360 set<version_t>::iterator p = manifest.pinned.begin();
2361 manifest.pinned.erase(p, p_end);
2362 ceph_assert(manifest.get_first_pinned() == first);
2363
2364 if (manifest.get_last_pinned() == first+1 ||
2365 manifest.pinned.size() == 1) {
2366 // we reached the end of the line, as pinned maps go; clean up our
2367 // manifest, and let `should_prune()` decide whether we should prune
2368 // again.
2369 tx->erase(get_service_name(), "osdmap_manifest");
2370 return;
2371 }
2372
2373 bufferlist bl;
2374 manifest.encode(bl);
2375 tx->put(get_service_name(), "osdmap_manifest", bl);
2376 }
2377
2378 void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
2379 {
2380 dout(1) << __func__ << dendl;
2381
2382 version_t pin_first;
2383
2384 // verify constrainsts on stable in-memory state
2385 if (!has_osdmap_manifest) {
2386 // we must have never pruned, OR if we pruned the state must no longer
2387 // be relevant (i.e., the state must have been removed alongside with
2388 // the trim that *must* have removed past the last pinned map in a
2389 // previous prune).
2390 ceph_assert(osdmap_manifest.pinned.empty());
2391 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
2392 pin_first = get_first_committed();
2393
2394 } else {
2395 // we must have pruned in the past AND its state is still relevant
2396 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2397 // and thus we still hold a manifest in the store).
2398 ceph_assert(!osdmap_manifest.pinned.empty());
2399 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
2400 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
2401
2402 dout(10) << __func__
2403 << " first_pinned " << osdmap_manifest.get_first_pinned()
2404 << " last_pinned " << osdmap_manifest.get_last_pinned()
2405 << dendl;
2406
2407 pin_first = osdmap_manifest.get_last_pinned();
2408 }
2409
2410 manifest.pin(pin_first);
2411 }
2412
2413 bool OSDMonitor::_prune_sanitize_options() const
2414 {
2415 uint64_t prune_interval =
2416 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2417 uint64_t prune_min =
2418 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2419 uint64_t txsize =
2420 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2421
2422 bool r = true;
2423
2424 if (prune_interval == 0) {
2425 derr << __func__
2426 << " prune is enabled BUT prune interval is zero; abort."
2427 << dendl;
2428 r = false;
2429 } else if (prune_interval == 1) {
2430 derr << __func__
2431 << " prune interval is equal to one, which essentially means"
2432 " no pruning; abort."
2433 << dendl;
2434 r = false;
2435 }
2436 if (prune_min == 0) {
2437 derr << __func__
2438 << " prune is enabled BUT prune min is zero; abort."
2439 << dendl;
2440 r = false;
2441 }
2442 if (prune_interval > prune_min) {
2443 derr << __func__
2444 << " impossible to ascertain proper prune interval because"
2445 << " it is greater than the minimum prune epochs"
2446 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2447 << dendl;
2448 r = false;
2449 }
2450
2451 if (txsize < prune_interval - 1) {
2452 derr << __func__
2453 << "'mon_osdmap_full_prune_txsize' (" << txsize
2454 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2455 << "); abort." << dendl;
2456 r = false;
2457 }
2458 return r;
2459 }
2460
2461 bool OSDMonitor::is_prune_enabled() const {
2462 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2463 }
2464
2465 bool OSDMonitor::is_prune_supported() const {
2466 return mon->get_required_mon_features().contains_any(
2467 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2468 }
2469
2470 /** do_prune
2471 *
2472 * @returns true if has side-effects; false otherwise.
2473 */
2474 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
2475 {
2476 bool enabled = is_prune_enabled();
2477
2478 dout(1) << __func__ << " osdmap full prune "
2479 << ( enabled ? "enabled" : "disabled")
2480 << dendl;
2481
2482 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
2483 return false;
2484 }
2485
2486 // we are beyond the minimum prune versions, we need to remove maps because
2487 // otherwise the store will grow unbounded and we may end up having issues
2488 // with available disk space or store hangs.
2489
2490 // we will not pin all versions. We will leave a buffer number of versions.
2491 // this allows us the monitor to trim maps without caring too much about
2492 // pinned maps, and then allow us to use another ceph-mon without these
2493 // capabilities, without having to repair the store.
2494
2495 osdmap_manifest_t manifest = osdmap_manifest;
2496
2497 version_t first = get_first_committed();
2498 version_t last = get_last_committed();
2499
2500 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2501 version_t last_pinned = manifest.get_last_pinned();
2502 uint64_t prune_interval =
2503 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2504 uint64_t txsize =
2505 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2506
2507 prune_init(manifest);
2508
2509 // we need to get rid of some osdmaps
2510
2511 dout(5) << __func__
2512 << " lc (" << first << " .. " << last << ")"
2513 << " last_pinned " << last_pinned
2514 << " interval " << prune_interval
2515 << " last_to_pin " << last_to_pin
2516 << dendl;
2517
2518 // We will be erasing maps as we go.
2519 //
2520 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2521 //
2522 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2523 // we stop pruning. We could prune the maps between `next_to_pin` and
2524 // `last_to_pin`, but by not doing it we end up with neater pruned
2525 // intervals, aligned with `prune_interval`. Besides, this should not be a
2526 // problem as long as `prune_interval` is set to a sane value, instead of
2527 // hundreds or thousands of maps.
2528
2529 auto map_exists = [this](version_t v) {
2530 string k = mon->store->combine_strings("full", v);
2531 return mon->store->exists(get_service_name(), k);
2532 };
2533
2534 // 'interval' represents the number of maps from the last pinned
2535 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2536 // version 11 next; all intermediate versions will be removed.
2537 //
2538 // 'txsize' represents the maximum number of versions we'll be removing in
2539 // this iteration. If 'txsize' is large enough to perform multiple passes
2540 // pinning and removing maps, we will do so; if not, we'll do at least one
2541 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2542 // ensure that we never go *over* the maximum.
2543
2544 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2545 uint64_t removal_interval = prune_interval - 1;
2546
2547 if (txsize < removal_interval) {
2548 dout(5) << __func__
2549 << " setting txsize to removal interval size ("
2550 << removal_interval << " versions"
2551 << dendl;
2552 txsize = removal_interval;
2553 }
2554 ceph_assert(removal_interval > 0);
2555
2556 uint64_t num_pruned = 0;
2557 while (num_pruned + removal_interval <= txsize) {
2558 last_pinned = manifest.get_last_pinned();
2559
2560 if (last_pinned + prune_interval > last_to_pin) {
2561 break;
2562 }
2563 ceph_assert(last_pinned < last_to_pin);
2564
2565 version_t next_pinned = last_pinned + prune_interval;
2566 ceph_assert(next_pinned <= last_to_pin);
2567 manifest.pin(next_pinned);
2568
2569 dout(20) << __func__
2570 << " last_pinned " << last_pinned
2571 << " next_pinned " << next_pinned
2572 << " num_pruned " << num_pruned
2573 << " removal interval (" << (last_pinned+1)
2574 << ".." << (next_pinned-1) << ")"
2575 << " txsize " << txsize << dendl;
2576
2577 ceph_assert(map_exists(last_pinned));
2578 ceph_assert(map_exists(next_pinned));
2579
2580 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2581 ceph_assert(!manifest.is_pinned(v));
2582
2583 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2584 string full_key = mon->store->combine_strings("full", v);
2585 tx->erase(get_service_name(), full_key);
2586 ++num_pruned;
2587 }
2588 }
2589
2590 ceph_assert(num_pruned > 0);
2591
2592 bufferlist bl;
2593 manifest.encode(bl);
2594 tx->put(get_service_name(), "osdmap_manifest", bl);
2595
2596 return true;
2597 }
2598
2599
2600 // -------------
2601
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // First-stage dispatch: answer reads directly and filter out update
  // requests that need no map change.  Returns true if the op was fully
  // handled here; false means it requires a map update and will be routed
  // to prepare_update().
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // the dispatcher should never hand us any other message type
    ceph_abort();
    return true;
  }
}
2657
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Second-stage dispatch for ops that preprocess_query() decided require a
  // map update.  Returns true if the op should trigger a proposal.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query() should never let any other type through
    ceph_abort();
  }

  return false;
}
2709
2710 bool OSDMonitor::should_propose(double& delay)
2711 {
2712 dout(10) << "should_propose" << dendl;
2713
2714 // if full map, propose immediately! any subsequent changes will be clobbered.
2715 if (pending_inc.fullmap.length())
2716 return true;
2717
2718 // adjust osd weights?
2719 if (!osd_weight.empty() &&
2720 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2721 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2722 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2723 delay = 0.0;
2724 osd_weight.clear();
2725 return true;
2726 }
2727
2728 return PaxosService::should_propose(delay);
2729 }
2730
2731
2732
2733 // ---------------------------
2734 // READs
2735
2736 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
2737 {
2738 op->mark_osdmon_event(__func__);
2739 auto m = op->get_req<MMonGetOSDMap>();
2740
2741 uint64_t features = mon->get_quorum_con_features();
2742 if (op->get_session() && op->get_session()->con_features)
2743 features = op->get_session()->con_features;
2744
2745 dout(10) << __func__ << " " << *m << dendl;
2746 MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
2747 epoch_t first = get_first_committed();
2748 epoch_t last = osdmap.get_epoch();
2749 int max = g_conf()->osd_map_message_max;
2750 ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
2751 for (epoch_t e = std::max(first, m->get_full_first());
2752 e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
2753 ++e, --max) {
2754 bufferlist& bl = reply->maps[e];
2755 int r = get_version_full(e, features, bl);
2756 ceph_assert(r >= 0);
2757 max_bytes -= bl.length();
2758 }
2759 for (epoch_t e = std::max(first, m->get_inc_first());
2760 e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
2761 ++e, --max) {
2762 bufferlist& bl = reply->incremental_maps[e];
2763 int r = get_version(e, features, bl);
2764 ceph_assert(r >= 0);
2765 max_bytes -= bl.length();
2766 }
2767 reply->oldest_map = first;
2768 reply->newest_map = last;
2769 mon->send_reply(op, reply);
2770 return true;
2771 }
2772
2773
2774 // ---------------------------
2775 // UPDATEs
2776
2777 // failure --
2778
2779 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2780 // check permissions
2781 MonSession *session = op->get_session();
2782 if (!session)
2783 return true;
2784 if (!session->is_capable("osd", MON_CAP_X)) {
2785 dout(0) << "got MOSDFailure from entity with insufficient caps "
2786 << session->caps << dendl;
2787 return true;
2788 }
2789 if (fsid != mon->monmap->fsid) {
2790 dout(0) << "check_source: on fsid " << fsid
2791 << " != " << mon->monmap->fsid << dendl;
2792 return true;
2793 }
2794 return false;
2795 }
2796
2797
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Vet an MOSDFailure report.  Returns true when the message was fully
  // handled here (bad source, stale, duplicate, or blocked); false lets it
  // continue to prepare_failure() to actually record the failure.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, re-addressed, or itself down: refresh its map
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    // target already marked down; just help the reporter catch up
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to an older instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down(badboy) half of this condition appears
  // unreachable — it was already handled above; verify before simplifying.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // nodown flag or down-ratio limit forbids marking this osd down
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // a genuinely new failure report; let prepare_failure() handle it
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2869
// Completion for MOSDMarkMeDown handling: once the request has been
// processed, send the requesting osd an acknowledgement (an MOSDMarkMeDown
// echo with request_ack cleared), or re-dispatch the op on -EAGAIN.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: echo the request back to the osd as the ACK
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false)); // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // retryable (e.g. no longer leader); feed the op back into dispatch
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2898
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  // Vet an osd's request to be marked down.  Returns true when handled
  // here (rejected, possibly with an immediate ack); false lets the op
  // continue to prepare_mark_me_down().
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    // requester is unknown, already down, or a stale instance; just send
    // it newer maps
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even when refusing, send the ack if one was requested so the osd does
  // not block waiting for it
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2937
2938 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2939 {
2940 op->mark_osdmon_event(__func__);
2941 auto m = op->get_req<MOSDMarkMeDown>();
2942 int target_osd = m->target_osd;
2943
2944 ceph_assert(osdmap.is_up(target_osd));
2945 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2946
2947 mon->clog->info() << "osd." << target_osd << " marked itself down";
2948 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2949 if (m->request_ack)
2950 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2951 return true;
2952 }
2953
2954 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2955 {
2956 op->mark_osdmon_event(__func__);
2957 auto m = op->get_req<MOSDMarkMeDead>();
2958 int from = m->target_osd;
2959
2960 // check permissions
2961 if (check_source(op, m->fsid)) {
2962 mon->no_reply(op);
2963 return true;
2964 }
2965
2966 // first, verify the reporting host is valid
2967 if (!m->get_orig_source().is_osd()) {
2968 mon->no_reply(op);
2969 return true;
2970 }
2971
2972 if (!osdmap.exists(from) ||
2973 !osdmap.is_down(from)) {
2974 dout(5) << __func__ << " from nonexistent or up osd." << from
2975 << ", ignoring" << dendl;
2976 send_incremental(op, m->get_epoch()+1);
2977 mon->no_reply(op);
2978 return true;
2979 }
2980
2981 return false;
2982 }
2983
2984 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
2985 {
2986 op->mark_osdmon_event(__func__);
2987 auto m = op->get_req<MOSDMarkMeDead>();
2988 int target_osd = m->target_osd;
2989
2990 ceph_assert(osdmap.is_down(target_osd));
2991
2992 mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
2993 << m->get_epoch();
2994 if (!pending_inc.new_xinfo.count(target_osd)) {
2995 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
2996 }
2997 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
2998 wait_for_finished_proposal(
2999 op,
3000 new LambdaContext(
3001 [op, this] (int r) {
3002 if (r >= 0) {
3003 mon->no_reply(op); // ignore on success
3004 }
3005 }
3006 ));
3007 return true;
3008 }
3009
3010 bool OSDMonitor::can_mark_down(int i)
3011 {
3012 if (osdmap.is_nodown(i)) {
3013 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3014 << "will not mark it down" << dendl;
3015 return false;
3016 }
3017
3018 int num_osds = osdmap.get_num_osds();
3019 if (num_osds == 0) {
3020 dout(5) << __func__ << " no osds" << dendl;
3021 return false;
3022 }
3023 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3024 float up_ratio = (float)up / (float)num_osds;
3025 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3026 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3027 << g_conf()->mon_osd_min_up_ratio
3028 << ", will not mark osd." << i << " down" << dendl;
3029 return false;
3030 }
3031 return true;
3032 }
3033
3034 bool OSDMonitor::can_mark_up(int i)
3035 {
3036 if (osdmap.is_noup(i)) {
3037 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3038 << "will not mark it up" << dendl;
3039 return false;
3040 }
3041
3042 return true;
3043 }
3044
3045 /**
3046 * @note the parameter @p i apparently only exists here so we can output the
3047 * osd's id on messages.
3048 */
3049 bool OSDMonitor::can_mark_out(int i)
3050 {
3051 if (osdmap.is_noout(i)) {
3052 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3053 << "will not mark it out" << dendl;
3054 return false;
3055 }
3056
3057 int num_osds = osdmap.get_num_osds();
3058 if (num_osds == 0) {
3059 dout(5) << __func__ << " no osds" << dendl;
3060 return false;
3061 }
3062 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3063 float in_ratio = (float)in / (float)num_osds;
3064 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3065 if (i >= 0)
3066 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3067 << g_conf()->mon_osd_min_in_ratio
3068 << ", will not mark osd." << i << " out" << dendl;
3069 else
3070 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3071 << g_conf()->mon_osd_min_in_ratio
3072 << ", will not mark osds out" << dendl;
3073 return false;
3074 }
3075
3076 return true;
3077 }
3078
3079 bool OSDMonitor::can_mark_in(int i)
3080 {
3081 if (osdmap.is_noin(i)) {
3082 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3083 << "will not mark it in" << dendl;
3084 return false;
3085 }
3086
3087 return true;
3088 }
3089
3090 bool OSDMonitor::check_failures(utime_t now)
3091 {
3092 bool found_failure = false;
3093 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3094 p != failure_info.end();
3095 ++p) {
3096 if (can_mark_down(p->first)) {
3097 found_failure |= check_failure(now, p->first, p->second);
3098 }
3099 }
3100 return found_failure;
3101 }
3102
3103 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3104 {
3105 // already pending failure?
3106 if (pending_inc.new_state.count(target_osd) &&
3107 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3108 dout(10) << " already pending failure" << dendl;
3109 return true;
3110 }
3111
3112 set<string> reporters_by_subtree;
3113 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3114 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3115 utime_t max_failed_since = fi.get_failed_since();
3116 utime_t failed_for = now - max_failed_since;
3117
3118 utime_t grace = orig_grace;
3119 double my_grace = 0, peer_grace = 0;
3120 double decay_k = 0;
3121 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3122 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3123 decay_k = ::log(.5) / halflife;
3124
3125 // scale grace period based on historical probability of 'lagginess'
3126 // (false positive failures due to slowness).
3127 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3128 double decay = exp((double)failed_for * decay_k);
3129 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3130 << " failed_for " << failed_for << " decay " << decay << dendl;
3131 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3132 grace += my_grace;
3133 }
3134
3135 // consider the peers reporting a failure a proxy for a potential
3136 // 'subcluster' over the overall cluster that is similarly
3137 // laggy. this is clearly not true in all cases, but will sometimes
3138 // help us localize the grace correction to a subset of the system
3139 // (say, a rack with a bad switch) that is unhappy.
3140 ceph_assert(fi.reporters.size());
3141 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3142 // get the parent bucket whose type matches with "reporter_subtree_level".
3143 // fall back to OSD if the level doesn't exist.
3144 if (osdmap.exists(p->first)) {
3145 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3146 if (auto iter = reporter_loc.find(reporter_subtree_level);
3147 iter == reporter_loc.end()) {
3148 reporters_by_subtree.insert("osd." + to_string(p->first));
3149 } else {
3150 reporters_by_subtree.insert(iter->second);
3151 }
3152 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3153 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3154 utime_t elapsed = now - xi.down_stamp;
3155 double decay = exp((double)elapsed * decay_k);
3156 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3157 }
3158 ++p;
3159 } else {
3160 fi.cancel_report(p->first);;
3161 p = fi.reporters.erase(p);
3162 }
3163 }
3164
3165 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3166 peer_grace /= (double)fi.reporters.size();
3167 grace += peer_grace;
3168 }
3169
3170 dout(10) << " osd." << target_osd << " has "
3171 << fi.reporters.size() << " reporters, "
3172 << grace << " grace (" << orig_grace << " + " << my_grace
3173 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3174 << dendl;
3175
3176 if (failed_for >= grace &&
3177 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3178 dout(1) << " we have enough reporters to mark osd." << target_osd
3179 << " down" << dendl;
3180 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3181
3182 mon->clog->info() << "osd." << target_osd << " failed ("
3183 << osdmap.crush->get_full_location_ordered_string(
3184 target_osd)
3185 << ") ("
3186 << (int)reporters_by_subtree.size()
3187 << " reporters from different "
3188 << reporter_subtree_level << " after "
3189 << failed_for << " >= grace " << grace << ")";
3190 return true;
3191 }
3192 return false;
3193 }
3194
3195 void OSDMonitor::force_failure(int target_osd, int by)
3196 {
3197 // already pending failure?
3198 if (pending_inc.new_state.count(target_osd) &&
3199 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3200 dout(10) << " already pending failure" << dendl;
3201 return;
3202 }
3203
3204 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3205 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3206 if (!pending_inc.new_xinfo.count(target_osd)) {
3207 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3208 }
3209 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3210
3211 mon->clog->info() << "osd." << target_osd << " failed ("
3212 << osdmap.crush->get_full_location_ordered_string(target_osd)
3213 << ") (connection refused reported by osd." << by << ")";
3214 return;
3215 }
3216
// Handle an MOSDFailure for a currently-up target osd: either record a
// (possibly immediate) failure report, or cancel a previously filed one.
// Returns true when a map change is pending and should be proposed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // presumably validated before dispatch here; the asserts enforce it
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  // the reporter gets no direct reply; it learns the outcome via the map
  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: skip the grace machinery entirely
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a previous report from this reporter (if any) is superseded; drop
    // the op that was attached to it
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    // decide whether we now have enough evidence to mark the osd down
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      // drop the whole entry when the last reporter withdraws
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3283
3284 void OSDMonitor::process_failures()
3285 {
3286 map<int,failure_info_t>::iterator p = failure_info.begin();
3287 while (p != failure_info.end()) {
3288 if (osdmap.is_up(p->first)) {
3289 ++p;
3290 } else {
3291 dout(10) << "process_failures osd." << p->first << dendl;
3292 list<MonOpRequestRef> ls;
3293 p->second.take_report_messages(ls);
3294 failure_info.erase(p++);
3295
3296 while (!ls.empty()) {
3297 MonOpRequestRef o = ls.front();
3298 if (o) {
3299 o->mark_event(__func__);
3300 MOSDFailure *m = o->get_req<MOSDFailure>();
3301 send_latest(o, m->get_epoch());
3302 mon->no_reply(o);
3303 }
3304 ls.pop_front();
3305 }
3306 }
3307 }
3308 }
3309
3310 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3311 {
3312 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3313
3314 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3315 p != failure_info.end();
3316 ++p) {
3317 p->second.take_report_messages(ls);
3318 }
3319 failure_info.clear();
3320 }
3321
3322
3323 // boot --
3324
// Validate an MOSDBoot message.  Returns true when the message has been
// fully handled here (ignored, or answered as a duplicate/stale boot),
// and false when it must go on to prepare_boot() for a map change.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.  treat as a duplicate boot: reply, but don't touch the map
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a different osd previously claimed this id (different fsid)?
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's last up interval?
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3447
// Effect the map changes for a booting osd.  If a previous instance is
// still marked up, queue it down first and retry this op after the
// proposal; otherwise record the new instance's addresses, weight, uuid,
// metadata, clean interval and laggy statistics in the pending
// incremental.  Always returns true.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state xor any pending state toggles
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the exponentially-decayed laggy statistics in xinfo
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // first boot of this instance: decay the laggy estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after having been marked down: fold the observed down
      // interval into the laggy estimates
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval = g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	// restore the pre-out weight if one was saved
	if (xi.old_weight > 0) {
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3602
3603 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3604 {
3605 op->mark_osdmon_event(__func__);
3606 auto m = op->get_req<MOSDBoot>();
3607 dout(7) << "_booted " << m->get_orig_source_inst()
3608 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3609
3610 if (logit) {
3611 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3612 << " boot";
3613 }
3614
3615 send_latest(op, m->sb.current_epoch+1);
3616 }
3617
3618
3619 // -------------
3620 // full
3621
3622 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3623 {
3624 op->mark_osdmon_event(__func__);
3625 auto m = op->get_req<MOSDFull>();
3626 int from = m->get_orig_source().num();
3627 set<string> state;
3628 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3629
3630 // check permissions, ignore if failed
3631 MonSession *session = op->get_session();
3632 if (!session)
3633 goto ignore;
3634 if (!session->is_capable("osd", MON_CAP_X)) {
3635 dout(0) << "MOSDFull from entity with insufficient privileges:"
3636 << session->caps << dendl;
3637 goto ignore;
3638 }
3639
3640 // ignore a full message from the osd instance that already went down
3641 if (!osdmap.exists(from)) {
3642 dout(7) << __func__ << " ignoring full message from nonexistent "
3643 << m->get_orig_source_inst() << dendl;
3644 goto ignore;
3645 }
3646 if ((!osdmap.is_up(from) &&
3647 osdmap.get_most_recent_addrs(from).legacy_equals(
3648 m->get_orig_source_addrs())) ||
3649 (osdmap.is_up(from) &&
3650 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3651 dout(7) << __func__ << " ignoring full message from down "
3652 << m->get_orig_source_inst() << dendl;
3653 goto ignore;
3654 }
3655
3656 OSDMap::calc_state_set(osdmap.get_state(from), state);
3657
3658 if ((osdmap.get_state(from) & mask) == m->state) {
3659 dout(7) << __func__ << " state already " << state << " for osd." << from
3660 << " " << m->get_orig_source_inst() << dendl;
3661 _reply_map(op, m->version);
3662 goto ignore;
3663 }
3664
3665 dout(10) << __func__ << " want state " << state << " for osd." << from
3666 << " " << m->get_orig_source_inst() << dendl;
3667 return false;
3668
3669 ignore:
3670 return true;
3671 }
3672
3673 bool OSDMonitor::prepare_full(MonOpRequestRef op)
3674 {
3675 op->mark_osdmon_event(__func__);
3676 auto m = op->get_req<MOSDFull>();
3677 const int from = m->get_orig_source().num();
3678
3679 const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3680 const unsigned want_state = m->state & mask; // safety first
3681
3682 unsigned cur_state = osdmap.get_state(from);
3683 auto p = pending_inc.new_state.find(from);
3684 if (p != pending_inc.new_state.end()) {
3685 cur_state ^= p->second;
3686 }
3687 cur_state &= mask;
3688
3689 set<string> want_state_set, cur_state_set;
3690 OSDMap::calc_state_set(want_state, want_state_set);
3691 OSDMap::calc_state_set(cur_state, cur_state_set);
3692
3693 if (cur_state != want_state) {
3694 if (p != pending_inc.new_state.end()) {
3695 p->second &= ~mask;
3696 } else {
3697 pending_inc.new_state[from] = 0;
3698 }
3699 pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
3700 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3701 << " -> " << want_state_set << dendl;
3702 } else {
3703 dout(7) << __func__ << " osd." << from << " " << cur_state_set
3704 << " = wanted " << want_state_set << ", just waiting" << dendl;
3705 }
3706
3707 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3708 return true;
3709 }
3710
3711 // -------------
3712 // alive
3713
3714 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3715 {
3716 op->mark_osdmon_event(__func__);
3717 auto m = op->get_req<MOSDAlive>();
3718 int from = m->get_orig_source().num();
3719
3720 // check permissions, ignore if failed
3721 MonSession *session = op->get_session();
3722 if (!session)
3723 goto ignore;
3724 if (!session->is_capable("osd", MON_CAP_X)) {
3725 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3726 << session->caps << dendl;
3727 goto ignore;
3728 }
3729
3730 if (!osdmap.is_up(from) ||
3731 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3732 dout(7) << "preprocess_alive ignoring alive message from down "
3733 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3734 << dendl;
3735 goto ignore;
3736 }
3737
3738 if (osdmap.get_up_thru(from) >= m->want) {
3739 // yup.
3740 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3741 _reply_map(op, m->version);
3742 return true;
3743 }
3744
3745 dout(10) << "preprocess_alive want up_thru " << m->want
3746 << " from " << m->get_orig_source_inst() << dendl;
3747 return false;
3748
3749 ignore:
3750 return true;
3751 }
3752
3753 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3754 {
3755 op->mark_osdmon_event(__func__);
3756 auto m = op->get_req<MOSDAlive>();
3757 int from = m->get_orig_source().num();
3758
3759 if (0) { // we probably don't care much about these
3760 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3761 }
3762
3763 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3764 << " from " << m->get_orig_source_inst() << dendl;
3765
3766 update_up_thru(from, m->version); // set to the latest map the OSD has
3767 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3768 return true;
3769 }
3770
3771 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3772 {
3773 op->mark_osdmon_event(__func__);
3774 dout(7) << "_reply_map " << e
3775 << " from " << op->get_req()->get_orig_source_inst()
3776 << dendl;
3777 send_latest(op, e);
3778 }
3779
3780 // pg_created
3781 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3782 {
3783 op->mark_osdmon_event(__func__);
3784 auto m = op->get_req<MOSDPGCreated>();
3785 dout(10) << __func__ << " " << *m << dendl;
3786 auto session = op->get_session();
3787 mon->no_reply(op);
3788 if (!session) {
3789 dout(10) << __func__ << ": no monitor session!" << dendl;
3790 return true;
3791 }
3792 if (!session->is_capable("osd", MON_CAP_X)) {
3793 derr << __func__ << " received from entity "
3794 << "with insufficient privileges " << session->caps << dendl;
3795 return true;
3796 }
3797 // always forward the "created!" to the leader
3798 return false;
3799 }
3800
3801 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3802 {
3803 op->mark_osdmon_event(__func__);
3804 auto m = op->get_req<MOSDPGCreated>();
3805 dout(10) << __func__ << " " << *m << dendl;
3806 auto src = m->get_orig_source();
3807 auto from = src.num();
3808 if (!src.is_osd() ||
3809 !mon->osdmon()->osdmap.is_up(from) ||
3810 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3811 m->get_orig_source_addrs())) {
3812 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3813 return false;
3814 }
3815 pending_created_pgs.push_back(m->pgid);
3816 return true;
3817 }
3818
3819 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3820 {
3821 op->mark_osdmon_event(__func__);
3822 auto m = op->get_req<MOSDPGReadyToMerge>();
3823 dout(10) << __func__ << " " << *m << dendl;
3824 const pg_pool_t *pi;
3825 auto session = op->get_session();
3826 if (!session) {
3827 dout(10) << __func__ << ": no monitor session!" << dendl;
3828 goto ignore;
3829 }
3830 if (!session->is_capable("osd", MON_CAP_X)) {
3831 derr << __func__ << " received from entity "
3832 << "with insufficient privileges " << session->caps << dendl;
3833 goto ignore;
3834 }
3835 pi = osdmap.get_pg_pool(m->pgid.pool());
3836 if (!pi) {
3837 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3838 goto ignore;
3839 }
3840 if (pi->get_pg_num() <= m->pgid.ps()) {
3841 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3842 goto ignore;
3843 }
3844 if (pi->get_pg_num() != m->pgid.ps() + 1) {
3845 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
3846 goto ignore;
3847 }
3848 if (pi->get_pg_num_pending() > m->pgid.ps()) {
3849 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
3850 goto ignore;
3851 }
3852 return false;
3853
3854 ignore:
3855 mon->no_reply(op);
3856 return true;
3857 }
3858
// Apply an osd's ready-to-merge (or not-ready) notification for the
// highest pg of a pool whose pg_num is being decreased.  Always returns
// true (a change or retry is always queued).
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // start from the pending pool state if it is already being modified
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // complete the merge: actually decrease pg_num
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: occasionally bounce pg_num back up to exercise the
  // merge-cancel path
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3916
3917
3918 // -------------
3919 // pg_temp changes
3920
// Fast-path handling of MOSDPGTemp: decide whether the message can be
// answered or dropped without mutating pending state.  Returns true when
// fully handled here; false hands the op off to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // only accept pg_temp from an osd that is up and whose addresses match
  // the current map; anything else is a stale message.
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced pg_temp is always applied; skip the no-change checks below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // nothing would change; just confirm the epoch to the sender.
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
4013
4014 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4015 {
4016 epoch_t old_up_thru = osdmap.get_up_thru(from);
4017 auto ut = pending_inc.new_up_thru.find(from);
4018 if (ut != pending_inc.new_up_thru.end()) {
4019 old_up_thru = ut->second;
4020 }
4021 if (up_thru > old_up_thru) {
4022 // set up_thru too, so the osd doesn't have to ask again
4023 pending_inc.new_up_thru[from] = up_thru;
4024 }
4025 }
4026
// Apply the pg_temp mappings from an MOSDPGTemp into pending_inc and
// reply with the map once the proposal commits.  Always returns true.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGTemp>();
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // skip pools being deleted in this same proposal...
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool pending removal" << dendl;
      continue;
    }
    // ...and pools that are already gone from the committed map.
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
        pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
4062
4063
4064 // ---
4065
// Fast-path for MRemoveSnaps: if every requested snap is already recorded
// as removed (and not newer than the pool's snap_seq), acknowledge without
// proposing; otherwise return false so prepare_remove_snaps() runs.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // anything new (beyond snap_seq, or not yet marked removed) needs
      // a map update -> go to the prepare phase.
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // octopus+ osds expect an explicit ack even when nothing changed
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4115
// Queue requested snap removals into the pending incremental.  For each
// snap that is not already removed (in the committed map, the pending
// pool, or the pending removed-snaps queue), update the pool's snap
// bookkeeping and add it to new_removed_snaps.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in committed or pending state
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        // pre-octopus maps track removed snaps in the pool itself
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // keep snap_seq at least as large as any removed snap id
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ osds expect an ack listing the snaps once the map commits
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4164
4165 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4166 {
4167 op->mark_osdmon_event(__func__);
4168 auto m = op->get_req<MMonGetPurgedSnaps>();
4169 dout(7) << __func__ << " " << *m << dendl;
4170
4171 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4172
4173 string k = make_purged_snap_epoch_key(m->start);
4174 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4175 it->upper_bound(k);
4176 unsigned long epoch = m->last;
4177 while (it->valid()) {
4178 if (it->key().find("purged_epoch_") != 0) {
4179 break;
4180 }
4181 string k = it->key();
4182 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4183 if (n != 1) {
4184 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4185 } else if (epoch > m->last) {
4186 break;
4187 } else {
4188 bufferlist bl = it->value();
4189 auto p = bl.cbegin();
4190 auto &v = r[epoch];
4191 try {
4192 ceph::decode(v, p);
4193 } catch (buffer::error& e) {
4194 derr << __func__ << " unable to parse value for key '" << it->key()
4195 << "': \n";
4196 bl.hexdump(*_dout);
4197 *_dout << dendl;
4198 }
4199 n += 4 + v.size() * 16;
4200 }
4201 if (n > 1048576) {
4202 // impose a semi-arbitrary limit to message size
4203 break;
4204 }
4205 it->next();
4206 }
4207
4208 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4209 reply->purged_snaps.swap(r);
4210 mon->send_reply(op, reply.detach());
4211
4212 return true;
4213 }
4214
4215 // osd beacon
4216 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4217 {
4218 op->mark_osdmon_event(__func__);
4219 // check caps
4220 auto session = op->get_session();
4221 mon->no_reply(op);
4222 if (!session) {
4223 dout(10) << __func__ << " no monitor session!" << dendl;
4224 return true;
4225 }
4226 if (!session->is_capable("osd", MON_CAP_X)) {
4227 derr << __func__ << " received from entity "
4228 << "with insufficient privileges " << session->caps << dendl;
4229 return true;
4230 }
4231 // Always forward the beacon to the leader, even if they are the same as
4232 // the old one. The leader will mark as down osds that haven't sent
4233 // beacon for a few minutes.
4234 return false;
4235 }
4236
// Leader-side beacon handling: refresh the osd's liveness bookkeeping and
// last-clean-epoch reports.  Returns true (propose) only when the beacon
// advances last_purged_snaps_scrub in the osd's xinfo.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // drop beacons from senders that are not an up osd with matching addrs
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory liveness state only; no map change needed for these
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer purged-snaps scrub stamp via the pending incremental
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4277
4278 // ---------------
4279 // map helpers
4280
4281 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4282 {
4283 op->mark_osdmon_event(__func__);
4284 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4285 << " start " << start << dendl;
4286 if (start == 0)
4287 send_full(op);
4288 else
4289 send_incremental(op, start);
4290 }
4291
4292
4293 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4294 {
4295 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4296 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4297 r->oldest_map = get_first_committed();
4298 r->newest_map = osdmap.get_epoch();
4299 return r;
4300 }
4301
// Build an MOSDMap with incremental maps for epochs [from..to], encoded
// for 'features'.  If an incremental is missing for an epoch, fall back
// to embedding the full map for that epoch.  Caller owns the result.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards so the e > 0 guard also protects against underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch; must have the full map instead
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental   full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4335
4336 void OSDMonitor::send_full(MonOpRequestRef op)
4337 {
4338 op->mark_osdmon_event(__func__);
4339 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4340 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
4341 }
4342
4343 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4344 {
4345 op->mark_osdmon_event(__func__);
4346
4347 MonSession *s = op->get_session();
4348 ceph_assert(s);
4349
4350 if (s->proxy_con) {
4351 // oh, we can tell the other mon to do it
4352 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4353 << first << dendl;
4354 MRoute *r = new MRoute(s->proxy_tid, NULL);
4355 r->send_osdmap_first = first;
4356 s->proxy_con->send_message(r);
4357 op->mark_event("reply: send routed send_osdmap_first reply");
4358 } else {
4359 // do it ourselves
4360 send_incremental(first, s, false, op);
4361 }
4362 }
4363
// Send incremental maps [first..current] to a session, in chunks of
// osd_map_message_max.  If 'req' is set, exactly one reply message is
// sent (the requester will ask again for the rest); if 'onetime', only
// the first chunk is sent.  Updates session->osd_epoch as we go.
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // don't resend epochs the session is known to already have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epochs have been trimmed; start from a full map at our
    // oldest committed epoch instead.
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the peer re-requests for more
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req implies a single reply; onetime implies a single chunk
    if (onetime || req)
      break;
  }
}
4426
4427 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4428 {
4429 return get_version(ver, mon->get_quorum_con_features(), bl);
4430 }
4431
4432 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4433 {
4434 OSDMap::Incremental inc;
4435 auto q = bl.cbegin();
4436 inc.decode(q);
4437 // always encode with subset of osdmap's canonical features
4438 uint64_t f = features & inc.encode_features;
4439 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4440 << dendl;
4441 bl.clear();
4442 if (inc.fullmap.length()) {
4443 // embedded full map?
4444 OSDMap m;
4445 m.decode(inc.fullmap);
4446 inc.fullmap.clear();
4447 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4448 }
4449 if (inc.crush.length()) {
4450 // embedded crush map
4451 CrushWrapper c;
4452 auto p = inc.crush.cbegin();
4453 c.decode(p);
4454 inc.crush.clear();
4455 c.encode(inc.crush, f);
4456 }
4457 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4458 }
4459
4460 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4461 {
4462 OSDMap m;
4463 auto q = bl.cbegin();
4464 m.decode(q);
4465 // always encode with subset of osdmap's canonical features
4466 uint64_t f = features & m.get_encoding_features();
4467 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4468 << dendl;
4469 bl.clear();
4470 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4471 }
4472
4473 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4474 {
4475 uint64_t significant_features = OSDMap::get_significant_features(features);
4476 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4477 return 0;
4478 }
4479 int ret = PaxosService::get_version(ver, bl);
4480 if (ret < 0) {
4481 return ret;
4482 }
4483 // NOTE: this check is imprecise; the OSDMap encoding features may
4484 // be a subset of the latest mon quorum features, but worst case we
4485 // reencode once and then cache the (identical) result under both
4486 // feature masks.
4487 if (significant_features !=
4488 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4489 reencode_incremental_map(bl, features);
4490 }
4491 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4492 return 0;
4493 }
4494
4495 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4496 {
4497 bufferlist inc_bl;
4498 int err = get_version(ver, inc_bl);
4499 ceph_assert(err == 0);
4500 ceph_assert(inc_bl.length());
4501
4502 auto p = inc_bl.cbegin();
4503 inc.decode(p);
4504 dout(10) << __func__ << " "
4505 << " epoch " << inc.epoch
4506 << " inc_crc " << inc.inc_crc
4507 << " full_crc " << inc.full_crc
4508 << " encode_features " << inc.encode_features << dendl;
4509 return 0;
4510 }
4511
// Reconstruct the full osdmap for epoch 'ver' from the closest pinned
// (or cached) full map at or below it, replaying incrementals on top.
// Used when the full map itself has been trimmed from the store.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // Prefer a cached full map between the pinned epoch and 'ver' to reduce
  // the number of incrementals we must replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                              &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    // a pinned map must exist in the store
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  // remember the last incremental's encode features so the final
  // re-encode matches the canonical encoding
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4611
4612 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4613 {
4614 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4615 }
4616
// Fetch the full map for 'ver' encoded for 'features'.  Checks the
// per-(epoch, significant-features) cache first; if the store no longer
// has the full map (trimmed), rebuilds it from a pinned map.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
                                 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4643
4644 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4645 {
4646 dout(10) << "blacklist " << av << " until " << until << dendl;
4647 for (auto a : av.v) {
4648 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4649 a.set_type(entity_addr_t::TYPE_ANY);
4650 } else {
4651 a.set_type(entity_addr_t::TYPE_LEGACY);
4652 }
4653 pending_inc.new_blacklist[a] = until;
4654 }
4655 return pending_inc.epoch;
4656 }
4657
4658 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4659 {
4660 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4661 a.set_type(entity_addr_t::TYPE_ANY);
4662 } else {
4663 a.set_type(entity_addr_t::TYPE_LEGACY);
4664 }
4665 dout(10) << "blacklist " << a << " until " << until << dendl;
4666 pending_inc.new_blacklist[a] = until;
4667 return pending_inc.epoch;
4668 }
4669
4670
// Walk all "osdmap" subscriptions and send any maps the subscribers are
// missing.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    return;
  }
  auto osdmap_subs = mon->session_map.subs.find("osdmap");
  if (osdmap_subs == mon->session_map.subs.end()) {
    return;
  }
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    // advance before the callback: check_osdmap_sub() may remove a
    // onetime sub, which would invalidate the current position
    ++p;
    check_osdmap_sub(sub);
  }
}
4688
4689 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4690 {
4691 dout(10) << __func__ << " " << sub << " next " << sub->next
4692 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4693 if (sub->next <= osdmap.get_epoch()) {
4694 if (sub->next >= 1)
4695 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4696 else
4697 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4698 if (sub->onetime)
4699 mon->session_map.remove_sub(sub);
4700 else
4701 sub->next = osdmap.get_epoch() + 1;
4702 }
4703 }
4704
// Walk all "osd_pg_creates" subscriptions and send pending pg-create
// messages to subscribed (up) osds.
void OSDMonitor::check_pg_creates_subs()
{
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // all up osds must support stateful subscriptions for this to work
  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
        return;
      }
      for (auto sub : *pg_creates_subs->second) {
        check_pg_creates_sub(sub);
      }
    });
}
4721
4722 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4723 {
4724 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4725 ceph_assert(sub->type == "osd_pg_creates");
4726 // only send these if the OSD is up. we will check_subs() when they do
4727 // come up so they will get the creates then.
4728 if (sub->session->name.is_osd() &&
4729 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4730 sub->next = send_pg_creates(sub->session->name.num(),
4731 sub->session->con.get(),
4732 sub->next);
4733 }
4734 }
4735
// Enable (or annotate) an application on a pool in the pending
// incremental.  With 'force', an existing app_key value is overwritten;
// otherwise insert() leaves any pre-existing entry untouched.  Must be
// called with paxos plugged and the service writeable.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
                                       const std::string &app_key,
                                       const std::string &app_value,
                                       bool force)
{
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  // application metadata only exists from luminous on
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // copy-on-write: start from the pending pool if this proposal already
  // modified it, else from the committed pool
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // just enable the application, no key/value metadata
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // overwrite any existing value for this key
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() is a no-op if the application already exists
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4769
4770 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4771 pool_opts_t::key_t opt,
4772 pool_opts_t::value_t val)
4773 {
4774 auto p = pending_inc.new_pools.try_emplace(
4775 pool_id, *osdmap.get_pg_pool(pool_id));
4776 p.first->second.opts.set(opt, val);
4777 }
4778
// Scan 'pools' for pools whose pgs still need creating and queue them in
// 'creating_pgs'.  Pools already created, being removed, unchanged since
// the last scan, or with a broken crush rule are skipped.  Returns the
// number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    // pools whose crush rule cannot be resolved can't map pgs; skip them
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
                                         pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // skip pools untouched since our last scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
               << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
               << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
             << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
                              created, modified);
    queued++;
  }
  return queued;
}
4818
// Rebuild creating_pgs_by_osd_epoch from creating_pgs using the current
// pg-to-osd mapping: for each creating pg, decide which acting primary
// should receive its create message and at which epoch.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
               << dendl;
      continue;
    }
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(spgid)) {
          if (last_acting_primary == acting_primary) {
            // same target as before: keep the previously recorded epoch
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          // NOTE(review): this runs for every epoch bucket that does NOT
          // contain spgid, repeatedly resetting 'mapped'; presumably only
          // the final outcome matters — confirm intent before touching.
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4866
4867 epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
4868 {
4869 dout(30) << __func__ << " osd." << osd << " next=" << next
4870 << " " << creating_pgs_by_osd_epoch << dendl;
4871 std::lock_guard<std::mutex> l(creating_pgs_lock);
4872 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
4873 dout(20) << __func__
4874 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
4875 // the subscribers will be updated when the mapping is completed anyway
4876 return next;
4877 }
4878 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
4879 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
4880 return next;
4881 ceph_assert(!creating_pgs_by_epoch->second.empty());
4882
4883 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
4884 MOSDPGCreate2 *m = nullptr;
4885
4886 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
4887
4888 epoch_t last = 0;
4889 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
4890 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
4891 auto epoch = epoch_pgs->first;
4892 auto& pgs = epoch_pgs->second;
4893 dout(20) << __func__ << " osd." << osd << " from " << next
4894 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
4895 last = epoch;
4896 for (auto& pg : pgs) {
4897 // Need the create time from the monitor using its clock to set
4898 // last_scrub_stamp upon pg creation.
4899 auto create = creating_pgs.pgs.find(pg.pgid);
4900 ceph_assert(create != creating_pgs.pgs.end());
4901 if (old) {
4902 if (!oldm) {
4903 oldm = new MOSDPGCreate(creating_pgs_epoch);
4904 }
4905 oldm->mkpg.emplace(pg.pgid,
4906 pg_create_t{create->second.create_epoch, pg.pgid, 0});
4907 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
4908 } else {
4909 if (!m) {
4910 m = new MOSDPGCreate2(creating_pgs_epoch);
4911 }
4912 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
4913 create->second.create_stamp));
4914 if (create->second.history.epoch_created) {
4915 dout(20) << __func__ << " " << pg << " " << create->second.history
4916 << " " << create->second.past_intervals << dendl;
4917 m->pg_extra.emplace(pg, make_pair(create->second.history,
4918 create->second.past_intervals));
4919 }
4920 }
4921 dout(20) << __func__ << " will create " << pg
4922 << " at " << create->second.create_epoch << dendl;
4923 }
4924 }
4925 if (m) {
4926 con->send_message(m);
4927 } else if (oldm) {
4928 con->send_message(oldm);
4929 } else {
4930 dout(20) << __func__ << " osd." << osd << " from " << next
4931 << " has nothing to send" << dendl;
4932 return next;
4933 }
4934
4935 // sub is current through last + 1
4936 return last + 1;
4937 }
4938
4939 // TICK
4940
4941
// Periodic maintenance, driven by the monitor's tick timer.  The first part
// (manifest reload, priority-cache autotuning) runs on every monitor; the
// rest (osd timeout checks, failure checks, auto-out, blacklist expiry, and
// the resulting map proposal) runs only on the leader.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance the iterator now: we may erase(o) at the bottom of the loop
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  // extend the grace period for osds with a history of lagginess
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// destroyed osds use a separate (typically shorter) out interval
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  // still within grace; keep tracking it
	  continue;
      }

      // reached when the osd is no longer down+in+markable, or just after we
      // marked it out above: stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
    propose_pending();
}
5106
5107 void OSDMonitor::_set_new_cache_sizes()
5108 {
5109 uint64_t cache_size = 0;
5110 int64_t inc_alloc = 0;
5111 int64_t full_alloc = 0;
5112 int64_t kv_alloc = 0;
5113
5114 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5115 cache_size = pcm->get_tuned_mem();
5116 inc_alloc = inc_cache->get_committed_size();
5117 full_alloc = full_cache->get_committed_size();
5118 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5119 }
5120
5121 inc_osd_cache.set_bytes(inc_alloc);
5122 full_osd_cache.set_bytes(full_alloc);
5123
5124 dout(1) << __func__ << " cache_size:" << cache_size
5125 << " inc_alloc: " << inc_alloc
5126 << " full_alloc: " << full_alloc
5127 << " kv_alloc: " << kv_alloc
5128 << dendl;
5129 }
5130
5131 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5132 std::map<int,utime_t> &last_osd_report)
5133 {
5134 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5135 if (now - mon->get_leader_since() < timeo) {
5136 // We haven't been the leader for long enough to consider OSD timeouts
5137 return false;
5138 }
5139
5140 int max_osd = osdmap.get_max_osd();
5141 bool new_down = false;
5142
5143 for (int i=0; i < max_osd; ++i) {
5144 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5145 if (!osdmap.exists(i)) {
5146 last_osd_report.erase(i); // if any
5147 continue;
5148 }
5149 if (!osdmap.is_up(i))
5150 continue;
5151 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5152 if (t == last_osd_report.end()) {
5153 // it wasn't in the map; start the timer.
5154 last_osd_report[i] = now;
5155 } else if (can_mark_down(i)) {
5156 utime_t diff = now - t->second;
5157 if (diff > timeo) {
5158 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5159 << diff << " seconds";
5160 derr << "no beacon from osd." << i << " since " << t->second
5161 << ", " << diff << " seconds ago. marking down" << dendl;
5162 pending_inc.new_state[i] = CEPH_OSD_UP;
5163 new_down = true;
5164 }
5165 }
5166 }
5167 return new_down;
5168 }
5169
5170 static void dump_cpu_list(Formatter *f, const char *name,
5171 const string& strlist)
5172 {
5173 cpu_set_t cpu_set;
5174 size_t cpu_set_size;
5175 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5176 return;
5177 }
5178 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5179 f->open_array_section(name);
5180 for (auto cpu : cpus) {
5181 f->dump_int("cpu", cpu);
5182 }
5183 f->close_section();
5184 }
5185
// Dump the monitor's osdmap-related state into the given formatter: the
// osdmap itself, per-osd metadata, clean-epoch tracking, committed-version
// bounds, the crush map, and (when loaded) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // per-osd metadata for every existing osd id; errors are ignored here
  // (NULL error stream)
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  // per-osd epoch as tracked in osd_epochs
  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only emitted when the osdmap manifest has been loaded from the store
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5234
namespace {
  // All settable/gettable per-pool properties recognized by
  // "osd pool get"; the string spellings live in the ALL_CHOICES map below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Set difference: the members of |first| that are absent from |second|.
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	remaining.insert(choice);
      }
    }
    return remaining;
  }
}
5268
5269
5270 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5271 {
5272 op->mark_osdmon_event(__func__);
5273 auto m = op->get_req<MMonCommand>();
5274 int r = 0;
5275 bufferlist rdata;
5276 stringstream ss, ds;
5277
5278 cmdmap_t cmdmap;
5279 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5280 string rs = ss.str();
5281 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5282 return true;
5283 }
5284
5285 MonSession *session = op->get_session();
5286 if (!session) {
5287 derr << __func__ << " no session" << dendl;
5288 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5289 return true;
5290 }
5291
5292 string prefix;
5293 cmd_getval(cmdmap, "prefix", prefix);
5294
5295 string format;
5296 cmd_getval(cmdmap, "format", format, string("plain"));
5297 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5298
5299 if (prefix == "osd stat") {
5300 if (f) {
5301 f->open_object_section("osdmap");
5302 osdmap.print_summary(f.get(), ds, "", true);
5303 f->close_section();
5304 f->flush(rdata);
5305 } else {
5306 osdmap.print_summary(nullptr, ds, "", true);
5307 rdata.append(ds);
5308 }
5309 }
5310 else if (prefix == "osd dump" ||
5311 prefix == "osd tree" ||
5312 prefix == "osd tree-from" ||
5313 prefix == "osd ls" ||
5314 prefix == "osd getmap" ||
5315 prefix == "osd getcrushmap" ||
5316 prefix == "osd ls-tree" ||
5317 prefix == "osd info") {
5318 string val;
5319
5320 epoch_t epoch = 0;
5321 int64_t epochnum;
5322 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5323 epoch = epochnum;
5324
5325 bufferlist osdmap_bl;
5326 int err = get_version_full(epoch, osdmap_bl);
5327 if (err == -ENOENT) {
5328 r = -ENOENT;
5329 ss << "there is no map for epoch " << epoch;
5330 goto reply;
5331 }
5332 ceph_assert(err == 0);
5333 ceph_assert(osdmap_bl.length());
5334
5335 OSDMap *p;
5336 if (epoch == osdmap.get_epoch()) {
5337 p = &osdmap;
5338 } else {
5339 p = new OSDMap;
5340 p->decode(osdmap_bl);
5341 }
5342
5343 auto sg = make_scope_guard([&] {
5344 if (p != &osdmap) {
5345 delete p;
5346 }
5347 });
5348
5349 if (prefix == "osd dump") {
5350 stringstream ds;
5351 if (f) {
5352 f->open_object_section("osdmap");
5353 p->dump(f.get());
5354 f->close_section();
5355 f->flush(ds);
5356 } else {
5357 p->print(ds);
5358 }
5359 rdata.append(ds);
5360 if (!f)
5361 ds << " ";
5362 } else if (prefix == "osd ls") {
5363 if (f) {
5364 f->open_array_section("osds");
5365 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5366 if (osdmap.exists(i)) {
5367 f->dump_int("osd", i);
5368 }
5369 }
5370 f->close_section();
5371 f->flush(ds);
5372 } else {
5373 bool first = true;
5374 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5375 if (osdmap.exists(i)) {
5376 if (!first)
5377 ds << "\n";
5378 first = false;
5379 ds << i;
5380 }
5381 }
5382 }
5383 rdata.append(ds);
5384 } else if (prefix == "osd info") {
5385 int64_t osd_id;
5386 bool do_single_osd = true;
5387 if (!cmd_getval(cmdmap, "id", osd_id)) {
5388 do_single_osd = false;
5389 }
5390
5391 if (do_single_osd && !osdmap.exists(osd_id)) {
5392 ss << "osd." << osd_id << " does not exist";
5393 r = -EINVAL;
5394 goto reply;
5395 }
5396
5397 if (f) {
5398 if (do_single_osd) {
5399 osdmap.dump_osd(osd_id, f.get());
5400 } else {
5401 osdmap.dump_osds(f.get());
5402 }
5403 f->flush(ds);
5404 } else {
5405 if (do_single_osd) {
5406 osdmap.print_osd(osd_id, ds);
5407 } else {
5408 osdmap.print_osds(ds);
5409 }
5410 }
5411 rdata.append(ds);
5412 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5413 string bucket;
5414 if (prefix == "osd tree-from") {
5415 cmd_getval(cmdmap, "bucket", bucket);
5416 if (!osdmap.crush->name_exists(bucket)) {
5417 ss << "bucket '" << bucket << "' does not exist";
5418 r = -ENOENT;
5419 goto reply;
5420 }
5421 int id = osdmap.crush->get_item_id(bucket);
5422 if (id >= 0) {
5423 ss << "\"" << bucket << "\" is not a bucket";
5424 r = -EINVAL;
5425 goto reply;
5426 }
5427 }
5428
5429 vector<string> states;
5430 cmd_getval(cmdmap, "states", states);
5431 unsigned filter = 0;
5432 for (auto& s : states) {
5433 if (s == "up") {
5434 filter |= OSDMap::DUMP_UP;
5435 } else if (s == "down") {
5436 filter |= OSDMap::DUMP_DOWN;
5437 } else if (s == "in") {
5438 filter |= OSDMap::DUMP_IN;
5439 } else if (s == "out") {
5440 filter |= OSDMap::DUMP_OUT;
5441 } else if (s == "destroyed") {
5442 filter |= OSDMap::DUMP_DESTROYED;
5443 } else {
5444 ss << "unrecognized state '" << s << "'";
5445 r = -EINVAL;
5446 goto reply;
5447 }
5448 }
5449 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5450 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5451 ss << "cannot specify both 'in' and 'out'";
5452 r = -EINVAL;
5453 goto reply;
5454 }
5455 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5456 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5457 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5458 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5459 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5460 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5461 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5462 r = -EINVAL;
5463 goto reply;
5464 }
5465 if (f) {
5466 f->open_object_section("tree");
5467 p->print_tree(f.get(), NULL, filter, bucket);
5468 f->close_section();
5469 f->flush(ds);
5470 } else {
5471 p->print_tree(NULL, &ds, filter, bucket);
5472 }
5473 rdata.append(ds);
5474 } else if (prefix == "osd getmap") {
5475 rdata.append(osdmap_bl);
5476 ss << "got osdmap epoch " << p->get_epoch();
5477 } else if (prefix == "osd getcrushmap") {
5478 p->crush->encode(rdata, mon->get_quorum_con_features());
5479 ss << p->get_crush_version();
5480 } else if (prefix == "osd ls-tree") {
5481 string bucket_name;
5482 cmd_getval(cmdmap, "name", bucket_name);
5483 set<int> osds;
5484 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5485 if (r == -ENOENT) {
5486 ss << "\"" << bucket_name << "\" does not exist";
5487 goto reply;
5488 } else if (r < 0) {
5489 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5490 goto reply;
5491 }
5492
5493 if (f) {
5494 f->open_array_section("osds");
5495 for (auto &i : osds) {
5496 if (osdmap.exists(i)) {
5497 f->dump_int("osd", i);
5498 }
5499 }
5500 f->close_section();
5501 f->flush(ds);
5502 } else {
5503 bool first = true;
5504 for (auto &i : osds) {
5505 if (osdmap.exists(i)) {
5506 if (!first)
5507 ds << "\n";
5508 first = false;
5509 ds << i;
5510 }
5511 }
5512 }
5513
5514 rdata.append(ds);
5515 }
5516 } else if (prefix == "osd getmaxosd") {
5517 if (f) {
5518 f->open_object_section("getmaxosd");
5519 f->dump_unsigned("epoch", osdmap.get_epoch());
5520 f->dump_int("max_osd", osdmap.get_max_osd());
5521 f->close_section();
5522 f->flush(rdata);
5523 } else {
5524 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5525 rdata.append(ds);
5526 }
5527 } else if (prefix == "osd utilization") {
5528 string out;
5529 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5530 if (f)
5531 f->flush(rdata);
5532 else
5533 rdata.append(out);
5534 r = 0;
5535 goto reply;
5536 } else if (prefix == "osd find") {
5537 int64_t osd;
5538 if (!cmd_getval(cmdmap, "id", osd)) {
5539 ss << "unable to parse osd id value '"
5540 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5541 r = -EINVAL;
5542 goto reply;
5543 }
5544 if (!osdmap.exists(osd)) {
5545 ss << "osd." << osd << " does not exist";
5546 r = -ENOENT;
5547 goto reply;
5548 }
5549 string format;
5550 cmd_getval(cmdmap, "format", format);
5551 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5552 f->open_object_section("osd_location");
5553 f->dump_int("osd", osd);
5554 f->dump_object("addrs", osdmap.get_addrs(osd));
5555 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5556
5557 // try to identify host, pod/container name, etc.
5558 map<string,string> m;
5559 load_metadata(osd, m, nullptr);
5560 if (auto p = m.find("hostname"); p != m.end()) {
5561 f->dump_string("host", p->second);
5562 }
5563 for (auto& k : {
5564 "pod_name", "pod_namespace", // set by rook
5565 "container_name" // set by cephadm, ceph-ansible
5566 }) {
5567 if (auto p = m.find(k); p != m.end()) {
5568 f->dump_string(k, p->second);
5569 }
5570 }
5571
5572 // crush is helpful too
5573 f->open_object_section("crush_location");
5574 map<string,string> loc = osdmap.crush->get_full_location(osd);
5575 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5576 f->dump_string(p->first.c_str(), p->second);
5577 f->close_section();
5578 f->close_section();
5579 f->flush(rdata);
5580 } else if (prefix == "osd metadata") {
5581 int64_t osd = -1;
5582 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5583 !cmd_getval(cmdmap, "id", osd)) {
5584 ss << "unable to parse osd id value '"
5585 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5586 r = -EINVAL;
5587 goto reply;
5588 }
5589 if (osd >= 0 && !osdmap.exists(osd)) {
5590 ss << "osd." << osd << " does not exist";
5591 r = -ENOENT;
5592 goto reply;
5593 }
5594 string format;
5595 cmd_getval(cmdmap, "format", format);
5596 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5597 if (osd >= 0) {
5598 f->open_object_section("osd_metadata");
5599 f->dump_unsigned("id", osd);
5600 r = dump_osd_metadata(osd, f.get(), &ss);
5601 if (r < 0)
5602 goto reply;
5603 f->close_section();
5604 } else {
5605 r = 0;
5606 f->open_array_section("osd_metadata");
5607 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5608 if (osdmap.exists(i)) {
5609 f->open_object_section("osd");
5610 f->dump_unsigned("id", i);
5611 r = dump_osd_metadata(i, f.get(), NULL);
5612 if (r == -EINVAL || r == -ENOENT) {
5613 // Drop error, continue to get other daemons' metadata
5614 dout(4) << "No metadata for osd." << i << dendl;
5615 r = 0;
5616 } else if (r < 0) {
5617 // Unexpected error
5618 goto reply;
5619 }
5620 f->close_section();
5621 }
5622 }
5623 f->close_section();
5624 }
5625 f->flush(rdata);
5626 } else if (prefix == "osd versions") {
5627 if (!f)
5628 f.reset(Formatter::create("json-pretty"));
5629 count_metadata("ceph_version", f.get());
5630 f->flush(rdata);
5631 r = 0;
5632 } else if (prefix == "osd count-metadata") {
5633 if (!f)
5634 f.reset(Formatter::create("json-pretty"));
5635 string field;
5636 cmd_getval(cmdmap, "property", field);
5637 count_metadata(field, f.get());
5638 f->flush(rdata);
5639 r = 0;
5640 } else if (prefix == "osd numa-status") {
5641 TextTable tbl;
5642 if (f) {
5643 f->open_array_section("osds");
5644 } else {
5645 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5646 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5647 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5648 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5649 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5650 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5651 }
5652 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5653 if (osdmap.exists(i)) {
5654 map<string,string> m;
5655 ostringstream err;
5656 if (load_metadata(i, m, &err) < 0) {
5657 continue;
5658 }
5659 string host;
5660 auto p = m.find("hostname");
5661 if (p != m.end()) {
5662 host = p->second;
5663 }
5664 if (f) {
5665 f->open_object_section("osd");
5666 f->dump_int("osd", i);
5667 f->dump_string("host", host);
5668 for (auto n : { "network_numa_node", "objectstore_numa_node",
5669 "numa_node" }) {
5670 p = m.find(n);
5671 if (p != m.end()) {
5672 f->dump_int(n, atoi(p->second.c_str()));
5673 }
5674 }
5675 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5676 p = m.find(n);
5677 if (p != m.end()) {
5678 list<string> ls = get_str_list(p->second, ",");
5679 f->open_array_section(n);
5680 for (auto node : ls) {
5681 f->dump_int("node", atoi(node.c_str()));
5682 }
5683 f->close_section();
5684 }
5685 }
5686 for (auto n : { "numa_node_cpus" }) {
5687 p = m.find(n);
5688 if (p != m.end()) {
5689 dump_cpu_list(f.get(), n, p->second);
5690 }
5691 }
5692 f->close_section();
5693 } else {
5694 tbl << i;
5695 tbl << host;
5696 p = m.find("network_numa_nodes");
5697 if (p != m.end()) {
5698 tbl << p->second;
5699 } else {
5700 tbl << "-";
5701 }
5702 p = m.find("objectstore_numa_nodes");
5703 if (p != m.end()) {
5704 tbl << p->second;
5705 } else {
5706 tbl << "-";
5707 }
5708 p = m.find("numa_node");
5709 auto q = m.find("numa_node_cpus");
5710 if (p != m.end() && q != m.end()) {
5711 tbl << p->second;
5712 tbl << q->second;
5713 } else {
5714 tbl << "-";
5715 tbl << "-";
5716 }
5717 tbl << TextTable::endrow;
5718 }
5719 }
5720 }
5721 if (f) {
5722 f->close_section();
5723 f->flush(rdata);
5724 } else {
5725 rdata.append(stringify(tbl));
5726 }
5727 } else if (prefix == "osd map") {
5728 string poolstr, objstr, namespacestr;
5729 cmd_getval(cmdmap, "pool", poolstr);
5730 cmd_getval(cmdmap, "object", objstr);
5731 cmd_getval(cmdmap, "nspace", namespacestr);
5732
5733 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5734 if (pool < 0) {
5735 ss << "pool " << poolstr << " does not exist";
5736 r = -ENOENT;
5737 goto reply;
5738 }
5739 object_locator_t oloc(pool, namespacestr);
5740 object_t oid(objstr);
5741 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5742 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5743 vector<int> up, acting;
5744 int up_p, acting_p;
5745 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5746
5747 string fullobjname;
5748 if (!namespacestr.empty())
5749 fullobjname = namespacestr + string("/") + oid.name;
5750 else
5751 fullobjname = oid.name;
5752 if (f) {
5753 f->open_object_section("osd_map");
5754 f->dump_unsigned("epoch", osdmap.get_epoch());
5755 f->dump_string("pool", poolstr);
5756 f->dump_int("pool_id", pool);
5757 f->dump_stream("objname") << fullobjname;
5758 f->dump_stream("raw_pgid") << pgid;
5759 f->dump_stream("pgid") << mpgid;
5760 f->open_array_section("up");
5761 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5762 f->dump_int("osd", *p);
5763 f->close_section();
5764 f->dump_int("up_primary", up_p);
5765 f->open_array_section("acting");
5766 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5767 f->dump_int("osd", *p);
5768 f->close_section();
5769 f->dump_int("acting_primary", acting_p);
5770 f->close_section(); // osd_map
5771 f->flush(rdata);
5772 } else {
5773 ds << "osdmap e" << osdmap.get_epoch()
5774 << " pool '" << poolstr << "' (" << pool << ")"
5775 << " object '" << fullobjname << "' ->"
5776 << " pg " << pgid << " (" << mpgid << ")"
5777 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5778 << pg_vector_string(acting) << ", p" << acting_p << ")";
5779 rdata.append(ds);
5780 }
5781
5782 } else if (prefix == "pg map") {
5783 pg_t pgid;
5784 string pgidstr;
5785 cmd_getval(cmdmap, "pgid", pgidstr);
5786 if (!pgid.parse(pgidstr.c_str())) {
5787 ss << "invalid pgid '" << pgidstr << "'";
5788 r = -EINVAL;
5789 goto reply;
5790 }
5791 vector<int> up, acting;
5792 if (!osdmap.have_pg_pool(pgid.pool())) {
5793 ss << "pg '" << pgidstr << "' does not exist";
5794 r = -ENOENT;
5795 goto reply;
5796 }
5797 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5798 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5799 if (f) {
5800 f->open_object_section("pg_map");
5801 f->dump_unsigned("epoch", osdmap.get_epoch());
5802 f->dump_stream("raw_pgid") << pgid;
5803 f->dump_stream("pgid") << mpgid;
5804 f->open_array_section("up");
5805 for (auto osd : up) {
5806 f->dump_int("up_osd", osd);
5807 }
5808 f->close_section();
5809 f->open_array_section("acting");
5810 for (auto osd : acting) {
5811 f->dump_int("acting_osd", osd);
5812 }
5813 f->close_section();
5814 f->close_section();
5815 f->flush(rdata);
5816 } else {
5817 ds << "osdmap e" << osdmap.get_epoch()
5818 << " pg " << pgid << " (" << mpgid << ")"
5819 << " -> up " << up << " acting " << acting;
5820 rdata.append(ds);
5821 }
5822 goto reply;
5823
5824 } else if (prefix == "osd lspools") {
5825 if (f)
5826 f->open_array_section("pools");
5827 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5828 p != osdmap.pools.end();
5829 ++p) {
5830 if (f) {
5831 f->open_object_section("pool");
5832 f->dump_int("poolnum", p->first);
5833 f->dump_string("poolname", osdmap.pool_name[p->first]);
5834 f->close_section();
5835 } else {
5836 ds << p->first << ' ' << osdmap.pool_name[p->first];
5837 if (next(p) != osdmap.pools.end()) {
5838 ds << '\n';
5839 }
5840 }
5841 }
5842 if (f) {
5843 f->close_section();
5844 f->flush(ds);
5845 }
5846 rdata.append(ds);
5847 } else if (prefix == "osd blacklist ls") {
5848 if (f)
5849 f->open_array_section("blacklist");
5850
5851 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5852 p != osdmap.blacklist.end();
5853 ++p) {
5854 if (f) {
5855 f->open_object_section("entry");
5856 f->dump_string("addr", p->first.get_legacy_str());
5857 f->dump_stream("until") << p->second;
5858 f->close_section();
5859 } else {
5860 stringstream ss;
5861 string s;
5862 ss << p->first << " " << p->second;
5863 getline(ss, s);
5864 s += "\n";
5865 rdata.append(s);
5866 }
5867 }
5868 if (f) {
5869 f->close_section();
5870 f->flush(rdata);
5871 }
5872 ss << "listed " << osdmap.blacklist.size() << " entries";
5873
5874 } else if (prefix == "osd pool ls") {
5875 string detail;
5876 cmd_getval(cmdmap, "detail", detail);
5877 if (!f && detail == "detail") {
5878 ostringstream ss;
5879 osdmap.print_pools(ss);
5880 rdata.append(ss.str());
5881 } else {
5882 if (f)
5883 f->open_array_section("pools");
5884 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5885 it != osdmap.get_pools().end();
5886 ++it) {
5887 if (f) {
5888 if (detail == "detail") {
5889 f->open_object_section("pool");
5890 f->dump_int("pool_id", it->first);
5891 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5892 it->second.dump(f.get());
5893 f->close_section();
5894 } else {
5895 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5896 }
5897 } else {
5898 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5899 }
5900 }
5901 if (f) {
5902 f->close_section();
5903 f->flush(rdata);
5904 }
5905 }
5906
5907 } else if (prefix == "osd crush get-tunable") {
5908 string tunable;
5909 cmd_getval(cmdmap, "tunable", tunable);
5910 ostringstream rss;
5911 if (f)
5912 f->open_object_section("tunable");
5913 if (tunable == "straw_calc_version") {
5914 if (f)
5915 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5916 else
5917 rss << osdmap.crush->get_straw_calc_version() << "\n";
5918 } else {
5919 r = -EINVAL;
5920 goto reply;
5921 }
5922 if (f) {
5923 f->close_section();
5924 f->flush(rdata);
5925 } else {
5926 rdata.append(rss.str());
5927 }
5928 r = 0;
5929
5930 } else if (prefix == "osd pool get") {
5931 string poolstr;
5932 cmd_getval(cmdmap, "pool", poolstr);
5933 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5934 if (pool < 0) {
5935 ss << "unrecognized pool '" << poolstr << "'";
5936 r = -ENOENT;
5937 goto reply;
5938 }
5939
5940 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5941 string var;
5942 cmd_getval(cmdmap, "var", var);
5943
5944 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5945 const choices_map_t ALL_CHOICES = {
5946 {"size", SIZE},
5947 {"min_size", MIN_SIZE},
5948 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
5949 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5950 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
5951 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5952 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5953 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5954 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5955 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5956 {"use_gmt_hitset", USE_GMT_HITSET},
5957 {"target_max_objects", TARGET_MAX_OBJECTS},
5958 {"target_max_bytes", TARGET_MAX_BYTES},
5959 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
5960 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
5961 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
5962 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
5963 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
5964 {"erasure_code_profile", ERASURE_CODE_PROFILE},
5965 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
5966 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
5967 {"fast_read", FAST_READ},
5968 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
5969 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
5970 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
5971 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
5972 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
5973 {"recovery_priority", RECOVERY_PRIORITY},
5974 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
5975 {"scrub_priority", SCRUB_PRIORITY},
5976 {"compression_mode", COMPRESSION_MODE},
5977 {"compression_algorithm", COMPRESSION_ALGORITHM},
5978 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
5979 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
5980 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
5981 {"csum_type", CSUM_TYPE},
5982 {"csum_max_block", CSUM_MAX_BLOCK},
5983 {"csum_min_block", CSUM_MIN_BLOCK},
5984 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
5985 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
5986 {"pg_num_min", PG_NUM_MIN},
5987 {"target_size_bytes", TARGET_SIZE_BYTES},
5988 {"target_size_ratio", TARGET_SIZE_RATIO},
5989 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
5990 };
5991
5992 typedef std::set<osd_pool_get_choices> choices_set_t;
5993
5994 const choices_set_t ONLY_TIER_CHOICES = {
5995 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5996 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
5997 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5998 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5999 MIN_READ_RECENCY_FOR_PROMOTE,
6000 MIN_WRITE_RECENCY_FOR_PROMOTE,
6001 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6002 };
6003 const choices_set_t ONLY_ERASURE_CHOICES = {
6004 EC_OVERWRITES, ERASURE_CODE_PROFILE
6005 };
6006
6007 choices_set_t selected_choices;
6008 if (var == "all") {
6009 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6010 it != ALL_CHOICES.end(); ++it) {
6011 selected_choices.insert(it->second);
6012 }
6013
6014 if(!p->is_tier()) {
6015 selected_choices = subtract_second_from_first(selected_choices,
6016 ONLY_TIER_CHOICES);
6017 }
6018
6019 if(!p->is_erasure()) {
6020 selected_choices = subtract_second_from_first(selected_choices,
6021 ONLY_ERASURE_CHOICES);
6022 }
6023 } else /* var != "all" */ {
6024 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6025 osd_pool_get_choices selected = found->second;
6026
6027 if (!p->is_tier() &&
6028 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6029 ss << "pool '" << poolstr
6030 << "' is not a tier pool: variable not applicable";
6031 r = -EACCES;
6032 goto reply;
6033 }
6034
6035 if (!p->is_erasure() &&
6036 ONLY_ERASURE_CHOICES.find(selected)
6037 != ONLY_ERASURE_CHOICES.end()) {
6038 ss << "pool '" << poolstr
6039 << "' is not a erasure pool: variable not applicable";
6040 r = -EACCES;
6041 goto reply;
6042 }
6043
6044 if (pool_opts_t::is_opt_name(var) &&
6045 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6046 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6047 r = -ENOENT;
6048 goto reply;
6049 }
6050
6051 selected_choices.insert(selected);
6052 }
6053
6054 if (f) {
6055 f->open_object_section("pool");
6056 f->dump_string("pool", poolstr);
6057 f->dump_int("pool_id", pool);
6058 for(choices_set_t::const_iterator it = selected_choices.begin();
6059 it != selected_choices.end(); ++it) {
6060 choices_map_t::const_iterator i;
6061 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6062 if (i->second == *it) {
6063 break;
6064 }
6065 }
6066 ceph_assert(i != ALL_CHOICES.end());
6067 switch(*it) {
6068 case PG_NUM:
6069 f->dump_int("pg_num", p->get_pg_num());
6070 break;
6071 case PGP_NUM:
6072 f->dump_int("pgp_num", p->get_pgp_num());
6073 break;
6074 case SIZE:
6075 f->dump_int("size", p->get_size());
6076 break;
6077 case MIN_SIZE:
6078 f->dump_int("min_size", p->get_min_size());
6079 break;
6080 case CRUSH_RULE:
6081 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6082 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6083 p->get_crush_rule()));
6084 } else {
6085 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6086 }
6087 break;
6088 case EC_OVERWRITES:
6089 f->dump_bool("allow_ec_overwrites",
6090 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6091 break;
6092 case PG_AUTOSCALE_MODE:
6093 f->dump_string("pg_autoscale_mode",
6094 pg_pool_t::get_pg_autoscale_mode_name(
6095 p->pg_autoscale_mode));
6096 break;
6097 case HASHPSPOOL:
6098 case NODELETE:
6099 case NOPGCHANGE:
6100 case NOSIZECHANGE:
6101 case WRITE_FADVISE_DONTNEED:
6102 case NOSCRUB:
6103 case NODEEP_SCRUB:
6104 f->dump_bool(i->first.c_str(),
6105 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6106 break;
6107 case HIT_SET_PERIOD:
6108 f->dump_int("hit_set_period", p->hit_set_period);
6109 break;
6110 case HIT_SET_COUNT:
6111 f->dump_int("hit_set_count", p->hit_set_count);
6112 break;
6113 case HIT_SET_TYPE:
6114 f->dump_string("hit_set_type",
6115 HitSet::get_type_name(p->hit_set_params.get_type()));
6116 break;
6117 case HIT_SET_FPP:
6118 {
6119 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6120 BloomHitSet::Params *bloomp =
6121 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6122 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6123 } else if(var != "all") {
6124 f->close_section();
6125 ss << "hit set is not of type Bloom; " <<
6126 "invalid to get a false positive rate!";
6127 r = -EINVAL;
6128 goto reply;
6129 }
6130 }
6131 break;
6132 case USE_GMT_HITSET:
6133 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6134 break;
6135 case TARGET_MAX_OBJECTS:
6136 f->dump_unsigned("target_max_objects", p->target_max_objects);
6137 break;
6138 case TARGET_MAX_BYTES:
6139 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6140 break;
6141 case CACHE_TARGET_DIRTY_RATIO:
6142 f->dump_unsigned("cache_target_dirty_ratio_micro",
6143 p->cache_target_dirty_ratio_micro);
6144 f->dump_float("cache_target_dirty_ratio",
6145 ((float)p->cache_target_dirty_ratio_micro/1000000));
6146 break;
6147 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6148 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6149 p->cache_target_dirty_high_ratio_micro);
6150 f->dump_float("cache_target_dirty_high_ratio",
6151 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6152 break;
6153 case CACHE_TARGET_FULL_RATIO:
6154 f->dump_unsigned("cache_target_full_ratio_micro",
6155 p->cache_target_full_ratio_micro);
6156 f->dump_float("cache_target_full_ratio",
6157 ((float)p->cache_target_full_ratio_micro/1000000));
6158 break;
6159 case CACHE_MIN_FLUSH_AGE:
6160 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6161 break;
6162 case CACHE_MIN_EVICT_AGE:
6163 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6164 break;
6165 case ERASURE_CODE_PROFILE:
6166 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6167 break;
6168 case MIN_READ_RECENCY_FOR_PROMOTE:
6169 f->dump_int("min_read_recency_for_promote",
6170 p->min_read_recency_for_promote);
6171 break;
6172 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6173 f->dump_int("min_write_recency_for_promote",
6174 p->min_write_recency_for_promote);
6175 break;
6176 case FAST_READ:
6177 f->dump_int("fast_read", p->fast_read);
6178 break;
6179 case HIT_SET_GRADE_DECAY_RATE:
6180 f->dump_int("hit_set_grade_decay_rate",
6181 p->hit_set_grade_decay_rate);
6182 break;
6183 case HIT_SET_SEARCH_LAST_N:
6184 f->dump_int("hit_set_search_last_n",
6185 p->hit_set_search_last_n);
6186 break;
6187 case SCRUB_MIN_INTERVAL:
6188 case SCRUB_MAX_INTERVAL:
6189 case DEEP_SCRUB_INTERVAL:
6190 case RECOVERY_PRIORITY:
6191 case RECOVERY_OP_PRIORITY:
6192 case SCRUB_PRIORITY:
6193 case COMPRESSION_MODE:
6194 case COMPRESSION_ALGORITHM:
6195 case COMPRESSION_REQUIRED_RATIO:
6196 case COMPRESSION_MAX_BLOB_SIZE:
6197 case COMPRESSION_MIN_BLOB_SIZE:
6198 case CSUM_TYPE:
6199 case CSUM_MAX_BLOCK:
6200 case CSUM_MIN_BLOCK:
6201 case FINGERPRINT_ALGORITHM:
6202 case PG_NUM_MIN:
6203 case TARGET_SIZE_BYTES:
6204 case TARGET_SIZE_RATIO:
6205 case PG_AUTOSCALE_BIAS:
6206 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6207 if (p->opts.is_set(key)) {
6208 if(*it == CSUM_TYPE) {
6209 int64_t val;
6210 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6211 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6212 } else {
6213 p->opts.dump(i->first, f.get());
6214 }
6215 }
6216 break;
6217 }
6218 }
6219 f->close_section();
6220 f->flush(rdata);
6221 } else /* !f */ {
6222 for(choices_set_t::const_iterator it = selected_choices.begin();
6223 it != selected_choices.end(); ++it) {
6224 choices_map_t::const_iterator i;
6225 switch(*it) {
6226 case PG_NUM:
6227 ss << "pg_num: " << p->get_pg_num() << "\n";
6228 break;
6229 case PGP_NUM:
6230 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6231 break;
6232 case SIZE:
6233 ss << "size: " << p->get_size() << "\n";
6234 break;
6235 case MIN_SIZE:
6236 ss << "min_size: " << p->get_min_size() << "\n";
6237 break;
6238 case CRUSH_RULE:
6239 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6240 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6241 p->get_crush_rule()) << "\n";
6242 } else {
6243 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6244 }
6245 break;
6246 case PG_AUTOSCALE_MODE:
6247 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6248 p->pg_autoscale_mode) <<"\n";
6249 break;
6250 case HIT_SET_PERIOD:
6251 ss << "hit_set_period: " << p->hit_set_period << "\n";
6252 break;
6253 case HIT_SET_COUNT:
6254 ss << "hit_set_count: " << p->hit_set_count << "\n";
6255 break;
6256 case HIT_SET_TYPE:
6257 ss << "hit_set_type: " <<
6258 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6259 break;
6260 case HIT_SET_FPP:
6261 {
6262 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6263 BloomHitSet::Params *bloomp =
6264 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6265 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6266 } else if(var != "all") {
6267 ss << "hit set is not of type Bloom; " <<
6268 "invalid to get a false positive rate!";
6269 r = -EINVAL;
6270 goto reply;
6271 }
6272 }
6273 break;
6274 case USE_GMT_HITSET:
6275 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6276 break;
6277 case TARGET_MAX_OBJECTS:
6278 ss << "target_max_objects: " << p->target_max_objects << "\n";
6279 break;
6280 case TARGET_MAX_BYTES:
6281 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6282 break;
6283 case CACHE_TARGET_DIRTY_RATIO:
6284 ss << "cache_target_dirty_ratio: "
6285 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6286 break;
6287 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6288 ss << "cache_target_dirty_high_ratio: "
6289 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6290 break;
6291 case CACHE_TARGET_FULL_RATIO:
6292 ss << "cache_target_full_ratio: "
6293 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6294 break;
6295 case CACHE_MIN_FLUSH_AGE:
6296 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6297 break;
6298 case CACHE_MIN_EVICT_AGE:
6299 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6300 break;
6301 case ERASURE_CODE_PROFILE:
6302 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6303 break;
6304 case MIN_READ_RECENCY_FOR_PROMOTE:
6305 ss << "min_read_recency_for_promote: " <<
6306 p->min_read_recency_for_promote << "\n";
6307 break;
6308 case HIT_SET_GRADE_DECAY_RATE:
6309 ss << "hit_set_grade_decay_rate: " <<
6310 p->hit_set_grade_decay_rate << "\n";
6311 break;
6312 case HIT_SET_SEARCH_LAST_N:
6313 ss << "hit_set_search_last_n: " <<
6314 p->hit_set_search_last_n << "\n";
6315 break;
6316 case EC_OVERWRITES:
6317 ss << "allow_ec_overwrites: " <<
6318 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6319 "\n";
6320 break;
6321 case HASHPSPOOL:
6322 case NODELETE:
6323 case NOPGCHANGE:
6324 case NOSIZECHANGE:
6325 case WRITE_FADVISE_DONTNEED:
6326 case NOSCRUB:
6327 case NODEEP_SCRUB:
6328 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6329 if (i->second == *it)
6330 break;
6331 }
6332 ceph_assert(i != ALL_CHOICES.end());
6333 ss << i->first << ": " <<
6334 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6335 "true" : "false") << "\n";
6336 break;
6337 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6338 ss << "min_write_recency_for_promote: " <<
6339 p->min_write_recency_for_promote << "\n";
6340 break;
6341 case FAST_READ:
6342 ss << "fast_read: " << p->fast_read << "\n";
6343 break;
6344 case SCRUB_MIN_INTERVAL:
6345 case SCRUB_MAX_INTERVAL:
6346 case DEEP_SCRUB_INTERVAL:
6347 case RECOVERY_PRIORITY:
6348 case RECOVERY_OP_PRIORITY:
6349 case SCRUB_PRIORITY:
6350 case COMPRESSION_MODE:
6351 case COMPRESSION_ALGORITHM:
6352 case COMPRESSION_REQUIRED_RATIO:
6353 case COMPRESSION_MAX_BLOB_SIZE:
6354 case COMPRESSION_MIN_BLOB_SIZE:
6355 case CSUM_TYPE:
6356 case CSUM_MAX_BLOCK:
6357 case CSUM_MIN_BLOCK:
6358 case FINGERPRINT_ALGORITHM:
6359 case PG_NUM_MIN:
6360 case TARGET_SIZE_BYTES:
6361 case TARGET_SIZE_RATIO:
6362 case PG_AUTOSCALE_BIAS:
6363 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6364 if (i->second == *it)
6365 break;
6366 }
6367 ceph_assert(i != ALL_CHOICES.end());
6368 {
6369 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6370 if (p->opts.is_set(key)) {
6371 if(key == pool_opts_t::CSUM_TYPE) {
6372 int64_t val;
6373 p->opts.get(key, &val);
6374 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6375 } else {
6376 ss << i->first << ": " << p->opts.get(key) << "\n";
6377 }
6378 }
6379 }
6380 break;
6381 }
6382 rdata.append(ss.str());
6383 ss.str("");
6384 }
6385 }
6386 r = 0;
6387 } else if (prefix == "osd pool get-quota") {
6388 string pool_name;
6389 cmd_getval(cmdmap, "pool", pool_name);
6390
6391 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6392 if (poolid < 0) {
6393 ceph_assert(poolid == -ENOENT);
6394 ss << "unrecognized pool '" << pool_name << "'";
6395 r = -ENOENT;
6396 goto reply;
6397 }
6398 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6399 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6400 const object_stat_sum_t& sum = pstat->stats.sum;
6401 if (f) {
6402 f->open_object_section("pool_quotas");
6403 f->dump_string("pool_name", pool_name);
6404 f->dump_unsigned("pool_id", poolid);
6405 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6406 f->dump_int("current_num_objects", sum.num_objects);
6407 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6408 f->dump_int("current_num_bytes", sum.num_bytes);
6409 f->close_section();
6410 f->flush(rdata);
6411 } else {
6412 stringstream rs;
6413 rs << "quotas for pool '" << pool_name << "':\n"
6414 << " max objects: ";
6415 if (p->quota_max_objects == 0)
6416 rs << "N/A";
6417 else {
6418 rs << si_u_t(p->quota_max_objects) << " objects";
6419 rs << " (current num objects: " << sum.num_objects << " objects)";
6420 }
6421 rs << "\n"
6422 << " max bytes : ";
6423 if (p->quota_max_bytes == 0)
6424 rs << "N/A";
6425 else {
6426 rs << byte_u_t(p->quota_max_bytes);
6427 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6428 }
6429 rdata.append(rs.str());
6430 }
6431 rdata.append("\n");
6432 r = 0;
6433 } else if (prefix == "osd crush rule list" ||
6434 prefix == "osd crush rule ls") {
6435 if (f) {
6436 f->open_array_section("rules");
6437 osdmap.crush->list_rules(f.get());
6438 f->close_section();
6439 f->flush(rdata);
6440 } else {
6441 ostringstream ss;
6442 osdmap.crush->list_rules(&ss);
6443 rdata.append(ss.str());
6444 }
6445 } else if (prefix == "osd crush rule ls-by-class") {
6446 string class_name;
6447 cmd_getval(cmdmap, "class", class_name);
6448 if (class_name.empty()) {
6449 ss << "no class specified";
6450 r = -EINVAL;
6451 goto reply;
6452 }
6453 set<int> rules;
6454 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6455 if (r < 0) {
6456 ss << "failed to get rules by class '" << class_name << "'";
6457 goto reply;
6458 }
6459 if (f) {
6460 f->open_array_section("rules");
6461 for (auto &rule: rules) {
6462 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6463 }
6464 f->close_section();
6465 f->flush(rdata);
6466 } else {
6467 ostringstream rs;
6468 for (auto &rule: rules) {
6469 rs << osdmap.crush->get_rule_name(rule) << "\n";
6470 }
6471 rdata.append(rs.str());
6472 }
6473 } else if (prefix == "osd crush rule dump") {
6474 string name;
6475 cmd_getval(cmdmap, "name", name);
6476 string format;
6477 cmd_getval(cmdmap, "format", format);
6478 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6479 if (name == "") {
6480 f->open_array_section("rules");
6481 osdmap.crush->dump_rules(f.get());
6482 f->close_section();
6483 } else {
6484 int ruleno = osdmap.crush->get_rule_id(name);
6485 if (ruleno < 0) {
6486 ss << "unknown crush rule '" << name << "'";
6487 r = ruleno;
6488 goto reply;
6489 }
6490 osdmap.crush->dump_rule(ruleno, f.get());
6491 }
6492 ostringstream rs;
6493 f->flush(rs);
6494 rs << "\n";
6495 rdata.append(rs.str());
6496 } else if (prefix == "osd crush dump") {
6497 string format;
6498 cmd_getval(cmdmap, "format", format);
6499 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6500 f->open_object_section("crush_map");
6501 osdmap.crush->dump(f.get());
6502 f->close_section();
6503 ostringstream rs;
6504 f->flush(rs);
6505 rs << "\n";
6506 rdata.append(rs.str());
6507 } else if (prefix == "osd crush show-tunables") {
6508 string format;
6509 cmd_getval(cmdmap, "format", format);
6510 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6511 f->open_object_section("crush_map_tunables");
6512 osdmap.crush->dump_tunables(f.get());
6513 f->close_section();
6514 ostringstream rs;
6515 f->flush(rs);
6516 rs << "\n";
6517 rdata.append(rs.str());
6518 } else if (prefix == "osd crush tree") {
6519 string shadow;
6520 cmd_getval(cmdmap, "shadow", shadow);
6521 bool show_shadow = shadow == "--show-shadow";
6522 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6523 if (f) {
6524 f->open_object_section("crush_tree");
6525 osdmap.crush->dump_tree(nullptr,
6526 f.get(),
6527 osdmap.get_pool_names(),
6528 show_shadow);
6529 f->close_section();
6530 f->flush(rdata);
6531 } else {
6532 ostringstream ss;
6533 osdmap.crush->dump_tree(&ss,
6534 nullptr,
6535 osdmap.get_pool_names(),
6536 show_shadow);
6537 rdata.append(ss.str());
6538 }
6539 } else if (prefix == "osd crush ls") {
6540 string name;
6541 if (!cmd_getval(cmdmap, "node", name)) {
6542 ss << "no node specified";
6543 r = -EINVAL;
6544 goto reply;
6545 }
6546 if (!osdmap.crush->name_exists(name)) {
6547 ss << "node '" << name << "' does not exist";
6548 r = -ENOENT;
6549 goto reply;
6550 }
6551 int id = osdmap.crush->get_item_id(name);
6552 list<int> result;
6553 if (id >= 0) {
6554 result.push_back(id);
6555 } else {
6556 int num = osdmap.crush->get_bucket_size(id);
6557 for (int i = 0; i < num; ++i) {
6558 result.push_back(osdmap.crush->get_bucket_item(id, i));
6559 }
6560 }
6561 if (f) {
6562 f->open_array_section("items");
6563 for (auto i : result) {
6564 f->dump_string("item", osdmap.crush->get_item_name(i));
6565 }
6566 f->close_section();
6567 f->flush(rdata);
6568 } else {
6569 ostringstream ss;
6570 for (auto i : result) {
6571 ss << osdmap.crush->get_item_name(i) << "\n";
6572 }
6573 rdata.append(ss.str());
6574 }
6575 r = 0;
6576 } else if (prefix == "osd crush class ls") {
6577 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6578 f->open_array_section("crush_classes");
6579 for (auto i : osdmap.crush->class_name)
6580 f->dump_string("class", i.second);
6581 f->close_section();
6582 f->flush(rdata);
6583 } else if (prefix == "osd crush class ls-osd") {
6584 string name;
6585 cmd_getval(cmdmap, "class", name);
6586 set<int> osds;
6587 osdmap.crush->get_devices_by_class(name, &osds);
6588 if (f) {
6589 f->open_array_section("osds");
6590 for (auto &osd: osds)
6591 f->dump_int("osd", osd);
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 bool first = true;
6596 for (auto &osd : osds) {
6597 if (!first)
6598 ds << "\n";
6599 first = false;
6600 ds << osd;
6601 }
6602 rdata.append(ds);
6603 }
6604 } else if (prefix == "osd crush get-device-class") {
6605 vector<string> idvec;
6606 cmd_getval(cmdmap, "ids", idvec);
6607 map<int, string> class_by_osd;
6608 for (auto& id : idvec) {
6609 ostringstream ts;
6610 long osd = parse_osd_id(id.c_str(), &ts);
6611 if (osd < 0) {
6612 ss << "unable to parse osd id:'" << id << "'";
6613 r = -EINVAL;
6614 goto reply;
6615 }
6616 auto device_class = osdmap.crush->get_item_class(osd);
6617 if (device_class)
6618 class_by_osd[osd] = device_class;
6619 else
6620 class_by_osd[osd] = ""; // no class
6621 }
6622 if (f) {
6623 f->open_array_section("osd_device_classes");
6624 for (auto& i : class_by_osd) {
6625 f->open_object_section("osd_device_class");
6626 f->dump_int("osd", i.first);
6627 f->dump_string("device_class", i.second);
6628 f->close_section();
6629 }
6630 f->close_section();
6631 f->flush(rdata);
6632 } else {
6633 if (class_by_osd.size() == 1) {
6634 // for single input, make a clean output
6635 ds << class_by_osd.begin()->second;
6636 } else {
6637 // note that we do not group osds by class here
6638 for (auto it = class_by_osd.begin();
6639 it != class_by_osd.end();
6640 it++) {
6641 ds << "osd." << it->first << ' ' << it->second;
6642 if (next(it) != class_by_osd.end())
6643 ds << '\n';
6644 }
6645 }
6646 rdata.append(ds);
6647 }
6648 } else if (prefix == "osd erasure-code-profile ls") {
6649 const auto &profiles = osdmap.get_erasure_code_profiles();
6650 if (f)
6651 f->open_array_section("erasure-code-profiles");
6652 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6653 if (f)
6654 f->dump_string("profile", i->first.c_str());
6655 else
6656 rdata.append(i->first + "\n");
6657 }
6658 if (f) {
6659 f->close_section();
6660 ostringstream rs;
6661 f->flush(rs);
6662 rs << "\n";
6663 rdata.append(rs.str());
6664 }
6665 } else if (prefix == "osd crush weight-set ls") {
6666 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6667 if (f) {
6668 f->open_array_section("weight_sets");
6669 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6670 f->dump_string("pool", "(compat)");
6671 }
6672 for (auto& i : osdmap.crush->choose_args) {
6673 if (i.first >= 0) {
6674 f->dump_string("pool", osdmap.get_pool_name(i.first));
6675 }
6676 }
6677 f->close_section();
6678 f->flush(rdata);
6679 } else {
6680 ostringstream rs;
6681 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6682 rs << "(compat)\n";
6683 }
6684 for (auto& i : osdmap.crush->choose_args) {
6685 if (i.first >= 0) {
6686 rs << osdmap.get_pool_name(i.first) << "\n";
6687 }
6688 }
6689 rdata.append(rs.str());
6690 }
6691 } else if (prefix == "osd crush weight-set dump") {
6692 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6693 "json-pretty"));
6694 osdmap.crush->dump_choose_args(f.get());
6695 f->flush(rdata);
6696 } else if (prefix == "osd erasure-code-profile get") {
6697 string name;
6698 cmd_getval(cmdmap, "name", name);
6699 if (!osdmap.has_erasure_code_profile(name)) {
6700 ss << "unknown erasure code profile '" << name << "'";
6701 r = -ENOENT;
6702 goto reply;
6703 }
6704 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6705 if (f)
6706 f->open_object_section("profile");
6707 for (map<string,string>::const_iterator i = profile.begin();
6708 i != profile.end();
6709 ++i) {
6710 if (f)
6711 f->dump_string(i->first.c_str(), i->second.c_str());
6712 else
6713 rdata.append(i->first + "=" + i->second + "\n");
6714 }
6715 if (f) {
6716 f->close_section();
6717 ostringstream rs;
6718 f->flush(rs);
6719 rs << "\n";
6720 rdata.append(rs.str());
6721 }
6722 } else if (prefix == "osd pool application get") {
6723 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6724 "json-pretty"));
6725 string pool_name;
6726 cmd_getval(cmdmap, "pool", pool_name);
6727 string app;
6728 cmd_getval(cmdmap, "app", app);
6729 string key;
6730 cmd_getval(cmdmap, "key", key);
6731
6732 if (pool_name.empty()) {
6733 // all
6734 f->open_object_section("pools");
6735 for (const auto &pool : osdmap.pools) {
6736 std::string name("<unknown>");
6737 const auto &pni = osdmap.pool_name.find(pool.first);
6738 if (pni != osdmap.pool_name.end())
6739 name = pni->second;
6740 f->open_object_section(name.c_str());
6741 for (auto &app_pair : pool.second.application_metadata) {
6742 f->open_object_section(app_pair.first.c_str());
6743 for (auto &kv_pair : app_pair.second) {
6744 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6745 }
6746 f->close_section();
6747 }
6748 f->close_section(); // name
6749 }
6750 f->close_section(); // pools
6751 f->flush(rdata);
6752 } else {
6753 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6754 if (pool < 0) {
6755 ss << "unrecognized pool '" << pool_name << "'";
6756 r = -ENOENT;
6757 goto reply;
6758 }
6759 auto p = osdmap.get_pg_pool(pool);
6760 // filter by pool
6761 if (app.empty()) {
6762 f->open_object_section(pool_name.c_str());
6763 for (auto &app_pair : p->application_metadata) {
6764 f->open_object_section(app_pair.first.c_str());
6765 for (auto &kv_pair : app_pair.second) {
6766 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6767 }
6768 f->close_section(); // application
6769 }
6770 f->close_section(); // pool_name
6771 f->flush(rdata);
6772 goto reply;
6773 }
6774
6775 auto app_it = p->application_metadata.find(app);
6776 if (app_it == p->application_metadata.end()) {
6777 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6778 r = -ENOENT;
6779 goto reply;
6780 }
6781 // filter by pool + app
6782 if (key.empty()) {
6783 f->open_object_section(app_it->first.c_str());
6784 for (auto &kv_pair : app_it->second) {
6785 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6786 }
6787 f->close_section(); // application
6788 f->flush(rdata);
6789 goto reply;
6790 }
6791 // filter by pool + app + key
6792 auto key_it = app_it->second.find(key);
6793 if (key_it == app_it->second.end()) {
6794 ss << "application '" << app << "' on pool '" << pool_name
6795 << "' does not have key '" << key << "'";
6796 r = -ENOENT;
6797 goto reply;
6798 }
6799 ss << key_it->second << "\n";
6800 rdata.append(ss.str());
6801 ss.str("");
6802 }
6803 } else if (prefix == "osd get-require-min-compat-client") {
6804 ss << osdmap.require_min_compat_client << std::endl;
6805 rdata.append(ss.str());
6806 ss.str("");
6807 goto reply;
6808 } else if (prefix == "osd pool application enable" ||
6809 prefix == "osd pool application disable" ||
6810 prefix == "osd pool application set" ||
6811 prefix == "osd pool application rm") {
6812 bool changed = false;
6813 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6814 if (r != 0) {
6815 // Error, reply.
6816 goto reply;
6817 } else if (changed) {
6818 // Valid mutation, proceed to prepare phase
6819 return false;
6820 } else {
6821 // Idempotent case, reply
6822 goto reply;
6823 }
6824 } else {
6825 // try prepare update
6826 return false;
6827 }
6828
6829 reply:
6830 string rs;
6831 getline(ss, rs);
6832 mon->reply_command(op, r, rs, rdata, get_last_committed());
6833 return true;
6834 }
6835
6836 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6837 {
6838 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6839 osdmap.get_pg_pool(pool_id));
6840 ceph_assert(pool);
6841 pool->set_flag(flags);
6842 }
6843
6844 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6845 {
6846 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6847 osdmap.get_pg_pool(pool_id));
6848 ceph_assert(pool);
6849 pool->unset_flag(flags);
6850 }
6851
6852 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6853 {
6854 char k[80];
6855 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6856 return k;
6857 }
6858
6859 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6860 {
6861 char k[80];
6862 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6863 (unsigned long long)pool, (unsigned long long)snap);
6864 return k;
6865 }
6866
6867 string OSDMonitor::make_purged_snap_key_value(
6868 int64_t pool, snapid_t snap, snapid_t num,
6869 epoch_t epoch, bufferlist *v)
6870 {
6871 // encode the *last* epoch in the key so that we can use forward
6872 // iteration only to search for an epoch in an interval.
6873 encode(snap, *v);
6874 encode(snap + num, *v);
6875 encode(epoch, *v);
6876 return make_purged_snap_key(pool, snap + num - 1);
6877 }
6878
6879
// Look up the purged-snap interval (if any) containing 'snap' in 'pool'.
//
// Interval records are keyed by the *last* snapid of the interval (see
// make_purged_snap_key_value), so a lower_bound on the key for 'snap'
// lands on the only record whose interval could contain it.
//
// On success returns 0 and fills *begin/*end with the half-open
// interval [begin, end) that covers 'snap'.  Returns -ENOENT when no
// stored interval covers it.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // no key at or after 'k': nothing can cover this snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // lower_bound ran past the purged_snap_* keys into another key family
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // landed on a record belonging to a different pool
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value holds the encoded [begin, end) bounds (epoch follows, unused here)
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // record exists but its interval does not overlap 'snap'
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
6929
// Record that snaps [start, end) in 'pool' were purged, merging the new
// interval with any adjacent purged-snap records already in the store so
// that exactly one record covers each maximal contiguous run.
//
// NOTE(review): the 'epoch' parameter is never referenced below; every
// branch encodes pending_inc.epoch instead — confirm whether callers
// always pass pending_inc.epoch or whether 'epoch' should be used.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // Is there an existing record ending exactly at 'start' (covers start-1)?
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // Is there an existing record beginning exactly at 'end' (covers end)?
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // New interval bridges two existing records: merge all three into one.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // Extends an earlier record forward: replace it with the union.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // Extends a later record backward.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    // (keys embed the run's last snap, so the merged record lands on the
    // same key as the old 'after' record and put() replaces it in place)
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // No adjacent records: write a fresh one.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
6985
// Move snaps that the OSDs (via the mgr digest) report as purged out of
// the removed_snaps_queue, recording them as pruned in pending_inc.
// Returns true if anything was actually pruned this round.
bool OSDMonitor::try_prune_purged_snaps()
{
  // Need a fresh mgr digest to know what the OSDs have purged.
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // Cap per-epoch work; 0 in config means "effectively unlimited".
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    // Track the projected total so we can stop mid-pool at the cap.
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7064
7065 bool OSDMonitor::update_pools_status()
7066 {
7067 if (!mon->mgrstatmon()->is_readable())
7068 return false;
7069
7070 bool ret = false;
7071
7072 auto& pools = osdmap.get_pools();
7073 for (auto it = pools.begin(); it != pools.end(); ++it) {
7074 const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
7075 if (!pstat)
7076 continue;
7077 const object_stat_sum_t& sum = pstat->stats.sum;
7078 const pg_pool_t &pool = it->second;
7079 const string& pool_name = osdmap.get_pool_name(it->first);
7080
7081 bool pool_is_full =
7082 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
7083 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
7084
7085 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7086 if (pool_is_full)
7087 continue;
7088
7089 mon->clog->info() << "pool '" << pool_name
7090 << "' no longer out of quota; removing NO_QUOTA flag";
7091 // below we cancel FLAG_FULL too, we'll set it again in
7092 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7093 clear_pool_flags(it->first,
7094 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7095 ret = true;
7096 } else {
7097 if (!pool_is_full)
7098 continue;
7099
7100 if (pool.quota_max_bytes > 0 &&
7101 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
7102 mon->clog->warn() << "pool '" << pool_name << "' is full"
7103 << " (reached quota's max_bytes: "
7104 << byte_u_t(pool.quota_max_bytes) << ")";
7105 }
7106 if (pool.quota_max_objects > 0 &&
7107 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
7108 mon->clog->warn() << "pool '" << pool_name << "' is full"
7109 << " (reached quota's max_objects: "
7110 << pool.quota_max_objects << ")";
7111 }
7112 // set both FLAG_FULL_QUOTA and FLAG_FULL
7113 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7114 // since FLAG_FULL should always take precedence
7115 set_pool_flags(it->first,
7116 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7117 clear_pool_flags(it->first,
7118 pg_pool_t::FLAG_NEARFULL |
7119 pg_pool_t::FLAG_BACKFILLFULL);
7120 ret = true;
7121 }
7122 }
7123 return ret;
7124 }
7125
7126 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7127 {
7128 op->mark_osdmon_event(__func__);
7129 auto m = op->get_req<MPoolOp>();
7130 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7131 MonSession *session = op->get_session();
7132 if (!session)
7133 return -EPERM;
7134 string erasure_code_profile;
7135 stringstream ss;
7136 string rule_name;
7137 int ret = 0;
7138 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7139 0, 0, 0, 0, 0, 0.0,
7140 erasure_code_profile,
7141 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7142 &ss);
7143
7144 if (ret < 0) {
7145 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7146 }
7147 return ret;
7148 }
7149
7150 int OSDMonitor::crush_rename_bucket(const string& srcname,
7151 const string& dstname,
7152 ostream *ss)
7153 {
7154 int ret;
7155 //
7156 // Avoid creating a pending crush if it does not already exists and
7157 // the rename would fail.
7158 //
7159 if (!_have_pending_crush()) {
7160 ret = _get_stable_crush().can_rename_bucket(srcname,
7161 dstname,
7162 ss);
7163 if (ret)
7164 return ret;
7165 }
7166
7167 CrushWrapper newcrush;
7168 _get_pending_crush(newcrush);
7169
7170 ret = newcrush.rename_bucket(srcname,
7171 dstname,
7172 ss);
7173 if (ret)
7174 return ret;
7175
7176 pending_inc.crush.clear();
7177 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7178 *ss << "renamed bucket " << srcname << " into " << dstname;
7179 return 0;
7180 }
7181
7182 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7183 {
7184 string replacement = "";
7185
7186 if (plugin == "jerasure_generic" ||
7187 plugin == "jerasure_sse3" ||
7188 plugin == "jerasure_sse4" ||
7189 plugin == "jerasure_neon") {
7190 replacement = "jerasure";
7191 } else if (plugin == "shec_generic" ||
7192 plugin == "shec_sse3" ||
7193 plugin == "shec_sse4" ||
7194 plugin == "shec_neon") {
7195 replacement = "shec";
7196 }
7197
7198 if (replacement != "") {
7199 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7200 << plugin << " that has been deprecated. Please use "
7201 << replacement << " instead." << dendl;
7202 }
7203 }
7204
7205 int OSDMonitor::normalize_profile(const string& profilename,
7206 ErasureCodeProfile &profile,
7207 bool force,
7208 ostream *ss)
7209 {
7210 ErasureCodeInterfaceRef erasure_code;
7211 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7212 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7213 check_legacy_ec_plugin(plugin->second, profilename);
7214 int err = instance.factory(plugin->second,
7215 g_conf().get_val<std::string>("erasure_code_dir"),
7216 profile, &erasure_code, ss);
7217 if (err) {
7218 return err;
7219 }
7220
7221 err = erasure_code->init(profile, ss);
7222 if (err) {
7223 return err;
7224 }
7225
7226 auto it = profile.find("stripe_unit");
7227 if (it != profile.end()) {
7228 string err_str;
7229 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7230 if (!err_str.empty()) {
7231 *ss << "could not parse stripe_unit '" << it->second
7232 << "': " << err_str << std::endl;
7233 return -EINVAL;
7234 }
7235 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7236 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7237 if (chunk_size != stripe_unit) {
7238 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7239 << "alignment. Would be padded to " << chunk_size
7240 << std::endl;
7241 return -EINVAL;
7242 }
7243 if ((stripe_unit % 4096) != 0 && !force) {
7244 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7245 << "use --force to override this check" << std::endl;
7246 return -EINVAL;
7247 }
7248 }
7249 return 0;
7250 }
7251
7252 int OSDMonitor::crush_rule_create_erasure(const string &name,
7253 const string &profile,
7254 int *rule,
7255 ostream *ss)
7256 {
7257 int ruleid = osdmap.crush->get_rule_id(name);
7258 if (ruleid != -ENOENT) {
7259 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7260 return -EEXIST;
7261 }
7262
7263 CrushWrapper newcrush;
7264 _get_pending_crush(newcrush);
7265
7266 ruleid = newcrush.get_rule_id(name);
7267 if (ruleid != -ENOENT) {
7268 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7269 return -EALREADY;
7270 } else {
7271 ErasureCodeInterfaceRef erasure_code;
7272 int err = get_erasure_code(profile, &erasure_code, ss);
7273 if (err) {
7274 *ss << "failed to load plugin using profile " << profile << std::endl;
7275 return err;
7276 }
7277
7278 err = erasure_code->create_rule(name, newcrush, ss);
7279 erasure_code.reset();
7280 if (err < 0)
7281 return err;
7282 *rule = err;
7283 pending_inc.crush.clear();
7284 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7285 return 0;
7286 }
7287 }
7288
7289 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7290 ErasureCodeInterfaceRef *erasure_code,
7291 ostream *ss) const
7292 {
7293 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7294 return -EAGAIN;
7295 ErasureCodeProfile profile =
7296 osdmap.get_erasure_code_profile(erasure_code_profile);
7297 ErasureCodeProfile::const_iterator plugin =
7298 profile.find("plugin");
7299 if (plugin == profile.end()) {
7300 *ss << "cannot determine the erasure code plugin"
7301 << " because there is no 'plugin' entry in the erasure_code_profile "
7302 << profile << std::endl;
7303 return -EINVAL;
7304 }
7305 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7306 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7307 return instance.factory(plugin->second,
7308 g_conf().get_val<std::string>("erasure_code_dir"),
7309 profile, erasure_code, ss);
7310 }
7311
7312 int OSDMonitor::check_cluster_features(uint64_t features,
7313 stringstream &ss)
7314 {
7315 stringstream unsupported_ss;
7316 int unsupported_count = 0;
7317 if ((mon->get_quorum_con_features() & features) != features) {
7318 unsupported_ss << "the monitor cluster";
7319 ++unsupported_count;
7320 }
7321
7322 set<int32_t> up_osds;
7323 osdmap.get_up_osds(up_osds);
7324 for (set<int32_t>::iterator it = up_osds.begin();
7325 it != up_osds.end(); ++it) {
7326 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7327 if ((xi.features & features) != features) {
7328 if (unsupported_count > 0)
7329 unsupported_ss << ", ";
7330 unsupported_ss << "osd." << *it;
7331 unsupported_count ++;
7332 }
7333 }
7334
7335 if (unsupported_count > 0) {
7336 ss << "features " << features << " unsupported by: "
7337 << unsupported_ss.str();
7338 return -ENOTSUP;
7339 }
7340
7341 // check pending osd state, too!
7342 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7343 pending_inc.new_xinfo.begin();
7344 p != pending_inc.new_xinfo.end(); ++p) {
7345 const osd_xinfo_t &xi = p->second;
7346 if ((xi.features & features) != features) {
7347 dout(10) << __func__ << " pending osd." << p->first
7348 << " features are insufficient; retry" << dendl;
7349 return -EAGAIN;
7350 }
7351 }
7352
7353 return 0;
7354 }
7355
7356 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7357 stringstream& ss)
7358 {
7359 OSDMap::Incremental new_pending = pending_inc;
7360 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7361 OSDMap newmap;
7362 newmap.deepish_copy_from(osdmap);
7363 newmap.apply_incremental(new_pending);
7364
7365 // client compat
7366 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7367 auto mv = newmap.get_min_compat_client();
7368 if (mv > newmap.require_min_compat_client) {
7369 ss << "new crush map requires client version " << mv
7370 << " but require_min_compat_client is "
7371 << newmap.require_min_compat_client;
7372 return false;
7373 }
7374 }
7375
7376 // osd compat
7377 uint64_t features =
7378 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7379 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7380 stringstream features_ss;
7381 int r = check_cluster_features(features, features_ss);
7382 if (r) {
7383 ss << "Could not change CRUSH: " << features_ss.str();
7384 return false;
7385 }
7386
7387 return true;
7388 }
7389
7390 bool OSDMonitor::erasure_code_profile_in_use(
7391 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7392 const string &profile,
7393 ostream *ss)
7394 {
7395 bool found = false;
7396 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7397 p != pools.end();
7398 ++p) {
7399 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7400 *ss << osdmap.pool_name[p->first] << " ";
7401 found = true;
7402 }
7403 }
7404 if (found) {
7405 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7406 }
7407 return found;
7408 }
7409
7410 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7411 map<string,string> *erasure_code_profile_map,
7412 ostream *ss)
7413 {
7414 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7415 get_json_str_map,
7416 *ss,
7417 erasure_code_profile_map,
7418 true);
7419 if (r)
7420 return r;
7421 ceph_assert((*erasure_code_profile_map).count("plugin"));
7422 string default_plugin = (*erasure_code_profile_map)["plugin"];
7423 map<string,string> user_map;
7424 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7425 i != erasure_code_profile.end();
7426 ++i) {
7427 size_t equal = i->find('=');
7428 if (equal == string::npos) {
7429 user_map[*i] = string();
7430 (*erasure_code_profile_map)[*i] = string();
7431 } else {
7432 const string key = i->substr(0, equal);
7433 equal++;
7434 const string value = i->substr(equal);
7435 if (key.find("ruleset-") == 0) {
7436 *ss << "property '" << key << "' is no longer supported; try "
7437 << "'crush-" << key.substr(8) << "' instead";
7438 return -EINVAL;
7439 }
7440 user_map[key] = value;
7441 (*erasure_code_profile_map)[key] = value;
7442 }
7443 }
7444
7445 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7446 (*erasure_code_profile_map) = user_map;
7447
7448 return 0;
7449 }
7450
7451 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7452 const string &erasure_code_profile,
7453 uint8_t repl_size,
7454 unsigned *size, unsigned *min_size,
7455 ostream *ss)
7456 {
7457 int err = 0;
7458 switch (pool_type) {
7459 case pg_pool_t::TYPE_REPLICATED:
7460 if (repl_size == 0) {
7461 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7462 }
7463 *size = repl_size;
7464 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7465 break;
7466 case pg_pool_t::TYPE_ERASURE:
7467 {
7468 ErasureCodeInterfaceRef erasure_code;
7469 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7470 if (err == 0) {
7471 *size = erasure_code->get_chunk_count();
7472 *min_size =
7473 erasure_code->get_data_chunk_count() +
7474 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7475 assert(*min_size <= *size);
7476 assert(*min_size >= erasure_code->get_data_chunk_count());
7477 }
7478 }
7479 break;
7480 default:
7481 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7482 err = -EINVAL;
7483 break;
7484 }
7485 return err;
7486 }
7487
7488 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7489 const string &erasure_code_profile,
7490 uint32_t *stripe_width,
7491 ostream *ss)
7492 {
7493 int err = 0;
7494 switch (pool_type) {
7495 case pg_pool_t::TYPE_REPLICATED:
7496 // ignored
7497 break;
7498 case pg_pool_t::TYPE_ERASURE:
7499 {
7500 ErasureCodeProfile profile =
7501 osdmap.get_erasure_code_profile(erasure_code_profile);
7502 ErasureCodeInterfaceRef erasure_code;
7503 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7504 if (err)
7505 break;
7506 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7507 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7508 auto it = profile.find("stripe_unit");
7509 if (it != profile.end()) {
7510 string err_str;
7511 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7512 ceph_assert(err_str.empty());
7513 }
7514 *stripe_width = data_chunks *
7515 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7516 }
7517 break;
7518 default:
7519 *ss << "prepare_pool_stripe_width: "
7520 << pool_type << " is not a known pool type";
7521 err = -EINVAL;
7522 break;
7523 }
7524 return err;
7525 }
7526
// Resolve (or create) the CRUSH rule for a new pool.
// If *crush_rule >= 0 it is taken as an explicit rule id and merely
// validated; otherwise a rule is selected by name (replicated) or
// created from the erasure-code profile (erasure). May return -EAGAIN
// when a newly created/pending rule must be proposed before the pool
// creation can proceed.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  // Rule is pending but not yet committed; same handling as a
	  // freshly created rule (fall through to return -EAGAIN).
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // Rule already committed: usable right now, not an error here.
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // Explicit rule id supplied by the caller: just verify it exists.
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7588
7589 int OSDMonitor::get_crush_rule(const string &rule_name,
7590 int *crush_rule,
7591 ostream *ss)
7592 {
7593 int ret;
7594 ret = osdmap.crush->get_rule_id(rule_name);
7595 if (ret != -ENOENT) {
7596 // found it, use it
7597 *crush_rule = ret;
7598 } else {
7599 CrushWrapper newcrush;
7600 _get_pending_crush(newcrush);
7601
7602 ret = newcrush.get_rule_id(rule_name);
7603 if (ret != -ENOENT) {
7604 // found it, wait for it to be proposed
7605 dout(20) << __func__ << ": rule " << rule_name
7606 << " try again" << dendl;
7607 return -EAGAIN;
7608 } else {
7609 // Cannot find it , return error
7610 *ss << "specified rule " << rule_name << " doesn't exist";
7611 return ret;
7612 }
7613 }
7614 return 0;
7615 }
7616
7617 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7618 {
7619 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7620 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7621 auto max_pgs = max_pgs_per_osd * num_osds;
7622 uint64_t projected = 0;
7623 if (pool < 0) {
7624 projected += pg_num * size;
7625 }
7626 for (const auto& i : osdmap.get_pools()) {
7627 if (i.first == pool) {
7628 projected += pg_num * size;
7629 } else {
7630 projected += i.second.get_pg_num_target() * i.second.get_size();
7631 }
7632 }
7633 if (projected > max_pgs) {
7634 if (pool >= 0) {
7635 *ss << "pool id " << pool;
7636 }
7637 *ss << " pg_num " << pg_num << " size " << size
7638 << " would mean " << projected
7639 << " total pgs, which exceeds max " << max_pgs
7640 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7641 << " * num_in_osds " << num_osds << ")";
7642 return -ERANGE;
7643 }
7644 return 0;
7645 }
7646
/**
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min Minimum pg_num the autoscaler may shrink the pool to
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes Expected pool size hint for the autoscaler, or 0
 * @param target_size_ratio Expected pool capacity ratio hint, or 0.0
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REPLICATED
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode autoscale mode ("on"/"off"/"warn"), or "" for default
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 ostream *ss)
{
  // --- argument validation and defaulting ---
  if (name.length() == 0)
    return -EINVAL;
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;  // default pgp_num of 0 means "track pg_num"
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve/create the CRUSH rule (may return -EAGAIN to retry) ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Optionally smoke-test the rule by mapping a handful of inputs in a
  // forked child (bounded by the mon lease so we can't stall quorum).
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }

  // --- derive size/min_size and validate the PG budget ---
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // --- resolve the effective fast-read setting ---
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
    case FAST_READ_OFF:
      fread = false;
      break;
    case FAST_READ_ON:
      fread = true;
      break;
    case FAST_READ_DEFAULT:
      fread = g_conf()->osd_pool_default_ec_fast_read;
      break;
    default:
      *ss << "invalid fast_read setting: " << fast_read;
      return -EINVAL;
    }
  }

  // If a pool with this name is already pending creation, treat the
  // request as a no-op success (idempotent retry).
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate the new pool id and fill in the pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  // Autoscale mode: configured default, falling back to OFF.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Start with a capped pg_num and let pg_num_target drive growth to
  // the requested value.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  // An explicit per-pool autoscale mode overrides the default chosen above.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    pi->erasure_code_profile = erasure_code_profile;
  } else {
    pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (ratios stored in micro-units).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7863
7864 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7865 {
7866 op->mark_osdmon_event(__func__);
7867 ostringstream ss;
7868 if (pending_inc.new_flags < 0)
7869 pending_inc.new_flags = osdmap.get_flags();
7870 pending_inc.new_flags |= flag;
7871 ss << OSDMap::get_flag_string(flag) << " is set";
7872 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7873 get_last_committed() + 1));
7874 return true;
7875 }
7876
7877 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7878 {
7879 op->mark_osdmon_event(__func__);
7880 ostringstream ss;
7881 if (pending_inc.new_flags < 0)
7882 pending_inc.new_flags = osdmap.get_flags();
7883 pending_inc.new_flags &= ~flag;
7884 ss << OSDMap::get_flag_string(flag) << " is unset";
7885 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7886 get_last_committed() + 1));
7887 return true;
7888 }
7889
/**
 * Handle "ceph osd pool set <pool> <var> <val>".
 *
 * Validates <val> against the semantics of <var> and stages the resulting
 * pool definition in pending_inc.new_pools.  Returns 0 on success (including
 * idempotent no-ops) or a negative errno; on error, @p ss carries a
 * human-readable explanation for the client.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", and for some
 *               variables "yes_i_really_mean_it")
 * @param ss     output stream for the reply/error text
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // Start from the committed pool, but fold in any change already staged in
  // this proposal cycle so successive "pool set" commands compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0;  // micro-f
  cmd_getval(cmdmap, "val", val);

  // Variables accepting SI (base-10, e.g. "1M") unit suffixes...
  auto si_options = {
    "target_max_objects"
  };
  // ...and those accepting IEC (base-2, e.g. "1Mi") unit suffixes.
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // These variables are only meaningful on a cache tier pool; refuse to set
  // them on a base pool.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // min_size is derived from the new size via the configured policy.
    p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
        ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
        return -EINVAL;
      }
    } else {
      // For EC pools the lower bound is k (the data chunk count), not 1.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
        k = erasure_code->get_data_chunk_count();
      } else {
        ss << __func__ << " get_erasure_code failed: " << tmp.str();
        return err;
      }

      if (n < k || n > p.size) {
        ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
        return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      if (p.get_pg_num() != p.get_pg_num_pending()) {
        // force pre-nautilus clients to resend their ops, since they
        // don't understand pg_num_pending changes form a new interval
        p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Decreases go through pg_num_pending (PG merge), nautilus+ only.
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
        ss << "nautilus OSDs are required to adjust pg_num_pending";
        return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
        ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
        return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
        ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
           << ") - 1; only single pg decrease is currently supported";
        return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
        return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
        ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
        return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
        ss << "nautilus OSDs are required to decrease pg_num";
        return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
        p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
         << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
             var == "nosizechange" || var == "write_fadvise_dontneed" ||
             var == "noscrub" || var == "nodeep-scrub") {
    // Boolean pool flags that can be toggled without extra confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG, so require explicit confirmation.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE? this will remap all placement groups in this pool,"
            " this triggers large data movement,"
            " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      // Non-trivial hit set types require cluster-wide cache pool support.
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
        return err;
      if (val == "bloom") {
        BloomHitSet::Params *bsp = new BloomHitSet::Params;
        bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
        p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
        p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
        p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
        ss << "unrecognized hit_set type '" << val << "'";
        return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // One-way switch: only enabling is supported here.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
        !is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // One-way switch: the flag can be set but never cleared.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // stored in micro units (uf = f * 1e6, computed above)
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic key/value pool options: per-option validation first, then a
    // type-driven set/unset via the shared opt_desc_t switch below.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
        auto cmode = Compressor::get_comp_mode_type(val);
        if (!cmode) {
          ss << "unrecognized compression mode '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
        auto alg = Compressor::get_comp_alg_type(val);
        if (!alg) {
          ss << "unrecognized compression_algorithm '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
        ss << "error parsing float value '" << val << "': " << floaterr;
        return -EINVAL;
      }
      if (f < 0 || f > 1) {
        ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
        return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
        ss << "unrecognized csum_type '" << val << "'";
        return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
               var == "compression_min_blob_size" ||
               var == "csum_max_block" ||
               var == "csum_min_block") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
        auto alg = pg_pool_t::get_fingerprint_from_str(val);
        if (!alg) {
          ss << "unrecognized fingerprint_algorithm '" << val << "'";
          return -EINVAL;
        }
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
        ss << "error parsing unit value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
        ss << "must set require_osd_release to nautilus or "
           << "later before setting target_size_bytes";
        return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
        ss << "specified pg_num_min " << n
           << " > pg_num " << p.get_pg_num_target();
        return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
        ss << "error parsing int value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
        if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
          ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
             << " and " << OSD_POOL_PRIORITY_MAX;
          return -EINVAL;
        }
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
        ss << "pg_autoscale_bias must be between 0 and 1000";
        return -EINVAL;
      }
    }

    // Apply the (now validated) value according to the option's declared
    // type.  NOTE: a value of 0 (or 0.0) unsets the option rather than
    // storing an explicit zero.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
        ss << "error parsing integer value '" << val << "': " << interr;
        return -EINVAL;
      }
      if (n == 0) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
        ss << "error parsing floating point value '" << val << "': " << floaterr;
        return -EINVAL;
      }
      if (f == 0) {
        p.opts.unset(desc.key);
      } else {
        p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // Stage the updated pool and bump last_change so consumers notice.
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8535
8536 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8537 const cmdmap_t& cmdmap,
8538 stringstream& ss)
8539 {
8540 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8541 }
8542
8543 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8544 const cmdmap_t& cmdmap,
8545 stringstream& ss,
8546 bool *modified)
8547 {
8548 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8549 }
8550
8551
/**
 * Common logic for preprocess and prepare phases of pool application
 * tag commands.  In preprocess mode we're only detecting invalid
 * commands, and determining whether it was a modification or a no-op.
 * In prepare mode we're actually updating the pending state.
 *
 * @param prefix    full command prefix; the suffix ("enable", "disable",
 *                  "set", "rm") selects the operation
 * @param cmdmap    parsed command arguments ("pool", "app", "key", "value",
 *                  optionally "yes_i_really_mean_it")
 * @param ss        output stream for reply/error text
 * @param modified  if non-null, set to true when the command is a real
 *                  modification (no-op paths return before reaching it)
 * @param preparing true for the prepare phase (stage into pending_inc)
 * @return 0 on success or no-op, negative errno on error
 */
int OSDMonitor::_command_pool_application(const string &prefix,
                                          const cmdmap_t& cmdmap,
                                          stringstream& ss,
                                          bool *modified,
                                          bool preparing)
{
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (preparing) {
    // fold in any change to this pool already staged in this proposal cycle
    if (pending_inc.new_pools.count(pool)) {
      p = pending_inc.new_pools[pool];
    }
  }

  string app;
  cmd_getval(cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  // "all" is reserved (used as a wildcard elsewhere), so reject it as a
  // literal key or value.  key/value are fetched once here and reused by the
  // "set"/"rm" branches below (previously they were redundantly re-fetched
  // into shadowing locals).
  string key;
  cmd_getval(cmdmap, "key", key);
  if (key == "all") {
    ss << "key cannot be 'all'";
    return -EINVAL;
  }

  string value;
  cmd_getval(cmdmap, "value", value);
  if (value == "all") {
    ss << "value cannot be 'all'";
    return -EINVAL;
  }

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    // Enabling a second application is usually a mistake; require the force
    // flag when one is already present.
    if (!app_exists && !p.application_metadata.empty() && !force) {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
         << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
         << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "Are you SURE? Disabling an application within a pool might result "
         << "in loss of application functionality; pass "
         << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      // bug fix: report the offending key, not the application name
      ss << "key '" << key << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
         << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    ceph_abort();
  }

  if (preparing) {
    p.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool] = p;
  }

  // Because we fell through this far, we didn't hit no-op cases,
  // so pool was definitely modified
  if (modified != nullptr) {
    *modified = true;
  }

  return 0;
}
8735
8736 int OSDMonitor::_prepare_command_osd_crush_remove(
8737 CrushWrapper &newcrush,
8738 int32_t id,
8739 int32_t ancestor,
8740 bool has_ancestor,
8741 bool unlink_only)
8742 {
8743 int err = 0;
8744
8745 if (has_ancestor) {
8746 err = newcrush.remove_item_under(cct, id, ancestor,
8747 unlink_only);
8748 } else {
8749 err = newcrush.remove_item(cct, id, unlink_only);
8750 }
8751 return err;
8752 }
8753
// Stage an already-modified crush map into the pending incremental: any
// previously staged crush blob is discarded and replaced with the encoding
// of |newcrush| (using the quorum's connection feature bits).
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8759
8760 int OSDMonitor::prepare_command_osd_crush_remove(
8761 CrushWrapper &newcrush,
8762 int32_t id,
8763 int32_t ancestor,
8764 bool has_ancestor,
8765 bool unlink_only)
8766 {
8767 int err = _prepare_command_osd_crush_remove(
8768 newcrush, id, ancestor,
8769 has_ancestor, unlink_only);
8770
8771 if (err < 0)
8772 return err;
8773
8774 ceph_assert(err == 0);
8775 do_osd_crush_remove(newcrush);
8776
8777 return 0;
8778 }
8779
8780 int OSDMonitor::prepare_command_osd_remove(int32_t id)
8781 {
8782 if (osdmap.is_up(id)) {
8783 return -EBUSY;
8784 }
8785
8786 pending_inc.new_state[id] = osdmap.get_state(id);
8787 pending_inc.new_uuid[id] = uuid_d();
8788 pending_metadata_rm.insert(id);
8789 pending_metadata.erase(id);
8790
8791 return 0;
8792 }
8793
8794 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8795 {
8796 ceph_assert(existing_id);
8797 *existing_id = -1;
8798
8799 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8800 if (!osdmap.exists(i) &&
8801 pending_inc.new_up_client.count(i) == 0 &&
8802 (pending_inc.new_state.count(i) == 0 ||
8803 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8804 *existing_id = i;
8805 return -1;
8806 }
8807 }
8808
8809 if (pending_inc.new_max_osd < 0) {
8810 return osdmap.get_max_osd();
8811 }
8812 return pending_inc.new_max_osd;
8813 }
8814
/**
 * Stage creation of an OSD in the pending incremental.
 *
 * Resolves the id to use: an existing id matching @p uuid, the explicitly
 * requested @p id, a reusable free slot, or a brand-new id past max_osd.
 * Optionally assigns @p device_class in the crush map.  The chosen id is
 * returned through @p new_id.
 *
 * We presume validation has been performed prior to calling this
 * function. We assert with prejudice.
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    // a non-zero uuid may already be bound to an id (e.g. command replay)
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd (_allocate_osd_id returned -1)
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the requested device class on a scratch copy of the crush map
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8903
/**
 * Validate that an osd may be created with the given id and/or uuid.
 *
 * @param id                requested osd id, or negative for "any"
 * @param uuid              requested uuid; may be zero for legacy `osd create`
 * @param check_osd_exists  if true, an existing (non-destroyed) @p id is an
 *                          error; `osd new` passes false when recreating a
 *                          destroyed osd
 * @param existing_id       out: set to the osd already holding @p uuid when
 *                          this returns positive EEXIST
 *
 * @returns 0 if creation may proceed; positive EEXIST if the osd already
 *          exists and the request is idempotent; -EAGAIN if a pending
 *          (uncommitted) update already covers it; -EEXIST/-EINVAL on
 *          genuine conflicts.
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
	   << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
	 << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
8974
8975 int OSDMonitor::prepare_command_osd_create(
8976 const int32_t id,
8977 const uuid_d& uuid,
8978 int32_t* existing_id,
8979 stringstream& ss)
8980 {
8981 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8982 ceph_assert(existing_id);
8983 if (osdmap.is_destroyed(id)) {
8984 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8985 "instead.";
8986 return -EINVAL;
8987 }
8988
8989 if (uuid.is_zero()) {
8990 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8991 }
8992
8993 return validate_osd_create(id, uuid, true, existing_id, ss);
8994 }
8995
/**
 * Handle the `osd new` command: create a brand new osd or recreate a
 * previously destroyed one, optionally registering cephx/lockbox secrets
 * and a dm-crypt key.
 *
 * @param op      the monitor op carrying the command
 * @param cmdmap  parsed command arguments (`uuid` required, `id` optional)
 * @param params  secrets/extras from `-i`: cephx_secret,
 *                cephx_lockbox_secret, dmcrypt_key, crush_device_class
 * @param ss      human-readable output / error detail
 * @param f       optional formatter for structured output
 *
 * @returns 0 on a staged creation (caller must propose), positive EEXIST
 *          when the request is fully idempotent (nothing staged), or a
 *          negative errno on failure.
 *
 * Requires paxos to be plugged: this may touch osdmon, authmon and the
 * config-key service, and the caller coordinates the single proposal.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // positive EEXIST from validation means "already there, idempotent";
  // negative errnos are real failures.
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    //       `osd create`, and we must honor it. So this means checking if
    //       the `id` is destroyed, and if so assume the destroy; otherwise,
    //       check if it `exists` - in which case we complain about not being
    //       `destroyed`. In the end, if nothing fails, we must allow the
    //       creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // allocator found a reusable hole instead of a fresh id
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key must be supplied together or not
    // at all.
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      // (new_state is applied as a toggle mask, so setting the UP bit
      // here clears the stale UP flag — see the comment above.)
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9260
9261 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9262 {
9263 op->mark_osdmon_event(__func__);
9264 auto m = op->get_req<MMonCommand>();
9265 stringstream ss;
9266 cmdmap_t cmdmap;
9267 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9268 string rs = ss.str();
9269 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9270 return true;
9271 }
9272
9273 MonSession *session = op->get_session();
9274 if (!session) {
9275 derr << __func__ << " no session" << dendl;
9276 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9277 return true;
9278 }
9279
9280 return prepare_command_impl(op, cmdmap);
9281 }
9282
9283 static int parse_reweights(CephContext *cct,
9284 const cmdmap_t& cmdmap,
9285 const OSDMap& osdmap,
9286 map<int32_t, uint32_t>* weights)
9287 {
9288 string weights_str;
9289 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9290 return -EINVAL;
9291 }
9292 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9293 json_spirit::mValue json_value;
9294 if (!json_spirit::read(weights_str, json_value)) {
9295 return -EINVAL;
9296 }
9297 if (json_value.type() != json_spirit::obj_type) {
9298 return -EINVAL;
9299 }
9300 const auto obj = json_value.get_obj();
9301 try {
9302 for (auto& osd_weight : obj) {
9303 auto osd_id = std::stoi(osd_weight.first);
9304 if (!osdmap.exists(osd_id)) {
9305 return -ENOENT;
9306 }
9307 if (osd_weight.second.type() != json_spirit::str_type) {
9308 return -EINVAL;
9309 }
9310 auto weight = std::stoul(osd_weight.second.get_str());
9311 weights->insert({osd_id, weight});
9312 }
9313 } catch (const std::logic_error& e) {
9314 return -EINVAL;
9315 }
9316 return 0;
9317 }
9318
/**
 * Stage destruction of an osd: revoke its auth entities and config keys,
 * then mark it DESTROYED (keeping the id) in the pending incremental.
 *
 * @param id  osd to destroy; must exist (else -ENOENT) and not be up
 * @param ss  human-readable output
 *
 * @returns 0 on success or when already destroyed (idempotent);
 *          -ENOENT if the osd does not exist; other negative errnos
 *          from auth validation.
 *
 * Requires paxos to be plugged; the caller is responsible for proposing
 * (see the trailing comment), since `osd purge` also calls this as one
 * step of a larger update.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
	   << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // -ENOENT from either validator means the secrets/keys are already
  // gone, so the corresponding do_* step can be skipped (idempotency).
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9390
/**
 * Stage a full purge of an osd: destroy it (auth + config keys + DESTROYED
 * flag), remove it from the osdmap, and remove it from crush.
 *
 * @param id  osd to purge; must not be up (asserted)
 * @param ss  human-readable output
 *
 * @returns 0 on success; -ENOENT if the osd is already fully gone
 *          (idempotent no-op); other negative errnos from the crush or
 *          destroy steps before any state has been touched.
 *
 * Requires paxos to be plugged; the caller proposes once at the end.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (do_remove=false); the actual crush
  // update is applied last, once nothing can fail anymore.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  do_osd_crush_remove(newcrush);
  return 0;
}
9459
9460 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9461 const cmdmap_t& cmdmap)
9462 {
9463 op->mark_osdmon_event(__func__);
9464 auto m = op->get_req<MMonCommand>();
9465 bool ret = false;
9466 stringstream ss;
9467 string rs;
9468 bufferlist rdata;
9469 int err = 0;
9470
9471 string format;
9472 cmd_getval(cmdmap, "format", format, string("plain"));
9473 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9474
9475 string prefix;
9476 cmd_getval(cmdmap, "prefix", prefix);
9477
9478 int64_t osdid;
9479 string osd_name;
9480 bool osdid_present = false;
9481 if (prefix != "osd pg-temp" &&
9482 prefix != "osd pg-upmap" &&
9483 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9484 osdid_present = cmd_getval(cmdmap, "id", osdid);
9485 }
9486 if (osdid_present) {
9487 ostringstream oss;
9488 oss << "osd." << osdid;
9489 osd_name = oss.str();
9490 }
9491
9492 // Even if there's a pending state with changes that could affect
9493 // a command, considering that said state isn't yet committed, we
9494 // just don't care about those changes if the command currently being
9495 // handled acts as a no-op against the current committed state.
9496 // In a nutshell, we assume this command happens *before*.
9497 //
9498 // Let me make this clearer:
9499 //
9500 // - If we have only one client, and that client issues some
9501 // operation that would conflict with this operation but is
9502 // still on the pending state, then we would be sure that said
9503 // operation wouldn't have returned yet, so the client wouldn't
9504 // issue this operation (unless the client didn't wait for the
9505 // operation to finish, and that would be the client's own fault).
9506 //
9507 // - If we have more than one client, each client will observe
9508 // whatever is the state at the moment of the commit. So, if we
9509 // have two clients, one issuing an unlink and another issuing a
9510 // link, and if the link happens while the unlink is still on the
9511 // pending state, from the link's point-of-view this is a no-op.
9512 // If different clients are issuing conflicting operations and
9513 // they care about that, then the clients should make sure they
9514 // enforce some kind of concurrency mechanism -- from our
9515 // perspective that's what Douglas Adams would call an SEP.
9516 //
9517 // This should be used as a general guideline for most commands handled
9518 // in this function. Adapt as you see fit, but please bear in mind that
9519 // this is the expected behavior.
9520
9521
9522 if (prefix == "osd setcrushmap" ||
9523 (prefix == "osd crush set" && !osdid_present)) {
9524 if (pending_inc.crush.length()) {
9525 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9526 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9527 return true;
9528 }
9529 dout(10) << "prepare_command setting new crush map" << dendl;
9530 bufferlist data(m->get_data());
9531 CrushWrapper crush;
9532 try {
9533 auto bl = data.cbegin();
9534 crush.decode(bl);
9535 }
9536 catch (const std::exception &e) {
9537 err = -EINVAL;
9538 ss << "Failed to parse crushmap: " << e.what();
9539 goto reply;
9540 }
9541
9542 int64_t prior_version = 0;
9543 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9544 if (prior_version == osdmap.get_crush_version() - 1) {
9545 // see if we are a resend of the last update. this is imperfect
9546 // (multiple racing updaters may not both get reliable success)
9547 // but we expect crush updaters (via this interface) to be rare-ish.
9548 bufferlist current, proposed;
9549 osdmap.crush->encode(current, mon->get_quorum_con_features());
9550 crush.encode(proposed, mon->get_quorum_con_features());
9551 if (current.contents_equal(proposed)) {
9552 dout(10) << __func__
9553 << " proposed matches current and version equals previous"
9554 << dendl;
9555 err = 0;
9556 ss << osdmap.get_crush_version();
9557 goto reply;
9558 }
9559 }
9560 if (prior_version != osdmap.get_crush_version()) {
9561 err = -EPERM;
9562 ss << "prior_version " << prior_version << " != crush version "
9563 << osdmap.get_crush_version();
9564 goto reply;
9565 }
9566 }
9567
9568 if (crush.has_legacy_rule_ids()) {
9569 err = -EINVAL;
9570 ss << "crush maps with ruleset != ruleid are no longer allowed";
9571 goto reply;
9572 }
9573 if (!validate_crush_against_features(&crush, ss)) {
9574 err = -EINVAL;
9575 goto reply;
9576 }
9577
9578 err = osdmap.validate_crush_rules(&crush, &ss);
9579 if (err < 0) {
9580 goto reply;
9581 }
9582
9583 if (g_conf()->mon_osd_crush_smoke_test) {
9584 // sanity check: test some inputs to make sure this map isn't
9585 // totally broken
9586 dout(10) << " testing map" << dendl;
9587 stringstream ess;
9588 CrushTester tester(crush, ess);
9589 tester.set_min_x(0);
9590 tester.set_max_x(50);
9591 auto start = ceph::coarse_mono_clock::now();
9592 int r = tester.test_with_fork(g_conf()->mon_lease);
9593 auto duration = ceph::coarse_mono_clock::now() - start;
9594 if (r < 0) {
9595 dout(10) << " tester.test_with_fork returns " << r
9596 << ": " << ess.str() << dendl;
9597 ss << "crush smoke test failed with " << r << ": " << ess.str();
9598 err = r;
9599 goto reply;
9600 }
9601 dout(10) << __func__ << " crush somke test duration: "
9602 << duration << ", result: " << ess.str() << dendl;
9603 }
9604
9605 pending_inc.crush = data;
9606 ss << osdmap.get_crush_version() + 1;
9607 goto update;
9608
9609 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9610 CrushWrapper newcrush;
9611 _get_pending_crush(newcrush);
9612 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9613 int bid = -1 - b;
9614 if (newcrush.bucket_exists(bid) &&
9615 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9616 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9617 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9618 }
9619 }
9620 if (!validate_crush_against_features(&newcrush, ss)) {
9621 err = -EINVAL;
9622 goto reply;
9623 }
9624 pending_inc.crush.clear();
9625 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9626 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9627 get_last_committed() + 1));
9628 return true;
9629 } else if (prefix == "osd crush set-device-class") {
9630 string device_class;
9631 if (!cmd_getval(cmdmap, "class", device_class)) {
9632 err = -EINVAL; // no value!
9633 goto reply;
9634 }
9635
9636 bool stop = false;
9637 vector<string> idvec;
9638 cmd_getval(cmdmap, "ids", idvec);
9639 CrushWrapper newcrush;
9640 _get_pending_crush(newcrush);
9641 set<int> updated;
9642 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9643 set<int> osds;
9644 // wildcard?
9645 if (j == 0 &&
9646 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9647 osdmap.get_all_osds(osds);
9648 stop = true;
9649 } else {
9650 // try traditional single osd way
9651 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9652 if (osd < 0) {
9653 // ss has reason for failure
9654 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9655 err = -EINVAL;
9656 continue;
9657 }
9658 osds.insert(osd);
9659 }
9660
9661 for (auto &osd : osds) {
9662 if (!osdmap.exists(osd)) {
9663 ss << "osd." << osd << " does not exist. ";
9664 continue;
9665 }
9666
9667 ostringstream oss;
9668 oss << "osd." << osd;
9669 string name = oss.str();
9670
9671 if (newcrush.get_max_devices() < osd + 1) {
9672 newcrush.set_max_devices(osd + 1);
9673 }
9674 string action;
9675 if (newcrush.item_exists(osd)) {
9676 action = "updating";
9677 } else {
9678 action = "creating";
9679 newcrush.set_item_name(osd, name);
9680 }
9681
9682 dout(5) << action << " crush item id " << osd << " name '" << name
9683 << "' device_class '" << device_class << "'"
9684 << dendl;
9685 err = newcrush.update_device_class(osd, device_class, name, &ss);
9686 if (err < 0) {
9687 goto reply;
9688 }
9689 if (err == 0 && !_have_pending_crush()) {
9690 if (!stop) {
9691 // for single osd only, wildcard makes too much noise
9692 ss << "set-device-class item id " << osd << " name '" << name
9693 << "' device_class '" << device_class << "': no change. ";
9694 }
9695 } else {
9696 updated.insert(osd);
9697 }
9698 }
9699 }
9700
9701 if (!updated.empty()) {
9702 pending_inc.crush.clear();
9703 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9704 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9705 getline(ss, rs);
9706 wait_for_finished_proposal(op,
9707 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9708 return true;
9709 }
9710
9711 } else if (prefix == "osd crush rm-device-class") {
9712 bool stop = false;
9713 vector<string> idvec;
9714 cmd_getval(cmdmap, "ids", idvec);
9715 CrushWrapper newcrush;
9716 _get_pending_crush(newcrush);
9717 set<int> updated;
9718
9719 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9720 set<int> osds;
9721
9722 // wildcard?
9723 if (j == 0 &&
9724 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9725 osdmap.get_all_osds(osds);
9726 stop = true;
9727 } else {
9728 // try traditional single osd way
9729 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9730 if (osd < 0) {
9731 // ss has reason for failure
9732 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9733 err = -EINVAL;
9734 goto reply;
9735 }
9736 osds.insert(osd);
9737 }
9738
9739 for (auto &osd : osds) {
9740 if (!osdmap.exists(osd)) {
9741 ss << "osd." << osd << " does not exist. ";
9742 continue;
9743 }
9744
9745 auto class_name = newcrush.get_item_class(osd);
9746 if (!class_name) {
9747 ss << "osd." << osd << " belongs to no class, ";
9748 continue;
9749 }
9750 // note that we do not verify if class_is_in_use here
9751 // in case the device is misclassified and user wants
9752 // to overridely reset...
9753
9754 err = newcrush.remove_device_class(cct, osd, &ss);
9755 if (err < 0) {
9756 // ss has reason for failure
9757 goto reply;
9758 }
9759 updated.insert(osd);
9760 }
9761 }
9762
9763 if (!updated.empty()) {
9764 pending_inc.crush.clear();
9765 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9766 ss << "done removing class of osd(s): " << updated;
9767 getline(ss, rs);
9768 wait_for_finished_proposal(op,
9769 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9770 return true;
9771 }
9772 } else if (prefix == "osd crush class create") {
9773 string device_class;
9774 if (!cmd_getval(cmdmap, "class", device_class)) {
9775 err = -EINVAL; // no value!
9776 goto reply;
9777 }
9778 if (osdmap.require_osd_release < ceph_release_t::luminous) {
      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
         << "luminous' before using crush device classes";
      err = -EPERM;
      goto reply;
    }
    // Fast path: nothing pending and the committed map already has the
    // class -> reply immediately (idempotent no-op).
    if (!_have_pending_crush() &&
        _get_stable_crush().class_exists(device_class)) {
      ss << "class '" << device_class << "' already exists";
      goto reply;
    }
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.class_exists(device_class)) {
      // already created in the pending increment; wait for it to commit
      ss << "class '" << device_class << "' already exists";
      goto update;
    }
    int class_id = newcrush.get_or_create_class_id(device_class);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "created class " << device_class << " with id " << class_id
       << " to crush map";
    goto update;
  } else if (prefix == "osd crush class rm") {
    // Remove a CRUSH device class.  Refused while the class is still
    // referenced by any crush rule or erasure-code profile; otherwise the
    // class is stripped from every member OSD (or the bare class name is
    // removed when the class has no members) and the new map is proposed.
    string device_class;
    if (!cmd_getval(cmdmap, "class", device_class)) {
      err = -EINVAL; // no value!
      goto reply;
    }
    if (osdmap.require_osd_release < ceph_release_t::luminous) {
      ss << "you must complete the upgrade and 'ceph osd require-osd-release "
         << "luminous' before using crush device classes";
      err = -EPERM;
      goto reply;
    }

    if (!osdmap.crush->class_exists(device_class)) {
      // not in the committed map: nothing to do
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(device_class)) {
      // already removed in the pending increment; wait for the proposal to
      // commit so a replayed command still reports success
      err = 0; // make command idempotent
      goto wait;
    }
    int class_id = newcrush.get_class_id(device_class);
    stringstream ts;
    if (newcrush.class_is_in_use(class_id, &ts)) {
      err = -EBUSY;
      ss << "class '" << device_class << "' " << ts.str();
      goto reply;
    }

    // check if class is used by any erasure-code-profiles
    // (merge committed and pending profiles so a profile being created in
    // this same epoch also blocks the removal)
    mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
      osdmap.get_erasure_code_profiles();
    auto ec_profiles = pending_inc.get_erasure_code_profiles();
#ifdef HAVE_STDLIB_MAP_SPLICING
    ec_profiles.merge(old_ec_profiles);
#else
    ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
                       make_move_iterator(end(old_ec_profiles)));
#endif
    list<string> referenced_by;
    for (auto &i: ec_profiles) {
      for (auto &j: i.second) {
        if ("crush-device-class" == j.first && device_class == j.second) {
          referenced_by.push_back(i.first);
        }
      }
    }
    if (!referenced_by.empty()) {
      err = -EBUSY;
      ss << "class '" << device_class
         << "' is still referenced by erasure-code-profile(s): " << referenced_by;
      goto reply;
    }

    set<int> osds;
    newcrush.get_devices_by_class(device_class, &osds);
    for (auto& p: osds) {
      err = newcrush.remove_device_class(g_ceph_context, p, &ss);
      if (err < 0) {
        // ss has reason for failure
        goto reply;
      }
    }

    if (osds.empty()) {
      // empty class, remove directly
      err = newcrush.remove_class_name(device_class);
      if (err < 0) {
        ss << "class '" << device_class << "' cannot be removed '"
           << cpp_strerror(err) << "'";
        goto reply;
      }
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "removed class " << device_class << " with id " << class_id
       << " from crush map";
    goto update;
  } else if (prefix == "osd crush class rename") {
    // Rename a device class.  A replay (source gone, destination present)
    // is treated as success so the command stays idempotent.
    string srcname, dstname;
    if (!cmd_getval(cmdmap, "srcname", srcname)) {
      err = -EINVAL;
      goto reply;
    }
    if (!cmd_getval(cmdmap, "dstname", dstname)) {
      err = -EINVAL;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
      // suppose this is a replay and return success
      // so command is idempotent
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_class(srcname, dstname);
    if (err < 0) {
      ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
         << cpp_strerror(err);
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "rename class '" << srcname << "' to '" << dstname << "'";
    goto update;
  } else if (prefix == "osd crush add-bucket") {
    // osd crush add-bucket <name> <type> [<loc1> [<loc2> ...]]
    // Create a new empty bucket and optionally move it into a location.
    string name, typestr;
    vector<string> argvec;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "type", typestr);
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    if (!argvec.empty()) {
      CrushWrapper::parse_loc_map(argvec, &loc);
      dout(0) << "will create and move bucket '" << name
              << "' to location " << loc << dendl;
    }

    // Fast path: nothing pending and the committed map already has the
    // bucket name -> reply immediately (idempotent no-op).
    if (!_have_pending_crush() &&
        _get_stable_crush().name_exists(name)) {
      ss << "bucket '" << name << "' already exists";
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.name_exists(name)) {
      // already created in the pending increment; wait for it to commit
      ss << "bucket '" << name << "' already exists";
      goto update;
    }
    int type = newcrush.get_type_id(typestr);
    if (type < 0) {
      ss << "type '" << typestr << "' does not exist";
      err = -EINVAL;
      goto reply;
    }
    if (type == 0) {
      // type id 0 is reserved for devices (OSDs)
      ss << "type '" << typestr << "' is for devices, not buckets";
      err = -EINVAL;
      goto reply;
    }
    int bucketno;
    err = newcrush.add_bucket(0, 0,
                              CRUSH_HASH_DEFAULT, type, 0, NULL,
                              NULL, &bucketno);
    if (err < 0) {
      ss << "add_bucket error: '" << cpp_strerror(err) << "'";
      goto reply;
    }
    err = newcrush.set_item_name(bucketno, name);
    if (err < 0) {
      ss << "error setting bucket name to '" << name << "'";
      goto reply;
    }

    if (!loc.empty()) {
      if (!newcrush.check_item_loc(cct, bucketno, loc,
                                   (int *)NULL)) {
        err = newcrush.move_bucket(cct, bucketno, loc);
        if (err < 0) {
          ss << "error moving bucket '" << name << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << bucketno << " name '" << name
           << "' to location " << loc << " in crush map";
      }
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    if (loc.empty()) {
      ss << "added bucket " << name << " type " << typestr
         << " to crush map";
    } else {
      ss << "added bucket " << name << " type " << typestr
         << " to location " << loc;
    }
    goto update;
  } else if (prefix == "osd crush rename-bucket") {
    // osd crush rename-bucket <srcname> <dstname>
    string srcname, dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);

    err = crush_rename_bucket(srcname, dstname, &ss);
    if (err == -EALREADY) // equivalent to success for idempotency
      err = 0;
    if (err)
      goto reply;
    else
      goto update;
  } else if (prefix == "osd crush weight-set create" ||
             prefix == "osd crush weight-set create-compat") {
    // Create a weight-set (crush choose_args), either per-pool ("create")
    // or the single backward-compatible set ("create-compat").
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    int positions;
    if (newcrush.has_non_straw2_buckets()) {
      // weight-sets require all buckets to be straw2
      ss << "crush map contains one or more bucket(s) that are not straw2";
      err = -EPERM;
      goto reply;
    }
    if (prefix == "osd crush weight-set create") {
      if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
          osdmap.require_min_compat_client < ceph_release_t::luminous) {
        ss << "require_min_compat_client "
           << osdmap.require_min_compat_client
           << " < luminous, which is required for per-pool weight-sets. "
           << "Try 'ceph osd set-require-min-compat-client luminous' "
           << "before using the new interface";
        err = -EPERM;
        goto reply;
      }
      string poolname, mode;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      cmd_getval(cmdmap, "mode", mode);
      if (mode != "flat" && mode != "positional") {
        ss << "unrecognized weight-set mode '" << mode << "'";
        err = -EINVAL;
        goto reply;
      }
      // "positional" keeps one weight per replica position (pool size)
      positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
    } else {
      // compat weight-set: a single, pool-independent choose_args set
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      positions = 1;
    }
    if (!newcrush.create_choose_args(pool, positions)) {
      if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
        ss << "compat weight-set already created";
      } else {
        ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
           << "' already created";
      }
      goto reply;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set rm" ||
             prefix == "osd crush weight-set rm-compat") {
    // Remove the per-pool or compat weight-set (no-op if absent).
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set rm") {
      string poolname;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
    }
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set reweight" ||
             prefix == "osd crush weight-set reweight-compat") {
    // Set the weight-set weight(s) for one item; positional sets require
    // exactly one value per position.
    string poolname, item;
    vector<double> weight;
    cmd_getval(cmdmap, "pool", poolname);
    cmd_getval(cmdmap, "item", item);
    cmd_getval(cmdmap, "weight", weight);
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set reweight") {
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      if (!newcrush.have_choose_args(pool)) {
        ss << "no weight-set for pool '" << poolname << "'";
        err = -ENOENT;
        goto reply;
      }
      auto arg_map = newcrush.choose_args_get(pool);
      int positions = newcrush.get_choose_args_positions(arg_map);
      if (weight.size() != (size_t)positions) {
        ss << "must specify exact " << positions << " weight values";
        err = -EINVAL;
        goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      if (!newcrush.have_choose_args(pool)) {
        ss << "no backward-compatible weight-set";
        err = -ENOENT;
        goto reply;
      }
    }
    if (!newcrush.name_exists(item)) {
      ss << "item '" << item << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    err = newcrush.choose_args_adjust_item_weightf(
      cct,
      newcrush.choose_args_get(pool),
      newcrush.get_item_id(item),
      weight,
      &ss);
    if (err < 0) {
      goto reply;
    }
    err = 0;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;
  } else if (osdid_present &&
             (prefix == "osd crush set" || prefix == "osd crush add")) {
    // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
    // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
    // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]

    if (!osdmap.exists(osdid)) {
      err = -ENOENT;
      ss << osd_name
         << " does not exist. Create it before updating the crush map";
      goto reply;
    }

    double weight;
    if (!cmd_getval(cmdmap, "weight", weight)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    string args;
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // "set" only updates an existing crush item; "add" may create one
    if (prefix == "osd crush set"
        && !_get_stable_crush().item_exists(osdid)) {
      err = -ENOENT;
      ss << "unable to set item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc
         << ": does not exist";
      goto reply;
    }

    dout(5) << "adding/updating crush item id " << osdid << " name '"
            << osd_name << "' weight " << weight << " at location "
            << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string action;
    if (prefix == "osd crush set" ||
        newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
      action = "set";
      err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
    } else {
      action = "add";
      err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
      if (err == 0)
        err = 1;  // force err > 0 so the "no change" shortcut below is skipped
    }

    if (err < 0)
      goto reply;

    if (err == 0 && !_have_pending_crush()) {
      // update_item reported no change and nothing else is pending
      ss << action << " item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc << ": no change";
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
       << weight << " at location " << loc << " to crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush create-or-move") {
    do {
      // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
      if (!osdmap.exists(osdid)) {
        err = -ENOENT;
        ss << osd_name
           << " does not exist. create it before updating the crush map";
        goto reply;
      }

      double weight;
      if (!cmd_getval(cmdmap, "weight", weight)) {
        ss << "unable to parse weight value '"
           << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
        err = -EINVAL;
        goto reply;
      }

      string args;
      vector<string> argvec;
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "create-or-move crush item name '" << osd_name
              << "' initial_weight " << weight << " at location " << loc
              << dendl;

      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
                                         g_conf()->osd_crush_update_weight_set);
      if (err == 0) {
        // no change needed; break out and reply without a new proposal
        ss << "create-or-move updated item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        break;
      }
      if (err > 0) {
        pending_inc.crush.clear();
        newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        ss << "create-or-move updating item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                              get_last_committed() + 1));
        return true;
      }
    } while (false);

  } else if (prefix == "osd crush move") {
    do {
      // osd crush move <name> <loc1> [<loc2> ...]
      string name;
      vector<string> argvec;
      cmd_getval(cmdmap, "name", name);
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      if (!newcrush.name_exists(name)) {
        err = -ENOENT;
        ss << "item " << name << " does not exist";
        break;
      }
      int id = newcrush.get_item_id(name);

      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        if (id >= 0) {
          // non-negative ids are devices (OSDs); negative ids are buckets
          err = newcrush.create_or_move_item(
            cct, id, 0, name, loc,
            g_conf()->osd_crush_update_weight_set);
        } else {
          err = newcrush.move_bucket(cct, id, loc);
        }
        if (err >= 0) {
          ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
          getline(ss, rs);
          wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                                get_last_committed() + 1));
          return true;
        }
      } else {
        // already at the requested location
        ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
        err = 0;
      }
    } while (false);
  } else if (prefix == "osd crush swap-bucket") {
    // osd crush swap-bucket <source> <dest> [--yes-i-really-mean-it]
    // Swap two buckets via CrushWrapper::swap_bucket; safety checks below
    // can be overridden with --yes-i-really-mean-it.
    string source, dest;
    cmd_getval(cmdmap, "source", source);
    cmd_getval(cmdmap, "dest", dest);

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.name_exists(source)) {
      ss << "source item " << source << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (!newcrush.name_exists(dest)) {
      ss << "dest item " << dest << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    int sid = newcrush.get_item_id(source);
    int did = newcrush.get_item_id(dest);
    int sparent;
    // require the source to be an orphan (no parent) unless forced
    if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
      ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    // require matching bucket algorithms (e.g. straw2) unless forced
    if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
        !force) {
      ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
         << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
         << "; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    int r = newcrush.swap_bucket(cct, sid, did);
    if (r < 0) {
      ss << "failed to swap bucket contents: " << cpp_strerror(r);
      err = r;
      goto reply;
    }
    ss << "swapped bucket of " << source << " to " << dest;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    wait_for_finished_proposal(op,
                               new Monitor::C_Command(mon, op, err, ss.str(),
                                                      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush link") {
    // osd crush link <name> <loc1> [<loc2> ...]
    // Link an existing item in at an (additional) location.
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // Need an explicit check for name_exists because get_item_id returns
    // 0 on unfound.
    int id = osdmap.crush->get_item_id(name);
    if (!osdmap.crush->name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
    }
    if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
      ss << "no need to move item id " << id << " name '" << name
         << "' to location " << loc << " in crush map";
      err = 0;
      goto reply;
    }

    dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      // re-resolve against the pending map; it may differ from the stable one
      int id = newcrush.get_item_id(name);
      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        err = newcrush.link_bucket(cct, id, loc);
        if (err >= 0) {
          ss << "linked item id " << id << " name '" << name
             << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        } else {
          ss << "cannot link item id " << id << " name '" << name
             << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << id << " name '" << name
           << "' to location " << loc << " in crush map";
        err = 0;
      }
    }
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush rm" ||
             prefix == "osd crush remove" ||
             prefix == "osd crush unlink") {
    do {
      // osd crush rm <id> [ancestor]
      // "unlink" removes only the link under [ancestor]; "rm"/"remove"
      // remove the item itself.
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      string name;
      cmd_getval(cmdmap, "name", name);

      if (!osdmap.crush->name_exists(name)) {
        // not in the committed map: nothing to do
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        break;
      }
      if (!newcrush.name_exists(name)) {
        // already removed in the pending map; wait for the proposal so a
        // replayed command still reports success
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                              get_last_committed() + 1));
        return true;
      }
      int id = newcrush.get_item_id(name);
      int ancestor = 0;

      bool unlink_only = prefix == "osd crush unlink";
      string ancestor_str;
      if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
        if (!newcrush.name_exists(ancestor_str)) {
          err = -ENOENT;
          ss << "ancestor item '" << ancestor_str
             << "' does not appear in the crush map";
          break;
        }
        ancestor = newcrush.get_item_id(ancestor_str);
      }

      // NOTE(review): the bool argument is (ancestor < 0), i.e. whether a
      // valid ancestor bucket id was supplied — confirm against the
      // prepare_command_osd_crush_remove declaration.
      err = prepare_command_osd_crush_remove(
          newcrush,
          id, ancestor,
          (ancestor < 0), unlink_only);

      if (err == -ENOENT) {
        ss << "item " << id << " does not appear in that position";
        err = 0;
        break;
      }
      if (err == 0) {
        if (!unlink_only)
          pending_inc.new_crush_node_flags[id] = 0;  // clear node flags for the removed item
        ss << "removed item id " << id << " name '" << name << "' from crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                              get_last_committed() + 1));
        return true;
      }
    } while (false);

  } else if (prefix == "osd crush reweight-all") {
    // Recompute all bucket weights from the leaves up.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    newcrush.reweight(cct);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted crush hierarchy";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight") {
    // osd crush reweight <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    if (id < 0) {
      // negative ids are buckets; only leaves (devices) can be reweighted here
      ss << "device '" << name << "' is not a leaf in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_item_weightf(cct, id, w,
                                       g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted item id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight-subtree") {
    // osd crush reweight-subtree <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    if (id >= 0) {
      // non-negative ids are devices; this command needs a bucket (subtree)
      ss << "device '" << name << "' is not a subtree in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_subtree_weightf(cct, id, w,
                                          g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush tunables") {
    // Apply a named crush tunables profile.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "legacy" || profile == "argonaut") {
      newcrush.set_tunables_legacy();
    } else if (profile == "bobtail") {
      newcrush.set_tunables_bobtail();
    } else if (profile == "firefly") {
      newcrush.set_tunables_firefly();
    } else if (profile == "hammer") {
      newcrush.set_tunables_hammer();
    } else if (profile == "jewel") {
      newcrush.set_tunables_jewel();
    } else if (profile == "optimal") {
      newcrush.set_tunables_optimal();
    } else if (profile == "default") {
      newcrush.set_tunables_default();
    } else {
      ss << "unrecognized profile '" << profile << "'";
      err = -EINVAL;
      goto reply;
    }

    // reject a map requiring features the cluster does not support
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunables profile to " << profile;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush set-tunable") {
    // Set a single named tunable (currently only straw_calc_version).
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string tunable;
    cmd_getval(cmdmap, "tunable", tunable);

    int64_t value = -1;
    if (!cmd_getval(cmdmap, "value", value)) {
      err = -EINVAL;
      ss << "failed to parse integer value "
         << cmd_vartype_stringify(cmdmap.at("value"));
      goto reply;
    }

    if (tunable == "straw_calc_version") {
      if (value != 0 && value != 1) {
        ss << "value must be 0 or 1; got " << value;
        err = -EINVAL;
        goto reply;
      }
      newcrush.set_straw_calc_version(value);
    } else {
      ss << "unrecognized tunable '" << tunable << "'";
      err = -EINVAL;
      goto reply;
    }

    // reject a map requiring features the cluster does not support
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunable " << tunable << " to " << value;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-simple") {
    // osd crush rule create-simple <name> <root> <type> [<mode>]
    string name, root, type, mode;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "mode", mode);
    if (mode == "")
      mode = "firstn";

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
                                            pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-replicated") {
    // osd crush rule create-replicated <name> <root> <type> [<class>]
    // Like create-simple, but with a device class and fixed firstn mode.
    string name, root, type, device_class;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "class", device_class);

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains.
      // From the user's point of view, the rule name is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(
        name, root, type, device_class,
        "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd erasure-code-profile rm") {
    // Remove an erasure-code profile; refused while any pool references it.
    string name;
    cmd_getval(cmdmap, "name", name);

    // a pending pool may reference it; wait for the pending map to commit
    // before deciding
    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
      goto wait;

    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
      err = -EBUSY;
      goto reply;
    }

    if (osdmap.has_erasure_code_profile(name) ||
        pending_inc.new_erasure_code_profiles.count(name)) {
      if (osdmap.has_erasure_code_profile(name)) {
        pending_inc.old_erasure_code_profiles.push_back(name);
      } else {
        // profile only exists in the pending increment: cancel its creation
        dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
        pending_inc.new_erasure_code_profiles.erase(name);
      }

      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                            get_last_committed() + 1));
      return true;
    } else {
      ss << "erasure-code-profile " << name << " does not exist";
      err = 0;
      goto reply;
    }

  } else if (prefix == "osd erasure-code-profile set") {
    // Create or overwrite an erasure-code profile; overwriting an existing,
    // different profile requires --force.
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> profile;
    cmd_getval(cmdmap, "profile", profile);

    bool force = false;
    cmd_getval(cmdmap, "force", force);

    map<string,string> profile_map;
    err = parse_erasure_code_profile(profile, &profile_map, &ss);
    if (err)
      goto reply;
    if (profile_map.find("plugin") == profile_map.end()) {
      ss << "erasure-code-profile " << profile_map
         << " must contain a plugin entry" << std::endl;
      err = -EINVAL;
      goto reply;
    }
    string plugin = profile_map["plugin"];

    if (pending_inc.has_erasure_code_profile(name)) {
      // a change to this profile is already pending; retry after it commits
      dout(20) << "erasure code profile " << name << " try again" << dendl;
      goto wait;
    } else {
      err = normalize_profile(name, profile_map, force, &ss);
      if (err)
        goto reply;

      if (osdmap.has_erasure_code_profile(name)) {
        // normalize the existing profile too so that equivalent profiles
        // compare equal and a replayed command stays idempotent
        ErasureCodeProfile existing_profile_map =
          osdmap.get_erasure_code_profile(name);
        err = normalize_profile(name, existing_profile_map, force, &ss);
        if (err)
          goto reply;

        if (existing_profile_map == profile_map) {
          err = 0;
          goto reply;
        }
        if (!force) {
          err = -EPERM;
          ss << "will not override erasure code profile " << name
             << " because the existing profile "
             << existing_profile_map
             << " is different from the proposed profile "
             << profile_map;
          goto reply;
        }
      }

      dout(20) << "erasure code profile set " << name << "="
               << profile_map << dendl;
      pending_inc.set_erasure_code_profile(name, profile_map);
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-erasure") {
    // Create a CRUSH rule suitable for an erasure-coded pool, based on an
    // erasure-code profile (defaulting to "default").
    err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    // NOTE(review): 'poolstr' is declared but never assigned or used here.
    string name, poolstr;
    cmd_getval(cmdmap, "name", name);
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "")
      profile = "default";
    if (profile == "default") {
      // Lazily materialize the built-in default profile if it is neither in
      // the committed map nor already pending.
      if (!osdmap.has_erasure_code_profile(profile)) {
        if (pending_inc.has_erasure_code_profile(profile)) {
          dout(20) << "erasure code profile " << profile << " already pending" << dendl;
          goto wait;
        }

        map<string,string> profile_map;
        err = osdmap.get_erasure_code_profile_default(cct,
                                                      profile_map,
                                                      &ss);
        if (err)
          goto reply;
        err = normalize_profile(name, profile_map, true, &ss);
        if (err)
          goto reply;
        dout(20) << "erasure code profile set " << profile << "="
                 << profile_map << dendl;
        pending_inc.set_erasure_code_profile(profile, profile_map);
        // Wait for the profile to commit before creating the rule.
        goto wait;
      }
    }

    int rule;
    err = crush_rule_create_erasure(name, profile, &rule, &ss);
    if (err < 0) {
      switch(err) {
      case -EEXIST: // return immediately
        ss << "rule " << name << " already exists";
        err = 0;
        goto reply;
        break;
      case -EALREADY: // wait for pending to be proposed
        ss << "rule " << name << " already exists";
        err = 0;
        break;
      default: // non recoverable error
        goto reply;
        break;
      }
    } else {
      ss << "created rule " << name << " at " << rule;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rm") {
    // Remove a CRUSH rule by name; refuses if any pool still references it.
    string name;
    cmd_getval(cmdmap, "name", name);

    if (!osdmap.crush->rule_exists(name)) {
      // Idempotent: already gone in the committed map.
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    // Work on a copy that includes any already-pending crush changes.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // Removal already pending: report success without re-encoding.
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      ceph_assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_rule_in_use(ruleset)) {
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
        goto reply;
      }

      // Replace any previously-pending crush blob with the updated one.
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rename") {
    // Rename a CRUSH rule; idempotent with respect to command replay.
    string srcname;
    string dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);
    if (srcname.empty() || dstname.empty()) {
      ss << "must specify both source rule name and destination rule name";
      err = -EINVAL;
      goto reply;
    }
    if (srcname == dstname) {
      ss << "destination rule name is equal to source rule name";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
      // srcname does not exist and dstname already exists
      // suppose this is a replay and return success
      // (so this command is idempotent)
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_rule(srcname, dstname, &ss);
    if (err < 0) {
      // ss has reason for failure
      goto reply;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd setmaxosd") {
    // Set the maximum OSD id (array size), not the number of OSDs.
    int64_t newmax;
    if (!cmd_getval(cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
      err = -EINVAL;
      goto reply;
    }

    if (newmax > g_conf()->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
         << g_conf()->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
        if (osdmap.exists(i)) {
          err = -EBUSY;
          ss << "cannot shrink max_osd to " << newmax
             << " because osd." << i << " (and possibly others) still in use";
          goto reply;
        }
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;

  } else if (prefix == "osd set-full-ratio" ||
             prefix == "osd set-backfillfull-ratio" ||
             prefix == "osd set-nearfull-ratio") {
    // Update one of the cluster fullness thresholds in the pending map.
    // NOTE(review): no range validation on the ratio is done here.
    double n;
    if (!cmd_getval(cmdmap, "ratio", n)) {
      ss << "unable to parse 'ratio' value '"
         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
      err = -EINVAL;
      goto reply;
    }
    if (prefix == "osd set-full-ratio")
      pending_inc.new_full_ratio = n;
    else if (prefix == "osd set-backfillfull-ratio")
      pending_inc.new_backfillfull_ratio = n;
    else if (prefix == "osd set-nearfull-ratio")
      pending_inc.new_nearfull_ratio = n;
    ss << prefix << " " << n;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise the minimum client release the osdmap will accept.  Refuses if
    // currently-connected daemons/clients look older than the target,
    // unless --yes-i-really-mean-it is given.
    string v;
    cmd_getval(cmdmap, "version", v);
    ceph_release_t vno = ceph_release_from_name(v);
    if (!vno) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Build the would-be map (committed + pending) to compute the floor
    // imposed by features already in use.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    if (vno < mvno) {
      ss << "osdmap current utilizes features that require " << mvno
         << "; cannot set require_min_compat_client below that to " << vno;
      err = -EPERM;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      // Inspect connected sessions by entity type and flag any whose
      // advertised features fall short of the requested release.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << vno;
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // Shorthand for setting both read and write pause flags.
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a cluster-wide osdmap flag by name.  prepare_set_flag() queues
    // the change and replies after the proposal commits.
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);

    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else if (key == "pglog_hardlimit") {
      if (!osdmap.get_num_up_osds() && !sure) {
        ss << "Not advisable to continue since no OSDs are up. Pass "
           << "--yes-i-really-mean-it if you really wish to continue.";
        err = -EPERM;
        goto reply;
      }
      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
      // we are reusing a jewel feature bit that was retired in luminous.
      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
          (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
           || sure)) {
        return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
      } else {
        ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd unset") {
    // Clear a cluster-wide osdmap flag by name (mirror of "osd set";
    // pglog_hardlimit intentionally cannot be unset here).
    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }

  } else if (prefix == "osd require-osd-release") {
    // Raise require_osd_release.  Each supported release is gated on all
    // mons advertising the matching mon feature and (unless --yes-i-really-
    // mean-it) all up OSDs advertising the matching server feature.
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
    if (!osdmap.get_num_up_osds() && !sure) {
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply;
    }
    if (rel == ceph_release_t::mimic) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_MIMIC)) {
        ss << "not all mons are mimic";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::nautilus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_NAUTILUS)) {
        ss << "not all mons are nautilus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::octopus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_OCTOPUS)) {
        ss << "not all mons are octopus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    // Downgrades are never allowed, even with --yes-i-really-mean-it.
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
  } else if (prefix == "osd down" ||
             prefix == "osd out" ||
             prefix == "osd in" ||
             prefix == "osd rm" ||
             prefix == "osd stop") {
    // Per-OSD state commands.  Accepts a list of osd ids or a wildcard
    // ("any"/"all"/"*"); replies only if at least one change was queued.

    bool any = false;            // did we queue at least one change?
    bool stop = false;           // wildcard consumed; stop parsing ids
    bool verbose = true;
    bool definitely_dead = false;

    vector<string> idvec;
    cmd_getval(cmdmap, "ids", idvec);
    cmd_getval(cmdmap, "definitely_dead", definitely_dead);
    // NOTE(review): derr on every invocation looks like leftover debug
    // output at error severity; dout would be more appropriate.
    derr << "definitely_dead " << (int)definitely_dead << dendl;
    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
      set<int> osds;

      // wildcard?
      if (j == 0 &&
          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
        if (prefix == "osd in") {
          // touch out osds only
          osdmap.get_out_existing_osds(osds);
        } else {
          osdmap.get_all_osds(osds);
        }
        stop = true;
        verbose = false; // so the output is less noisy.
      } else {
        long osd = parse_osd_id(idvec[j].c_str(), &ss);
        if (osd < 0) {
          // NOTE(review): missing space in message ("invalid osd id-1").
          ss << "invalid osd id" << osd;
          err = -EINVAL;
          continue;
        } else if (!osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist. ";
          continue;
        }

        osds.insert(osd);
      }

      for (auto &osd : osds) {
        if (prefix == "osd down") {
          if (osdmap.is_down(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already down. ";
          } else {
            // presumably pending state bits are XORed into the map, so
            // setting CEPH_OSD_UP here marks an up osd down — TODO confirm
            // against OSDMap::Incremental semantics.
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
            ss << "marked down osd." << osd << ". ";
            any = true;
          }
          if (definitely_dead) {
            if (!pending_inc.new_xinfo.count(osd)) {
              pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
            }
            if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
              any = true;
            }
            pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
          }
        } else if (prefix == "osd out") {
          if (osdmap.is_out(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already out. ";
          } else {
            pending_inc.new_weight[osd] = CEPH_OSD_OUT;
            if (osdmap.osd_weight[osd]) {
              // Remember the previous weight so "osd in" can restore it.
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
            }
            ss << "marked out osd." << osd << ". ";
            std::ostringstream msg;
            msg << "Client " << op->get_session()->entity_name
                << " marked osd." << osd << " out";
            if (osdmap.is_up(osd)) {
              msg << ", while it was still marked up";
            } else {
              auto period = ceph_clock_now() - down_pending_out[osd];
              msg << ", after it was down for " << int(period.sec())
                  << " seconds";
            }

            mon->clog->info() << msg.str();
            any = true;
          }
        } else if (prefix == "osd in") {
          if (osdmap.is_in(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already in. ";
          } else {
            if (osdmap.osd_xinfo[osd].old_weight > 0) {
              // Restore the weight saved by a previous "osd out".
              pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = 0;
            } else {
              pending_inc.new_weight[osd] = CEPH_OSD_IN;
            }
            ss << "marked in osd." << osd << ". ";
            any = true;
          }
        } else if (prefix == "osd rm") {
          err = prepare_command_osd_remove(osd);

          if (err == -EBUSY) {
            if (any)
              ss << ", ";
            ss << "osd." << osd << " is still up; must be down before removal. ";
          } else {
            ceph_assert(err == 0);
            if (any) {
              ss << ", osd." << osd;
            } else {
              ss << "removed osd." << osd;
            }
            any = true;
          }
        } else if (prefix == "osd stop") {
          if (osdmap.is_stop(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already stopped. ";
          } else if (osdmap.is_down(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
            ss << "stop down osd." << osd << ". ";
            any = true;
          } else {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
            ss << "stop osd." << osd << ". ";
            any = true;
          }
        }
      }
    }
    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
                                                            get_last_committed() + 1));
      return true;
    }
11389 } else if (prefix == "osd set-group" ||
11390 prefix == "osd unset-group" ||
11391 prefix == "osd add-noup" ||
11392 prefix == "osd add-nodown" ||
11393 prefix == "osd add-noin" ||
11394 prefix == "osd add-noout" ||
11395 prefix == "osd rm-noup" ||
11396 prefix == "osd rm-nodown" ||
11397 prefix == "osd rm-noin" ||
11398 prefix == "osd rm-noout") {
11399 bool do_set = prefix == "osd set-group" ||
11400 prefix.find("add") != string::npos;
11401 string flag_str;
11402 unsigned flags = 0;
11403 vector<string> who;
11404 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11405 cmd_getval(cmdmap, "flags", flag_str);
11406 cmd_getval(cmdmap, "who", who);
11407 vector<string> raw_flags;
11408 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11409 for (auto& f : raw_flags) {
11410 if (f == "noup")
11411 flags |= CEPH_OSD_NOUP;
11412 else if (f == "nodown")
11413 flags |= CEPH_OSD_NODOWN;
11414 else if (f == "noin")
11415 flags |= CEPH_OSD_NOIN;
11416 else if (f == "noout")
11417 flags |= CEPH_OSD_NOOUT;
11418 else {
11419 ss << "unrecognized flag '" << f << "', must be one of "
11420 << "{noup,nodown,noin,noout}";
11421 err = -EINVAL;
11422 goto reply;
11423 }
11424 }
11425 } else {
11426 cmd_getval(cmdmap, "ids", who);
11427 if (prefix.find("noup") != string::npos)
11428 flags = CEPH_OSD_NOUP;
11429 else if (prefix.find("nodown") != string::npos)
11430 flags = CEPH_OSD_NODOWN;
11431 else if (prefix.find("noin") != string::npos)
11432 flags = CEPH_OSD_NOIN;
11433 else if (prefix.find("noout") != string::npos)
11434 flags = CEPH_OSD_NOOUT;
11435 else
11436 ceph_assert(0 == "Unreachable!");
11437 }
11438 if (flags == 0) {
11439 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11440 err = -EINVAL;
11441 goto reply;
11442 }
11443 if (who.empty()) {
11444 ss << "must specify at least one or more targets to set/unset";
11445 err = -EINVAL;
11446 goto reply;
11447 }
11448 set<int> osds;
11449 set<int> crush_nodes;
11450 set<int> device_classes;
11451 for (auto& w : who) {
11452 if (w == "any" || w == "all" || w == "*") {
11453 osdmap.get_all_osds(osds);
11454 break;
11455 }
11456 std::stringstream ts;
11457 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11458 osds.insert(osd);
11459 } else if (osdmap.crush->name_exists(w)) {
11460 crush_nodes.insert(osdmap.crush->get_item_id(w));
11461 } else if (osdmap.crush->class_exists(w)) {
11462 device_classes.insert(osdmap.crush->get_class_id(w));
11463 } else {
11464 ss << "unable to parse osd id or crush node or device class: "
11465 << "\"" << w << "\". ";
11466 }
11467 }
11468 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11469 // ss has reason for failure
11470 err = -EINVAL;
11471 goto reply;
11472 }
11473 bool any = false;
11474 for (auto osd : osds) {
11475 if (!osdmap.exists(osd)) {
11476 ss << "osd." << osd << " does not exist. ";
11477 continue;
11478 }
11479 if (do_set) {
11480 if (flags & CEPH_OSD_NOUP) {
11481 any |= osdmap.is_noup_by_osd(osd) ?
11482 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11483 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11484 }
11485 if (flags & CEPH_OSD_NODOWN) {
11486 any |= osdmap.is_nodown_by_osd(osd) ?
11487 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11488 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11489 }
11490 if (flags & CEPH_OSD_NOIN) {
11491 any |= osdmap.is_noin_by_osd(osd) ?
11492 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11493 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11494 }
11495 if (flags & CEPH_OSD_NOOUT) {
11496 any |= osdmap.is_noout_by_osd(osd) ?
11497 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11498 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11499 }
11500 } else {
11501 if (flags & CEPH_OSD_NOUP) {
11502 any |= osdmap.is_noup_by_osd(osd) ?
11503 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11504 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11505 }
11506 if (flags & CEPH_OSD_NODOWN) {
11507 any |= osdmap.is_nodown_by_osd(osd) ?
11508 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11509 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11510 }
11511 if (flags & CEPH_OSD_NOIN) {
11512 any |= osdmap.is_noin_by_osd(osd) ?
11513 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11514 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11515 }
11516 if (flags & CEPH_OSD_NOOUT) {
11517 any |= osdmap.is_noout_by_osd(osd) ?
11518 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11519 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11520 }
11521 }
11522 }
11523 for (auto& id : crush_nodes) {
11524 auto old_flags = osdmap.get_crush_node_flags(id);
11525 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11526 pending_flags |= old_flags; // adopt existing flags first!
11527 if (do_set) {
11528 pending_flags |= flags;
11529 } else {
11530 pending_flags &= ~flags;
11531 }
11532 any = true;
11533 }
11534 for (auto& id : device_classes) {
11535 auto old_flags = osdmap.get_device_class_flags(id);
11536 auto& pending_flags = pending_inc.new_device_class_flags[id];
11537 pending_flags |= old_flags;
11538 if (do_set) {
11539 pending_flags |= flags;
11540 } else {
11541 pending_flags &= ~flags;
11542 }
11543 any = true;
11544 }
11545 if (any) {
11546 getline(ss, rs);
11547 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11548 get_last_committed() + 1));
11549 return true;
11550 }
  } else if (prefix == "osd pg-temp") {
    // Manually set (or clear, with an empty id list) the pg_temp mapping
    // for a single PG.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.new_pg_temp.count(pgid)) {
      // A pg_temp change for this PG is already queued; retry after commit.
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty()) {
      // Empty list clears the mapping.
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // The proposed acting set must respect the pool's min_size/size bounds.
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Manually set the primary_temp for a single PG (-1 clears it).
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 means "clear"; any other id must exist.
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by perturbing its pg_temp mapping.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change. Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
11703 } else if (prefix == "osd pg-upmap" ||
11704 prefix == "osd rm-pg-upmap" ||
11705 prefix == "osd pg-upmap-items" ||
11706 prefix == "osd rm-pg-upmap-items") {
11707 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
11708 ss << "min_compat_client "
11709 << osdmap.require_min_compat_client
11710 << " < luminous, which is required for pg-upmap. "
11711 << "Try 'ceph osd set-require-min-compat-client luminous' "
11712 << "before using the new interface";
11713 err = -EPERM;
11714 goto reply;
11715 }
11716 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
11717 if (err == -EAGAIN)
11718 goto wait;
11719 if (err < 0)
11720 goto reply;
11721 string pgidstr;
11722 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11723 ss << "unable to parse 'pgid' value '"
11724 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11725 err = -EINVAL;
11726 goto reply;
11727 }
11728 pg_t pgid;
11729 if (!pgid.parse(pgidstr.c_str())) {
11730 ss << "invalid pgid '" << pgidstr << "'";
11731 err = -EINVAL;
11732 goto reply;
11733 }
11734 if (!osdmap.pg_exists(pgid)) {
11735 ss << "pg " << pgid << " does not exist";
11736 err = -ENOENT;
11737 goto reply;
11738 }
11739 if (pending_inc.old_pools.count(pgid.pool())) {
11740 ss << "pool of " << pgid << " is pending removal";
11741 err = -ENOENT;
11742 getline(ss, rs);
11743 wait_for_finished_proposal(op,
11744 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
11745 return true;
11746 }
11747
11748 enum {
11749 OP_PG_UPMAP,
11750 OP_RM_PG_UPMAP,
11751 OP_PG_UPMAP_ITEMS,
11752 OP_RM_PG_UPMAP_ITEMS,
11753 } option;
11754
11755 if (prefix == "osd pg-upmap") {
11756 option = OP_PG_UPMAP;
11757 } else if (prefix == "osd rm-pg-upmap") {
11758 option = OP_RM_PG_UPMAP;
11759 } else if (prefix == "osd pg-upmap-items") {
11760 option = OP_PG_UPMAP_ITEMS;
11761 } else {
11762 option = OP_RM_PG_UPMAP_ITEMS;
11763 }
11764
11765 // check pending upmap changes
11766 switch (option) {
11767 case OP_PG_UPMAP: // fall through
11768 case OP_RM_PG_UPMAP:
11769 if (pending_inc.new_pg_upmap.count(pgid) ||
11770 pending_inc.old_pg_upmap.count(pgid)) {
11771 dout(10) << __func__ << " waiting for pending update on "
11772 << pgid << dendl;
11773 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11774 return true;
11775 }
11776 break;
11777
11778 case OP_PG_UPMAP_ITEMS: // fall through
11779 case OP_RM_PG_UPMAP_ITEMS:
11780 if (pending_inc.new_pg_upmap_items.count(pgid) ||
11781 pending_inc.old_pg_upmap_items.count(pgid)) {
11782 dout(10) << __func__ << " waiting for pending update on "
11783 << pgid << dendl;
11784 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11785 return true;
11786 }
11787 break;
11788
11789 default:
11790 ceph_abort_msg("invalid option");
11791 }
11792
11793 switch (option) {
11794 case OP_PG_UPMAP:
11795 {
11796 vector<int64_t> id_vec;
11797 if (!cmd_getval(cmdmap, "id", id_vec)) {
11798 ss << "unable to parse 'id' value(s) '"
11799 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11800 err = -EINVAL;
11801 goto reply;
11802 }
11803
11804 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11805 if ((int)id_vec.size() < pool_min_size) {
11806 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
11807 << pool_min_size << ")";
11808 err = -EINVAL;
11809 goto reply;
11810 }
11811
11812 int pool_size = osdmap.get_pg_pool_size(pgid);
11813 if ((int)id_vec.size() > pool_size) {
11814 ss << "num of osds (" << id_vec.size() <<") > pool size ("
11815 << pool_size << ")";
11816 err = -EINVAL;
11817 goto reply;
11818 }
11819
11820 vector<int32_t> new_pg_upmap;
11821 for (auto osd : id_vec) {
11822 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
11823 ss << "osd." << osd << " does not exist";
11824 err = -ENOENT;
11825 goto reply;
11826 }
11827 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
11828 if (it != new_pg_upmap.end()) {
11829 ss << "osd." << osd << " already exists, ";
11830 continue;
11831 }
11832 new_pg_upmap.push_back(osd);
11833 }
11834
11835 if (new_pg_upmap.empty()) {
11836 ss << "no valid upmap items(pairs) is specified";
11837 err = -EINVAL;
11838 goto reply;
11839 }
11840
11841 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
11842 new_pg_upmap.begin(), new_pg_upmap.end());
11843 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
11844 }
11845 break;
11846
11847 case OP_RM_PG_UPMAP:
11848 {
11849 pending_inc.old_pg_upmap.insert(pgid);
11850 ss << "clear " << pgid << " pg_upmap mapping";
11851 }
11852 break;
11853
11854 case OP_PG_UPMAP_ITEMS:
11855 {
11856 vector<int64_t> id_vec;
11857 if (!cmd_getval(cmdmap, "id", id_vec)) {
11858 ss << "unable to parse 'id' value(s) '"
11859 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11860 err = -EINVAL;
11861 goto reply;
11862 }
11863
11864 if (id_vec.size() % 2) {
11865 ss << "you must specify pairs of osd ids to be remapped";
11866 err = -EINVAL;
11867 goto reply;
11868 }
11869
11870 int pool_size = osdmap.get_pg_pool_size(pgid);
11871 if ((int)(id_vec.size() / 2) > pool_size) {
11872 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11873 << pool_size << ")";
11874 err = -EINVAL;
11875 goto reply;
11876 }
11877
11878 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11879 ostringstream items;
11880 items << "[";
11881 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11882 int from = *p++;
11883 int to = *p;
11884 if (from == to) {
11885 ss << "from osd." << from << " == to osd." << to << ", ";
11886 continue;
11887 }
11888 if (!osdmap.exists(from)) {
11889 ss << "osd." << from << " does not exist";
11890 err = -ENOENT;
11891 goto reply;
11892 }
11893 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11894 ss << "osd." << to << " does not exist";
11895 err = -ENOENT;
11896 goto reply;
11897 }
11898 pair<int32_t,int32_t> entry = make_pair(from, to);
11899 auto it = std::find(new_pg_upmap_items.begin(),
11900 new_pg_upmap_items.end(), entry);
11901 if (it != new_pg_upmap_items.end()) {
11902 ss << "osd." << from << " -> osd." << to << " already exists, ";
11903 continue;
11904 }
11905 new_pg_upmap_items.push_back(entry);
11906 items << from << "->" << to << ",";
11907 }
11908 string out(items.str());
11909 out.resize(out.size() - 1); // drop last ','
11910 out += "]";
11911
11912 if (new_pg_upmap_items.empty()) {
11913 ss << "no valid upmap items(pairs) is specified";
11914 err = -EINVAL;
11915 goto reply;
11916 }
11917
11918 pending_inc.new_pg_upmap_items[pgid] =
11919 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11920 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11921 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11922 }
11923 break;
11924
11925 case OP_RM_PG_UPMAP_ITEMS:
11926 {
11927 pending_inc.old_pg_upmap_items.insert(pgid);
11928 ss << "clear " << pgid << " pg_upmap_items mapping";
11929 }
11930 break;
11931
11932 default:
11933 ceph_abort_msg("invalid option");
11934 }
11935
11936 goto update;
11937 } else if (prefix == "osd primary-affinity") {
11938 int64_t id;
11939 if (!cmd_getval(cmdmap, "id", id)) {
11940 ss << "invalid osd id value '"
11941 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11942 err = -EINVAL;
11943 goto reply;
11944 }
11945 double w;
11946 if (!cmd_getval(cmdmap, "weight", w)) {
11947 ss << "unable to parse 'weight' value '"
11948 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11949 err = -EINVAL;
11950 goto reply;
11951 }
11952 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11953 if (ww < 0L) {
11954 ss << "weight must be >= 0";
11955 err = -EINVAL;
11956 goto reply;
11957 }
11958 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11959 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11960 ss << "require_min_compat_client "
11961 << osdmap.require_min_compat_client
11962 << " < firefly, which is required for primary-affinity";
11963 err = -EPERM;
11964 goto reply;
11965 }
11966 if (osdmap.exists(id)) {
11967 pending_inc.new_primary_affinity[id] = ww;
11968 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11969 getline(ss, rs);
11970 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11971 get_last_committed() + 1));
11972 return true;
11973 } else {
11974 ss << "osd." << id << " does not exist";
11975 err = -ENOENT;
11976 goto reply;
11977 }
11978 } else if (prefix == "osd reweight") {
11979 int64_t id;
11980 if (!cmd_getval(cmdmap, "id", id)) {
11981 ss << "unable to parse osd id value '"
11982 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11983 err = -EINVAL;
11984 goto reply;
11985 }
11986 double w;
11987 if (!cmd_getval(cmdmap, "weight", w)) {
11988 ss << "unable to parse weight value '"
11989 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11990 err = -EINVAL;
11991 goto reply;
11992 }
11993 long ww = (int)((double)CEPH_OSD_IN*w);
11994 if (ww < 0L) {
11995 ss << "weight must be >= 0";
11996 err = -EINVAL;
11997 goto reply;
11998 }
11999 if (osdmap.exists(id)) {
12000 pending_inc.new_weight[id] = ww;
12001 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12002 getline(ss, rs);
12003 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12004 get_last_committed() + 1));
12005 return true;
12006 } else {
12007 ss << "osd." << id << " does not exist";
12008 err = -ENOENT;
12009 goto reply;
12010 }
12011 } else if (prefix == "osd reweightn") {
12012 map<int32_t, uint32_t> weights;
12013 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12014 if (err) {
12015 ss << "unable to parse 'weights' value '"
12016 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12017 goto reply;
12018 }
12019 pending_inc.new_weight.insert(weights.begin(), weights.end());
12020 wait_for_finished_proposal(
12021 op,
12022 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12023 return true;
12024 } else if (prefix == "osd lost") {
12025 int64_t id;
12026 if (!cmd_getval(cmdmap, "id", id)) {
12027 ss << "unable to parse osd id value '"
12028 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12029 err = -EINVAL;
12030 goto reply;
12031 }
12032 bool sure = false;
12033 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12034 if (!sure) {
12035 ss << "are you SURE? this might mean real, permanent data loss. pass "
12036 "--yes-i-really-mean-it if you really do.";
12037 err = -EPERM;
12038 goto reply;
12039 } else if (!osdmap.exists(id)) {
12040 ss << "osd." << id << " does not exist";
12041 err = -ENOENT;
12042 goto reply;
12043 } else if (!osdmap.is_down(id)) {
12044 ss << "osd." << id << " is not down";
12045 err = -EBUSY;
12046 goto reply;
12047 } else {
12048 epoch_t e = osdmap.get_info(id).down_at;
12049 pending_inc.new_lost[id] = e;
12050 ss << "marked osd lost in epoch " << e;
12051 getline(ss, rs);
12052 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12053 get_last_committed() + 1));
12054 return true;
12055 }
12056
12057 } else if (prefix == "osd destroy-actual" ||
12058 prefix == "osd purge-actual" ||
12059 prefix == "osd purge-new") {
12060 /* Destroying an OSD means that we don't expect to further make use of
12061 * the OSDs data (which may even become unreadable after this operation),
12062 * and that we are okay with scrubbing all its cephx keys and config-key
12063 * data (which may include lockbox keys, thus rendering the osd's data
12064 * unreadable).
12065 *
12066 * The OSD will not be removed. Instead, we will mark it as destroyed,
12067 * such that a subsequent call to `create` will not reuse the osd id.
12068 * This will play into being able to recreate the OSD, at the same
12069 * crush location, with minimal data movement.
12070 */
12071
12072 // make sure authmon is writeable.
12073 if (!mon->authmon()->is_writeable()) {
12074 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12075 << "osd destroy" << dendl;
12076 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12077 return false;
12078 }
12079
12080 int64_t id;
12081 if (!cmd_getval(cmdmap, "id", id)) {
12082 auto p = cmdmap.find("id");
12083 if (p == cmdmap.end()) {
12084 ss << "no osd id specified";
12085 } else {
12086 ss << "unable to parse osd id value '"
12087 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12088 }
12089 err = -EINVAL;
12090 goto reply;
12091 }
12092
12093 bool is_destroy = (prefix == "osd destroy-actual");
12094 if (!is_destroy) {
12095 ceph_assert("osd purge-actual" == prefix ||
12096 "osd purge-new" == prefix);
12097 }
12098
12099 bool sure = false;
12100 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12101 if (!sure) {
12102 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12103 << "This will mean real, permanent data loss, as well "
12104 << "as deletion of cephx and lockbox keys. "
12105 << "Pass --yes-i-really-mean-it if you really do.";
12106 err = -EPERM;
12107 goto reply;
12108 } else if (!osdmap.exists(id)) {
12109 ss << "osd." << id << " does not exist";
12110 err = 0; // idempotent
12111 goto reply;
12112 } else if (osdmap.is_up(id)) {
12113 ss << "osd." << id << " is not `down`.";
12114 err = -EBUSY;
12115 goto reply;
12116 } else if (is_destroy && osdmap.is_destroyed(id)) {
12117 ss << "destroyed osd." << id;
12118 err = 0;
12119 goto reply;
12120 }
12121
12122 if (prefix == "osd purge-new" &&
12123 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12124 ss << "osd." << id << " is not new";
12125 err = -EPERM;
12126 goto reply;
12127 }
12128
12129 bool goto_reply = false;
12130
12131 paxos->plug();
12132 if (is_destroy) {
12133 err = prepare_command_osd_destroy(id, ss);
12134 // we checked above that it should exist.
12135 ceph_assert(err != -ENOENT);
12136 } else {
12137 err = prepare_command_osd_purge(id, ss);
12138 if (err == -ENOENT) {
12139 err = 0;
12140 ss << "osd." << id << " does not exist.";
12141 goto_reply = true;
12142 }
12143 }
12144 paxos->unplug();
12145
12146 if (err < 0 || goto_reply) {
12147 goto reply;
12148 }
12149
12150 if (is_destroy) {
12151 ss << "destroyed osd." << id;
12152 } else {
12153 ss << "purged osd." << id;
12154 }
12155
12156 getline(ss, rs);
12157 wait_for_finished_proposal(op,
12158 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12159 force_immediate_propose();
12160 return true;
12161
12162 } else if (prefix == "osd new") {
12163
12164 // make sure authmon is writeable.
12165 if (!mon->authmon()->is_writeable()) {
12166 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12167 << "osd new" << dendl;
12168 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12169 return false;
12170 }
12171
12172 map<string,string> param_map;
12173
12174 bufferlist bl = m->get_data();
12175 string param_json = bl.to_str();
12176 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12177
12178 err = get_json_str_map(param_json, ss, &param_map);
12179 if (err < 0)
12180 goto reply;
12181
12182 dout(20) << __func__ << " osd new params " << param_map << dendl;
12183
12184 paxos->plug();
12185 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12186 paxos->unplug();
12187
12188 if (err < 0) {
12189 goto reply;
12190 }
12191
12192 if (f) {
12193 f->flush(rdata);
12194 } else {
12195 rdata.append(ss);
12196 }
12197
12198 if (err == EEXIST) {
12199 // idempotent operation
12200 err = 0;
12201 goto reply;
12202 }
12203
12204 wait_for_finished_proposal(op,
12205 new Monitor::C_Command(mon, op, 0, rs, rdata,
12206 get_last_committed() + 1));
12207 force_immediate_propose();
12208 return true;
12209
12210 } else if (prefix == "osd create") {
12211
12212 // optional id provided?
12213 int64_t id = -1, cmd_id = -1;
12214 if (cmd_getval(cmdmap, "id", cmd_id)) {
12215 if (cmd_id < 0) {
12216 ss << "invalid osd id value '" << cmd_id << "'";
12217 err = -EINVAL;
12218 goto reply;
12219 }
12220 dout(10) << " osd create got id " << cmd_id << dendl;
12221 }
12222
12223 uuid_d uuid;
12224 string uuidstr;
12225 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12226 if (!uuid.parse(uuidstr.c_str())) {
12227 ss << "invalid uuid value '" << uuidstr << "'";
12228 err = -EINVAL;
12229 goto reply;
12230 }
12231 // we only care about the id if we also have the uuid, to
12232 // ensure the operation's idempotency.
12233 id = cmd_id;
12234 }
12235
12236 int32_t new_id = -1;
12237 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12238 if (err < 0) {
12239 if (err == -EAGAIN) {
12240 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12241 return true;
12242 }
12243 // a check has failed; reply to the user.
12244 goto reply;
12245
12246 } else if (err == EEXIST) {
12247 // this is an idempotent operation; we can go ahead and reply.
12248 if (f) {
12249 f->open_object_section("created_osd");
12250 f->dump_int("osdid", new_id);
12251 f->close_section();
12252 f->flush(rdata);
12253 } else {
12254 ss << new_id;
12255 rdata.append(ss);
12256 }
12257 err = 0;
12258 goto reply;
12259 }
12260
12261 string empty_device_class;
12262 do_osd_create(id, uuid, empty_device_class, &new_id);
12263
12264 if (f) {
12265 f->open_object_section("created_osd");
12266 f->dump_int("osdid", new_id);
12267 f->close_section();
12268 f->flush(rdata);
12269 } else {
12270 ss << new_id;
12271 rdata.append(ss);
12272 }
12273 wait_for_finished_proposal(op,
12274 new Monitor::C_Command(mon, op, 0, rs, rdata,
12275 get_last_committed() + 1));
12276 return true;
12277
12278 } else if (prefix == "osd blacklist clear") {
12279 pending_inc.new_blacklist.clear();
12280 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12281 osdmap.get_blacklist(&blacklist);
12282 for (const auto &entry : blacklist) {
12283 pending_inc.old_blacklist.push_back(entry.first);
12284 }
12285 ss << " removed all blacklist entries";
12286 getline(ss, rs);
12287 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12288 get_last_committed() + 1));
12289 return true;
12290 } else if (prefix == "osd blacklist") {
12291 string addrstr;
12292 cmd_getval(cmdmap, "addr", addrstr);
12293 entity_addr_t addr;
12294 if (!addr.parse(addrstr.c_str(), 0)) {
12295 ss << "unable to parse address " << addrstr;
12296 err = -EINVAL;
12297 goto reply;
12298 }
12299 else {
12300 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12301 // always blacklist type ANY
12302 addr.set_type(entity_addr_t::TYPE_ANY);
12303 } else {
12304 addr.set_type(entity_addr_t::TYPE_LEGACY);
12305 }
12306
12307 string blacklistop;
12308 cmd_getval(cmdmap, "blacklistop", blacklistop);
12309 if (blacklistop == "add") {
12310 utime_t expires = ceph_clock_now();
12311 double d;
12312 // default one hour
12313 cmd_getval(cmdmap, "expire", d,
12314 g_conf()->mon_osd_blacklist_default_expire);
12315 expires += d;
12316
12317 pending_inc.new_blacklist[addr] = expires;
12318
12319 {
12320 // cancel any pending un-blacklisting request too
12321 auto it = std::find(pending_inc.old_blacklist.begin(),
12322 pending_inc.old_blacklist.end(), addr);
12323 if (it != pending_inc.old_blacklist.end()) {
12324 pending_inc.old_blacklist.erase(it);
12325 }
12326 }
12327
12328 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12329 getline(ss, rs);
12330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12331 get_last_committed() + 1));
12332 return true;
12333 } else if (blacklistop == "rm") {
12334 if (osdmap.is_blacklisted(addr) ||
12335 pending_inc.new_blacklist.count(addr)) {
12336 if (osdmap.is_blacklisted(addr))
12337 pending_inc.old_blacklist.push_back(addr);
12338 else
12339 pending_inc.new_blacklist.erase(addr);
12340 ss << "un-blacklisting " << addr;
12341 getline(ss, rs);
12342 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12343 get_last_committed() + 1));
12344 return true;
12345 }
12346 ss << addr << " isn't blacklisted";
12347 err = 0;
12348 goto reply;
12349 }
12350 }
12351 } else if (prefix == "osd pool mksnap") {
12352 string poolstr;
12353 cmd_getval(cmdmap, "pool", poolstr);
12354 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12355 if (pool < 0) {
12356 ss << "unrecognized pool '" << poolstr << "'";
12357 err = -ENOENT;
12358 goto reply;
12359 }
12360 string snapname;
12361 cmd_getval(cmdmap, "snap", snapname);
12362 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12363 if (p->is_unmanaged_snaps_mode()) {
12364 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12365 err = -EINVAL;
12366 goto reply;
12367 } else if (p->snap_exists(snapname.c_str())) {
12368 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12369 err = 0;
12370 goto reply;
12371 } else if (p->is_tier()) {
12372 ss << "pool " << poolstr << " is a cache tier";
12373 err = -EINVAL;
12374 goto reply;
12375 }
12376 pg_pool_t *pp = 0;
12377 if (pending_inc.new_pools.count(pool))
12378 pp = &pending_inc.new_pools[pool];
12379 if (!pp) {
12380 pp = &pending_inc.new_pools[pool];
12381 *pp = *p;
12382 }
12383 if (pp->snap_exists(snapname.c_str())) {
12384 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12385 } else {
12386 pp->add_snap(snapname.c_str(), ceph_clock_now());
12387 pp->set_snap_epoch(pending_inc.epoch);
12388 ss << "created pool " << poolstr << " snap " << snapname;
12389 }
12390 getline(ss, rs);
12391 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12392 get_last_committed() + 1));
12393 return true;
12394 } else if (prefix == "osd pool rmsnap") {
12395 string poolstr;
12396 cmd_getval(cmdmap, "pool", poolstr);
12397 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12398 if (pool < 0) {
12399 ss << "unrecognized pool '" << poolstr << "'";
12400 err = -ENOENT;
12401 goto reply;
12402 }
12403 string snapname;
12404 cmd_getval(cmdmap, "snap", snapname);
12405 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12406 if (p->is_unmanaged_snaps_mode()) {
12407 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12408 err = -EINVAL;
12409 goto reply;
12410 } else if (!p->snap_exists(snapname.c_str())) {
12411 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12412 err = 0;
12413 goto reply;
12414 }
12415 pg_pool_t *pp = 0;
12416 if (pending_inc.new_pools.count(pool))
12417 pp = &pending_inc.new_pools[pool];
12418 if (!pp) {
12419 pp = &pending_inc.new_pools[pool];
12420 *pp = *p;
12421 }
12422 snapid_t sn = pp->snap_exists(snapname.c_str());
12423 if (sn) {
12424 pp->remove_snap(sn);
12425 pp->set_snap_epoch(pending_inc.epoch);
12426 ss << "removed pool " << poolstr << " snap " << snapname;
12427 } else {
12428 ss << "already removed pool " << poolstr << " snap " << snapname;
12429 }
12430 getline(ss, rs);
12431 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12432 get_last_committed() + 1));
12433 return true;
12434 } else if (prefix == "osd pool create") {
12435 int64_t pg_num, pg_num_min;
12436 int64_t pgp_num;
12437 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12438 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12439 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12440
12441 string pool_type_str;
12442 cmd_getval(cmdmap, "pool_type", pool_type_str);
12443 if (pool_type_str.empty())
12444 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12445
12446 string poolstr;
12447 cmd_getval(cmdmap, "pool", poolstr);
12448 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12449 if (pool_id >= 0) {
12450 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12451 if (pool_type_str != p->get_type_name()) {
12452 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12453 err = -EINVAL;
12454 } else {
12455 ss << "pool '" << poolstr << "' already exists";
12456 err = 0;
12457 }
12458 goto reply;
12459 }
12460
12461 int pool_type;
12462 if (pool_type_str == "replicated") {
12463 pool_type = pg_pool_t::TYPE_REPLICATED;
12464 } else if (pool_type_str == "erasure") {
12465 pool_type = pg_pool_t::TYPE_ERASURE;
12466 } else {
12467 ss << "unknown pool type '" << pool_type_str << "'";
12468 err = -EINVAL;
12469 goto reply;
12470 }
12471
12472 bool implicit_rule_creation = false;
12473 int64_t expected_num_objects = 0;
12474 string rule_name;
12475 cmd_getval(cmdmap, "rule", rule_name);
12476 string erasure_code_profile;
12477 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12478
12479 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12480 if (erasure_code_profile == "")
12481 erasure_code_profile = "default";
12482 //handle the erasure code profile
12483 if (erasure_code_profile == "default") {
12484 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12485 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12486 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12487 goto wait;
12488 }
12489
12490 map<string,string> profile_map;
12491 err = osdmap.get_erasure_code_profile_default(cct,
12492 profile_map,
12493 &ss);
12494 if (err)
12495 goto reply;
12496 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12497 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12498 goto wait;
12499 }
12500 }
12501 if (rule_name == "") {
12502 implicit_rule_creation = true;
12503 if (erasure_code_profile == "default") {
12504 rule_name = "erasure-code";
12505 } else {
12506 dout(1) << "implicitly use rule named after the pool: "
12507 << poolstr << dendl;
12508 rule_name = poolstr;
12509 }
12510 }
12511 cmd_getval(cmdmap, "expected_num_objects",
12512 expected_num_objects, int64_t(0));
12513 } else {
12514 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12515 // and put expected_num_objects to rule field
12516 if (erasure_code_profile != "") { // cmd is from CLI
12517 if (rule_name != "") {
12518 string interr;
12519 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12520 if (interr.length()) {
12521 ss << "error parsing integer value '" << rule_name << "': " << interr;
12522 err = -EINVAL;
12523 goto reply;
12524 }
12525 }
12526 rule_name = erasure_code_profile;
12527 } else { // cmd is well-formed
12528 cmd_getval(cmdmap, "expected_num_objects",
12529 expected_num_objects, int64_t(0));
12530 }
12531 }
12532
12533 if (!implicit_rule_creation && rule_name != "") {
12534 int rule;
12535 err = get_crush_rule(rule_name, &rule, &ss);
12536 if (err == -EAGAIN) {
12537 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12538 return true;
12539 }
12540 if (err)
12541 goto reply;
12542 }
12543
12544 if (expected_num_objects < 0) {
12545 ss << "'expected_num_objects' must be non-negative";
12546 err = -EINVAL;
12547 goto reply;
12548 }
12549
12550 if (expected_num_objects > 0 &&
12551 cct->_conf->osd_objectstore == "filestore" &&
12552 cct->_conf->filestore_merge_threshold > 0) {
12553 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12554 err = -EINVAL;
12555 goto reply;
12556 }
12557
12558 if (expected_num_objects == 0 &&
12559 cct->_conf->osd_objectstore == "filestore" &&
12560 cct->_conf->filestore_merge_threshold < 0) {
12561 int osds = osdmap.get_num_osds();
12562 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12563 ss << "For better initial performance on pools expected to store a "
12564 << "large number of objects, consider supplying the "
12565 << "expected_num_objects parameter when creating the pool.\n";
12566 }
12567 }
12568
12569 int64_t fast_read_param;
12570 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12571 FastReadType fast_read = FAST_READ_DEFAULT;
12572 if (fast_read_param == 0)
12573 fast_read = FAST_READ_OFF;
12574 else if (fast_read_param > 0)
12575 fast_read = FAST_READ_ON;
12576
12577 int64_t repl_size = 0;
12578 cmd_getval(cmdmap, "size", repl_size);
12579 int64_t target_size_bytes = 0;
12580 double target_size_ratio = 0.0;
12581 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12582 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12583
12584 string pg_autoscale_mode;
12585 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12586
12587 err = prepare_new_pool(poolstr,
12588 -1, // default crush rule
12589 rule_name,
12590 pg_num, pgp_num, pg_num_min,
12591 repl_size, target_size_bytes, target_size_ratio,
12592 erasure_code_profile, pool_type,
12593 (uint64_t)expected_num_objects,
12594 fast_read,
12595 pg_autoscale_mode,
12596 &ss);
12597 if (err < 0) {
12598 switch(err) {
12599 case -EEXIST:
12600 ss << "pool '" << poolstr << "' already exists";
12601 break;
12602 case -EAGAIN:
12603 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12604 return true;
12605 case -ERANGE:
12606 goto reply;
12607 default:
12608 goto reply;
12609 break;
12610 }
12611 } else {
12612 ss << "pool '" << poolstr << "' created";
12613 }
12614 getline(ss, rs);
12615 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12616 get_last_committed() + 1));
12617 return true;
12618
12619 } else if (prefix == "osd pool delete" ||
12620 prefix == "osd pool rm") {
12621 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12622 string poolstr, poolstr2, sure;
12623 cmd_getval(cmdmap, "pool", poolstr);
12624 cmd_getval(cmdmap, "pool2", poolstr2);
12625 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12626 if (pool < 0) {
12627 ss << "pool '" << poolstr << "' does not exist";
12628 err = 0;
12629 goto reply;
12630 }
12631
12632 bool force_no_fake = false;
12633 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12634 bool force = false;
12635 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12636 if (poolstr2 != poolstr ||
12637 (!force && !force_no_fake)) {
12638 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12639 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12640 << "followed by --yes-i-really-really-mean-it.";
12641 err = -EPERM;
12642 goto reply;
12643 }
12644 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12645 if (err == -EAGAIN) {
12646 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12647 return true;
12648 }
12649 if (err < 0)
12650 goto reply;
12651 goto update;
12652 } else if (prefix == "osd pool rename") {
12653 string srcpoolstr, destpoolstr;
12654 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12655 cmd_getval(cmdmap, "destpool", destpoolstr);
12656 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12657 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12658
12659 if (pool_src < 0) {
12660 if (pool_dst >= 0) {
12661 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12662 // of operations, assume this rename succeeded, as it is not changing
12663 // the current state. Make sure we output something understandable
12664 // for whoever is issuing the command, if they are paying attention,
12665 // in case it was not intentional; or to avoid a "wtf?" and a bug
12666 // report in case it was intentional, while expecting a failure.
12667 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12668 << destpoolstr << "' does -- assuming successful rename";
12669 err = 0;
12670 } else {
12671 ss << "unrecognized pool '" << srcpoolstr << "'";
12672 err = -ENOENT;
12673 }
12674 goto reply;
12675 } else if (pool_dst >= 0) {
12676 // source pool exists and so does the destination pool
12677 ss << "pool '" << destpoolstr << "' already exists";
12678 err = -EEXIST;
12679 goto reply;
12680 }
12681
12682 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12683 if (ret == 0) {
12684 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12685 } else {
12686 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12687 << cpp_strerror(ret);
12688 }
12689 getline(ss, rs);
12690 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12691 get_last_committed() + 1));
12692 return true;
12693
12694 } else if (prefix == "osd pool set") {
12695 err = prepare_command_pool_set(cmdmap, ss);
12696 if (err == -EAGAIN)
12697 goto wait;
12698 if (err < 0)
12699 goto reply;
12700
12701 getline(ss, rs);
12702 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12703 get_last_committed() + 1));
12704 return true;
12705 } else if (prefix == "osd tier add") {
12706 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12707 if (err == -EAGAIN)
12708 goto wait;
12709 if (err)
12710 goto reply;
12711 string poolstr;
12712 cmd_getval(cmdmap, "pool", poolstr);
12713 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12714 if (pool_id < 0) {
12715 ss << "unrecognized pool '" << poolstr << "'";
12716 err = -ENOENT;
12717 goto reply;
12718 }
12719 string tierpoolstr;
12720 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12721 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12722 if (tierpool_id < 0) {
12723 ss << "unrecognized pool '" << tierpoolstr << "'";
12724 err = -ENOENT;
12725 goto reply;
12726 }
12727 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12728 ceph_assert(p);
12729 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12730 ceph_assert(tp);
12731
12732 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12733 goto reply;
12734 }
12735
12736 // make sure new tier is empty
12737 string force_nonempty;
12738 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12739 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12740 if (pstats && pstats->stats.sum.num_objects != 0 &&
12741 force_nonempty != "--force-nonempty") {
12742 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12743 err = -ENOTEMPTY;
12744 goto reply;
12745 }
12746 if (tp->is_erasure()) {
12747 ss << "tier pool '" << tierpoolstr
12748 << "' is an ec pool, which cannot be a tier";
12749 err = -ENOTSUP;
12750 goto reply;
12751 }
12752 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12753 ((force_nonempty != "--force-nonempty") ||
12754 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12755 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12756 err = -ENOTEMPTY;
12757 goto reply;
12758 }
12759 // go
12760 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12761 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12762 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12763 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12764 return true;
12765 }
12766 np->tiers.insert(tierpool_id);
12767 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12768 ntp->tier_of = pool_id;
12769 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12770 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12771 get_last_committed() + 1));
12772 return true;
12773 } else if (prefix == "osd tier remove" ||
12774 prefix == "osd tier rm") {
12775 string poolstr;
12776 cmd_getval(cmdmap, "pool", poolstr);
12777 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12778 if (pool_id < 0) {
12779 ss << "unrecognized pool '" << poolstr << "'";
12780 err = -ENOENT;
12781 goto reply;
12782 }
12783 string tierpoolstr;
12784 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12785 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12786 if (tierpool_id < 0) {
12787 ss << "unrecognized pool '" << tierpoolstr << "'";
12788 err = -ENOENT;
12789 goto reply;
12790 }
12791 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12792 ceph_assert(p);
12793 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12794 ceph_assert(tp);
12795
12796 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12797 goto reply;
12798 }
12799
12800 if (p->tiers.count(tierpool_id) == 0) {
12801 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12802 err = 0;
12803 goto reply;
12804 }
12805 if (tp->tier_of != pool_id) {
12806 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12807 << osdmap.get_pool_name(tp->tier_of) << "': "
12808 // be scary about it; this is an inconsistency and bells must go off
12809 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12810 err = -EINVAL;
12811 goto reply;
12812 }
12813 if (p->read_tier == tierpool_id) {
12814 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12815 err = -EBUSY;
12816 goto reply;
12817 }
12818 // go
12819 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12820 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12821 if (np->tiers.count(tierpool_id) == 0 ||
12822 ntp->tier_of != pool_id ||
12823 np->read_tier == tierpool_id) {
12824 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12825 return true;
12826 }
12827 np->tiers.erase(tierpool_id);
12828 ntp->clear_tier();
12829 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12830 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12831 get_last_committed() + 1));
12832 return true;
12833 } else if (prefix == "osd tier set-overlay") {
12834 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12835 if (err == -EAGAIN)
12836 goto wait;
12837 if (err)
12838 goto reply;
12839 string poolstr;
12840 cmd_getval(cmdmap, "pool", poolstr);
12841 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12842 if (pool_id < 0) {
12843 ss << "unrecognized pool '" << poolstr << "'";
12844 err = -ENOENT;
12845 goto reply;
12846 }
12847 string overlaypoolstr;
12848 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12849 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12850 if (overlaypool_id < 0) {
12851 ss << "unrecognized pool '" << overlaypoolstr << "'";
12852 err = -ENOENT;
12853 goto reply;
12854 }
12855 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12856 ceph_assert(p);
12857 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12858 ceph_assert(overlay_p);
12859 if (p->tiers.count(overlaypool_id) == 0) {
12860 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12861 err = -EINVAL;
12862 goto reply;
12863 }
12864 if (p->read_tier == overlaypool_id) {
12865 err = 0;
12866 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12867 goto reply;
12868 }
12869 if (p->has_read_tier()) {
12870 ss << "pool '" << poolstr << "' has overlay '"
12871 << osdmap.get_pool_name(p->read_tier)
12872 << "'; please remove-overlay first";
12873 err = -EINVAL;
12874 goto reply;
12875 }
12876
12877 // go
12878 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12879 np->read_tier = overlaypool_id;
12880 np->write_tier = overlaypool_id;
12881 np->set_last_force_op_resend(pending_inc.epoch);
12882 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12883 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12884 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12885 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12886 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12887 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12888 get_last_committed() + 1));
12889 return true;
12890 } else if (prefix == "osd tier remove-overlay" ||
12891 prefix == "osd tier rm-overlay") {
12892 string poolstr;
12893 cmd_getval(cmdmap, "pool", poolstr);
12894 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12895 if (pool_id < 0) {
12896 ss << "unrecognized pool '" << poolstr << "'";
12897 err = -ENOENT;
12898 goto reply;
12899 }
12900 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12901 ceph_assert(p);
12902 if (!p->has_read_tier()) {
12903 err = 0;
12904 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12905 goto reply;
12906 }
12907
12908 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12909 goto reply;
12910 }
12911
12912 // go
12913 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12914 if (np->has_read_tier()) {
12915 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12916 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12917 nop->set_last_force_op_resend(pending_inc.epoch);
12918 }
12919 if (np->has_write_tier()) {
12920 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12921 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12922 nop->set_last_force_op_resend(pending_inc.epoch);
12923 }
12924 np->clear_read_tier();
12925 np->clear_write_tier();
12926 np->set_last_force_op_resend(pending_inc.epoch);
12927 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12928 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12929 get_last_committed() + 1));
12930 return true;
12931 } else if (prefix == "osd tier cache-mode") {
12932 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12933 if (err == -EAGAIN)
12934 goto wait;
12935 if (err)
12936 goto reply;
12937 string poolstr;
12938 cmd_getval(cmdmap, "pool", poolstr);
12939 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12940 if (pool_id < 0) {
12941 ss << "unrecognized pool '" << poolstr << "'";
12942 err = -ENOENT;
12943 goto reply;
12944 }
12945 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12946 ceph_assert(p);
12947 if (!p->is_tier()) {
12948 ss << "pool '" << poolstr << "' is not a tier";
12949 err = -EINVAL;
12950 goto reply;
12951 }
12952 string modestr;
12953 cmd_getval(cmdmap, "mode", modestr);
12954 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12955 if (int(mode) < 0) {
12956 ss << "'" << modestr << "' is not a valid cache mode";
12957 err = -EINVAL;
12958 goto reply;
12959 }
12960
12961 bool sure = false;
12962 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12963
12964 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
12965 mode == pg_pool_t::CACHEMODE_READFORWARD) {
12966 ss << "'" << modestr << "' is no longer a supported cache mode";
12967 err = -EPERM;
12968 goto reply;
12969 }
12970 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12971 mode != pg_pool_t::CACHEMODE_NONE &&
12972 mode != pg_pool_t::CACHEMODE_PROXY &&
12973 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12974 !sure) {
12975 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12976 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12977 err = -EPERM;
12978 goto reply;
12979 }
12980
12981 // pool already has this cache-mode set and there are no pending changes
12982 if (p->cache_mode == mode &&
12983 (pending_inc.new_pools.count(pool_id) == 0 ||
12984 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12985 ss << "set cache-mode for pool '" << poolstr << "'"
12986 << " to " << pg_pool_t::get_cache_mode_name(mode);
12987 err = 0;
12988 goto reply;
12989 }
12990
12991 /* Mode description:
12992 *
12993 * none: No cache-mode defined
12994 * forward: Forward all reads and writes to base pool [removed]
12995 * writeback: Cache writes, promote reads from base pool
12996 * readonly: Forward writes to base pool
12997 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
12998 * proxy: Proxy all reads and writes to base pool
12999 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13000 *
13001 * Hence, these are the allowed transitions:
13002 *
13003 * none -> any
13004 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13005 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13006 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13007 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13008 * writeback -> readproxy || proxy
13009 * readonly -> any
13010 */
13011
13012 // We check if the transition is valid against the current pool mode, as
13013 // it is the only committed state thus far. We will blantly squash
13014 // whatever mode is on the pending state.
13015
13016 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13017 (mode != pg_pool_t::CACHEMODE_PROXY &&
13018 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13019 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13020 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13021 << "' pool; only '"
13022 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13023 << "','"
13024 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13025 << "' allowed.";
13026 err = -EINVAL;
13027 goto reply;
13028 }
13029 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13030 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13031 mode != pg_pool_t::CACHEMODE_PROXY &&
13032 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13033
13034 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13035 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13036 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13037
13038 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13039 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13040 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13041
13042 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13043 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13044 mode != pg_pool_t::CACHEMODE_PROXY &&
13045 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13046
13047 const pool_stat_t* pstats =
13048 mon->mgrstatmon()->get_pool_stat(pool_id);
13049
13050 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13051 ss << "unable to set cache-mode '"
13052 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13053 << "': dirty objects found";
13054 err = -EBUSY;
13055 goto reply;
13056 }
13057 }
13058 // go
13059 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13060 np->cache_mode = mode;
13061 // set this both when moving to and from cache_mode NONE. this is to
13062 // capture legacy pools that were set up before this flag existed.
13063 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13064 ss << "set cache-mode for pool '" << poolstr
13065 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13066 if (mode == pg_pool_t::CACHEMODE_NONE) {
13067 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13068 ceph_assert(base_pool);
13069 if (base_pool->read_tier == pool_id ||
13070 base_pool->write_tier == pool_id)
13071 ss <<" (WARNING: pool is still configured as read or write tier)";
13072 }
13073 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13074 get_last_committed() + 1));
13075 return true;
13076 } else if (prefix == "osd tier add-cache") {
13077 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13078 if (err == -EAGAIN)
13079 goto wait;
13080 if (err)
13081 goto reply;
13082 string poolstr;
13083 cmd_getval(cmdmap, "pool", poolstr);
13084 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13085 if (pool_id < 0) {
13086 ss << "unrecognized pool '" << poolstr << "'";
13087 err = -ENOENT;
13088 goto reply;
13089 }
13090 string tierpoolstr;
13091 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13092 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13093 if (tierpool_id < 0) {
13094 ss << "unrecognized pool '" << tierpoolstr << "'";
13095 err = -ENOENT;
13096 goto reply;
13097 }
13098 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13099 ceph_assert(p);
13100 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13101 ceph_assert(tp);
13102
13103 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13104 goto reply;
13105 }
13106
13107 int64_t size = 0;
13108 if (!cmd_getval(cmdmap, "size", size)) {
13109 ss << "unable to parse 'size' value '"
13110 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13111 err = -EINVAL;
13112 goto reply;
13113 }
13114 // make sure new tier is empty
13115 const pool_stat_t *pstats =
13116 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13117 if (pstats && pstats->stats.sum.num_objects != 0) {
13118 ss << "tier pool '" << tierpoolstr << "' is not empty";
13119 err = -ENOTEMPTY;
13120 goto reply;
13121 }
13122 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13123 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13124 if (int(mode) < 0) {
13125 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13126 err = -EINVAL;
13127 goto reply;
13128 }
13129 HitSet::Params hsp;
13130 auto& cache_hit_set_type =
13131 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13132 if (cache_hit_set_type == "bloom") {
13133 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13134 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13135 hsp = HitSet::Params(bsp);
13136 } else if (cache_hit_set_type == "explicit_hash") {
13137 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13138 } else if (cache_hit_set_type == "explicit_object") {
13139 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13140 } else {
13141 ss << "osd tier cache default hit set type '"
13142 << cache_hit_set_type << "' is not a known type";
13143 err = -EINVAL;
13144 goto reply;
13145 }
13146 // go
13147 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13148 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13149 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13150 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13151 return true;
13152 }
13153 np->tiers.insert(tierpool_id);
13154 np->read_tier = np->write_tier = tierpool_id;
13155 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13156 np->set_last_force_op_resend(pending_inc.epoch);
13157 ntp->set_last_force_op_resend(pending_inc.epoch);
13158 ntp->tier_of = pool_id;
13159 ntp->cache_mode = mode;
13160 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13161 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13162 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13163 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13164 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13165 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13166 ntp->hit_set_params = hsp;
13167 ntp->target_max_bytes = size;
13168 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13169 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13170 get_last_committed() + 1));
13171 return true;
13172 } else if (prefix == "osd pool set-quota") {
13173 string poolstr;
13174 cmd_getval(cmdmap, "pool", poolstr);
13175 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13176 if (pool_id < 0) {
13177 ss << "unrecognized pool '" << poolstr << "'";
13178 err = -ENOENT;
13179 goto reply;
13180 }
13181
13182 string field;
13183 cmd_getval(cmdmap, "field", field);
13184 if (field != "max_objects" && field != "max_bytes") {
13185 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13186 err = -EINVAL;
13187 goto reply;
13188 }
13189
13190 // val could contain unit designations, so we treat as a string
13191 string val;
13192 cmd_getval(cmdmap, "val", val);
13193 string tss;
13194 int64_t value;
13195 if (field == "max_objects") {
13196 value = strict_sistrtoll(val.c_str(), &tss);
13197 } else if (field == "max_bytes") {
13198 value = strict_iecstrtoll(val.c_str(), &tss);
13199 } else {
13200 ceph_abort_msg("unrecognized option");
13201 }
13202 if (!tss.empty()) {
13203 ss << "error parsing value '" << val << "': " << tss;
13204 err = -EINVAL;
13205 goto reply;
13206 }
13207
13208 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13209 if (field == "max_objects") {
13210 pi->quota_max_objects = value;
13211 } else if (field == "max_bytes") {
13212 pi->quota_max_bytes = value;
13213 } else {
13214 ceph_abort_msg("unrecognized option");
13215 }
13216 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13217 rs = ss.str();
13218 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13219 get_last_committed() + 1));
13220 return true;
13221 } else if (prefix == "osd pool application enable" ||
13222 prefix == "osd pool application disable" ||
13223 prefix == "osd pool application set" ||
13224 prefix == "osd pool application rm") {
13225 err = prepare_command_pool_application(prefix, cmdmap, ss);
13226 if (err == -EAGAIN) {
13227 goto wait;
13228 } else if (err < 0) {
13229 goto reply;
13230 } else {
13231 goto update;
13232 }
13233 } else if (prefix == "osd force-create-pg") {
13234 pg_t pgid;
13235 string pgidstr;
13236 cmd_getval(cmdmap, "pgid", pgidstr);
13237 if (!pgid.parse(pgidstr.c_str())) {
13238 ss << "invalid pgid '" << pgidstr << "'";
13239 err = -EINVAL;
13240 goto reply;
13241 }
13242 if (!osdmap.pg_exists(pgid)) {
13243 ss << "pg " << pgid << " should not exist";
13244 err = -ENOENT;
13245 goto reply;
13246 }
13247 bool sure = false;
13248 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13249 if (!sure) {
13250 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13251 << "that the cluster will give up ever trying to recover the lost data. Do this "
13252 << "only if you are certain that all copies of the PG are in fact lost and you are "
13253 << "willing to accept that the data is permanently destroyed. Pass "
13254 << "--yes-i-really-mean-it to proceed.";
13255 err = -EPERM;
13256 goto reply;
13257 }
13258 bool creating_now;
13259 {
13260 std::lock_guard<std::mutex> l(creating_pgs_lock);
13261 auto emplaced = creating_pgs.pgs.emplace(
13262 pgid,
13263 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13264 ceph_clock_now()));
13265 creating_now = emplaced.second;
13266 }
13267 if (creating_now) {
13268 ss << "pg " << pgidstr << " now creating, ok";
13269 // set the pool's CREATING flag so that (1) the osd won't ignore our
13270 // create message and (2) we won't propose any future pg_num changes
13271 // until after the PG has been instantiated.
13272 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13273 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13274 }
13275 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13276 err = 0;
13277 goto update;
13278 } else {
13279 ss << "pg " << pgid << " already creating";
13280 err = 0;
13281 goto reply;
13282 }
13283 } else {
13284 err = -EINVAL;
13285 }
13286
13287 reply:
13288 getline(ss, rs);
13289 if (err < 0 && rs.length() == 0)
13290 rs = cpp_strerror(err);
13291 mon->reply_command(op, err, rs, rdata, get_last_committed());
13292 return ret;
13293
13294 update:
13295 getline(ss, rs);
13296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13297 get_last_committed() + 1));
13298 return true;
13299
13300 wait:
13301 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13302 return true;
13303 }
13304
13305 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13306 {
13307 op->mark_osdmon_event(__func__);
13308
13309 auto m = op->get_req<MPoolOp>();
13310 MonSession *session = op->get_session();
13311 if (!session) {
13312 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13313 return true;
13314 }
13315
13316 switch (m->op) {
13317 case POOL_OP_CREATE_UNMANAGED_SNAP:
13318 case POOL_OP_DELETE_UNMANAGED_SNAP:
13319 {
13320 const std::string* pool_name = nullptr;
13321 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13322 if (pg_pool != nullptr) {
13323 pool_name = &osdmap.get_pool_name(m->pool);
13324 }
13325
13326 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13327 session->entity_name, session->caps,
13328 session->get_peer_socket_addr(),
13329 pool_name)) {
13330 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13331 << "privileges. message: " << *m << std::endl
13332 << "caps: " << session->caps << dendl;
13333 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13334 return true;
13335 }
13336 }
13337 break;
13338 default:
13339 if (!session->is_capable("osd", MON_CAP_W)) {
13340 dout(0) << "got pool op from entity with insufficient privileges. "
13341 << "message: " << *m << std::endl
13342 << "caps: " << session->caps << dendl;
13343 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13344 return true;
13345 }
13346 break;
13347 }
13348
13349 return false;
13350 }
13351
// Fast path for MPoolOp requests: answer everything that does not need a
// map change (foreign fsid, idempotent re-requests, invalid mode
// combinations) directly from the committed osdmap.  Returns true if the
// op was fully handled (a reply was sent); returns false to let the op
// proceed to prepare_pool_op() and a paxos proposal.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // capability check; when it returns true it has already replied -EPERM
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages that belong to a different cluster
  if (m->fsid != mon->monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon->monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // deleting a pool that is already gone is a success (idempotent)
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps cannot coexist with unmanaged-snaps mode, nor on a tier
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // snap already present: idempotent success, no proposal needed
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // unmanaged snaps cannot coexist with pool-snaps mode
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // snap already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    // already removed/purged in the committed map: idempotent success
    if (_is_removed_snap(m->pool, m->snapid)) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with name m->name
    // still exists; presumably m->name is empty on the normal delete
    // path so the branch is not taken -- confirm against the sender.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // handled (rejected) in prepare_pool_op
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13439
13440 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13441 {
13442 if (!osdmap.have_pg_pool(pool)) {
13443 dout(10) << __func__ << " pool " << pool << " snap " << snap
13444 << " - pool dne" << dendl;
13445 return true;
13446 }
13447 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13448 dout(10) << __func__ << " pool " << pool << " snap " << snap
13449 << " - in osdmap removed_snaps_queue" << dendl;
13450 return true;
13451 }
13452 snapid_t begin, end;
13453 int r = lookup_purged_snap(pool, snap, &begin, &end);
13454 if (r == 0) {
13455 dout(10) << __func__ << " pool " << pool << " snap " << snap
13456 << " - purged, [" << begin << "," << end << ")" << dendl;
13457 return true;
13458 }
13459 return false;
13460 }
13461
13462 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13463 {
13464 if (pending_inc.old_pools.count(pool)) {
13465 dout(10) << __func__ << " pool " << pool << " snap " << snap
13466 << " - pool pending deletion" << dendl;
13467 return true;
13468 }
13469 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13470 dout(10) << __func__ << " pool " << pool << " snap " << snap
13471 << " - in pending new_removed_snaps" << dendl;
13472 return true;
13473 }
13474 return false;
13475 }
13476
13477 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13478 {
13479 op->mark_osdmon_event(__func__);
13480 auto m = op->get_req<MPoolOp>();
13481 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13482 if (pool >= 0) {
13483 _pool_op_reply(op, 0, osdmap.get_epoch());
13484 return true;
13485 }
13486
13487 return false;
13488 }
13489
13490 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
13491 {
13492 op->mark_osdmon_event(__func__);
13493 auto m = op->get_req<MPoolOp>();
13494 dout(10) << "prepare_pool_op " << *m << dendl;
13495 if (m->op == POOL_OP_CREATE) {
13496 return prepare_pool_op_create(op);
13497 } else if (m->op == POOL_OP_DELETE) {
13498 return prepare_pool_op_delete(op);
13499 }
13500
13501 int ret = 0;
13502 bool changed = false;
13503
13504 if (!osdmap.have_pg_pool(m->pool)) {
13505 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13506 return false;
13507 }
13508
13509 const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);
13510
13511 switch (m->op) {
13512 case POOL_OP_CREATE_SNAP:
13513 if (pool->is_tier()) {
13514 ret = -EINVAL;
13515 _pool_op_reply(op, ret, osdmap.get_epoch());
13516 return false;
13517 } // else, fall through
13518 case POOL_OP_DELETE_SNAP:
13519 if (!pool->is_unmanaged_snaps_mode()) {
13520 bool snap_exists = pool->snap_exists(m->name.c_str());
13521 if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
13522 || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
13523 ret = 0;
13524 } else {
13525 break;
13526 }
13527 } else {
13528 ret = -EINVAL;
13529 }
13530 _pool_op_reply(op, ret, osdmap.get_epoch());
13531 return false;
13532
13533 case POOL_OP_DELETE_UNMANAGED_SNAP:
13534 // we won't allow removal of an unmanaged snapshot from a pool
13535 // not in unmanaged snaps mode.
13536 if (!pool->is_unmanaged_snaps_mode()) {
13537 _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
13538 return false;
13539 }
13540 /* fall-thru */
13541 case POOL_OP_CREATE_UNMANAGED_SNAP:
13542 // but we will allow creating an unmanaged snapshot on any pool
13543 // as long as it is not in 'pool' snaps mode.
13544 if (pool->is_pool_snaps_mode()) {
13545 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13546 return false;
13547 }
13548 }
13549
13550 // projected pool info
13551 pg_pool_t pp;
13552 if (pending_inc.new_pools.count(m->pool))
13553 pp = pending_inc.new_pools[m->pool];
13554 else
13555 pp = *osdmap.get_pg_pool(m->pool);
13556
13557 bufferlist reply_data;
13558
13559 // pool snaps vs unmanaged snaps are mutually exclusive
13560 switch (m->op) {
13561 case POOL_OP_CREATE_SNAP:
13562 case POOL_OP_DELETE_SNAP:
13563 if (pp.is_unmanaged_snaps_mode()) {
13564 ret = -EINVAL;
13565 goto out;
13566 }
13567 break;
13568
13569 case POOL_OP_CREATE_UNMANAGED_SNAP:
13570 case POOL_OP_DELETE_UNMANAGED_SNAP:
13571 if (pp.is_pool_snaps_mode()) {
13572 ret = -EINVAL;
13573 goto out;
13574 }
13575 }
13576
13577 switch (m->op) {
13578 case POOL_OP_CREATE_SNAP:
13579 if (!pp.snap_exists(m->name.c_str())) {
13580 pp.add_snap(m->name.c_str(), ceph_clock_now());
13581 dout(10) << "create snap in pool " << m->pool << " " << m->name
13582 << " seq " << pp.get_snap_epoch() << dendl;
13583 changed = true;
13584 }
13585 break;
13586
13587 case POOL_OP_DELETE_SNAP:
13588 {
13589 snapid_t s = pp.snap_exists(m->name.c_str());
13590 if (s) {
13591 pp.remove_snap(s);
13592 pending_inc.new_removed_snaps[m->pool].insert(s);
13593 changed = true;
13594 }
13595 }
13596 break;
13597
13598 case POOL_OP_CREATE_UNMANAGED_SNAP:
13599 {
13600 uint64_t snapid = pp.add_unmanaged_snap(
13601 osdmap.require_osd_release < ceph_release_t::octopus);
13602 encode(snapid, reply_data);
13603 changed = true;
13604 }
13605 break;
13606
13607 case POOL_OP_DELETE_UNMANAGED_SNAP:
13608 if (!_is_removed_snap(m->pool, m->snapid) &&
13609 !_is_pending_removed_snap(m->pool, m->snapid)) {
13610 if (m->snapid > pp.get_snap_seq()) {
13611 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13612 return false;
13613 }
13614 pp.remove_unmanaged_snap(
13615 m->snapid,
13616 osdmap.require_osd_release < ceph_release_t::octopus);
13617 pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
13618 // also record the new seq as purged: this avoids a discontinuity
13619 // after all of the snaps have been purged, since the seq assigned
13620 // during removal lives in the same namespace as the actual snaps.
13621 pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
13622 changed = true;
13623 }
13624 break;
13625
13626 case POOL_OP_AUID_CHANGE:
13627 _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
13628 return false;
13629
13630 default:
13631 ceph_abort();
13632 break;
13633 }
13634
13635 if (changed) {
13636 pp.set_snap_epoch(pending_inc.epoch);
13637 pending_inc.new_pools[m->pool] = pp;
13638 }
13639
13640 out:
13641 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
13642 return true;
13643 }
13644
13645 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13646 {
13647 op->mark_osdmon_event(__func__);
13648 int err = prepare_new_pool(op);
13649 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13650 return true;
13651 }
13652
13653 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13654 ostream *ss)
13655 {
13656 const string& poolstr = osdmap.get_pool_name(pool_id);
13657
13658 // If the Pool is in use by CephFS, refuse to delete it
13659 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13660 if (pending_fsmap.pool_in_use(pool_id)) {
13661 *ss << "pool '" << poolstr << "' is in use by CephFS";
13662 return -EBUSY;
13663 }
13664
13665 if (pool.tier_of >= 0) {
13666 *ss << "pool '" << poolstr << "' is a tier of '"
13667 << osdmap.get_pool_name(pool.tier_of) << "'";
13668 return -EBUSY;
13669 }
13670 if (!pool.tiers.empty()) {
13671 *ss << "pool '" << poolstr << "' has tiers";
13672 for(auto tier : pool.tiers) {
13673 *ss << " " << osdmap.get_pool_name(tier);
13674 }
13675 return -EBUSY;
13676 }
13677
13678 if (!g_conf()->mon_allow_pool_delete) {
13679 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13680 return -EPERM;
13681 }
13682
13683 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13684 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13685 return -EPERM;
13686 }
13687
13688 *ss << "pool '" << poolstr << "' removed";
13689 return 0;
13690 }
13691
13692 /**
13693 * Check if it is safe to add a tier to a base pool
13694 *
13695 * @return
13696 * True if the operation should proceed, false if we should abort here
13697 * (abort doesn't necessarily mean error, could be idempotency)
13698 */
13699 bool OSDMonitor::_check_become_tier(
13700 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13701 const int64_t base_pool_id, const pg_pool_t *base_pool,
13702 int *err,
13703 ostream *ss) const
13704 {
13705 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13706 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13707
13708 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13709 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13710 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13711 *err = -EBUSY;
13712 return false;
13713 }
13714
13715 if (base_pool->tiers.count(tier_pool_id)) {
13716 ceph_assert(tier_pool->tier_of == base_pool_id);
13717 *err = 0;
13718 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13719 << base_pool_name << "'";
13720 return false;
13721 }
13722
13723 if (base_pool->is_tier()) {
13724 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13725 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13726 << "multiple tiers are not yet supported.";
13727 *err = -EINVAL;
13728 return false;
13729 }
13730
13731 if (tier_pool->has_tiers()) {
13732 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13733 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13734 it != tier_pool->tiers.end(); ++it)
13735 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13736 *ss << " multiple tiers are not yet supported.";
13737 *err = -EINVAL;
13738 return false;
13739 }
13740
13741 if (tier_pool->is_tier()) {
13742 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13743 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13744 *err = -EINVAL;
13745 return false;
13746 }
13747
13748 *err = 0;
13749 return true;
13750 }
13751
13752
13753 /**
13754 * Check if it is safe to remove a tier from this base pool
13755 *
13756 * @return
13757 * True if the operation should proceed, false if we should abort here
13758 * (abort doesn't necessarily mean error, could be idempotency)
13759 */
13760 bool OSDMonitor::_check_remove_tier(
13761 const int64_t base_pool_id, const pg_pool_t *base_pool,
13762 const pg_pool_t *tier_pool,
13763 int *err, ostream *ss) const
13764 {
13765 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13766
13767 // Apply CephFS-specific checks
13768 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13769 if (pending_fsmap.pool_in_use(base_pool_id)) {
13770 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13771 // If the underlying pool is erasure coded and does not allow EC
13772 // overwrites, we can't permit the removal of the replicated tier that
13773 // CephFS relies on to access it
13774 *ss << "pool '" << base_pool_name <<
13775 "' does not allow EC overwrites and is in use by CephFS"
13776 " via its tier";
13777 *err = -EBUSY;
13778 return false;
13779 }
13780
13781 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13782 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13783 "tier is still in use as a writeback cache. Change the cache "
13784 "mode and flush the cache before removing it";
13785 *err = -EBUSY;
13786 return false;
13787 }
13788 }
13789
13790 *err = 0;
13791 return true;
13792 }
13793
13794 int OSDMonitor::_prepare_remove_pool(
13795 int64_t pool, ostream *ss, bool no_fake)
13796 {
13797 dout(10) << __func__ << " " << pool << dendl;
13798 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13799 int r = _check_remove_pool(pool, *p, ss);
13800 if (r < 0)
13801 return r;
13802
13803 auto new_pool = pending_inc.new_pools.find(pool);
13804 if (new_pool != pending_inc.new_pools.end()) {
13805 // if there is a problem with the pending info, wait and retry
13806 // this op.
13807 const auto& p = new_pool->second;
13808 int r = _check_remove_pool(pool, p, ss);
13809 if (r < 0)
13810 return -EAGAIN;
13811 }
13812
13813 if (pending_inc.old_pools.count(pool)) {
13814 dout(10) << __func__ << " " << pool << " already pending removal"
13815 << dendl;
13816 return 0;
13817 }
13818
13819 if (g_conf()->mon_fake_pool_delete && !no_fake) {
13820 string old_name = osdmap.get_pool_name(pool);
13821 string new_name = old_name + "." + stringify(pool) + ".DELETED";
13822 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
13823 << old_name << " -> " << new_name << dendl;
13824 pending_inc.new_pool_names[pool] = new_name;
13825 return 0;
13826 }
13827
13828 // remove
13829 pending_inc.old_pools.insert(pool);
13830
13831 // remove any pg_temp mappings for this pool
13832 for (auto p = osdmap.pg_temp->begin();
13833 p != osdmap.pg_temp->end();
13834 ++p) {
13835 if (p->first.pool() == pool) {
13836 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
13837 << p->first << dendl;
13838 pending_inc.new_pg_temp[p->first].clear();
13839 }
13840 }
13841 // remove any primary_temp mappings for this pool
13842 for (auto p = osdmap.primary_temp->begin();
13843 p != osdmap.primary_temp->end();
13844 ++p) {
13845 if (p->first.pool() == pool) {
13846 dout(10) << __func__ << " " << pool
13847 << " removing obsolete primary_temp" << p->first << dendl;
13848 pending_inc.new_primary_temp[p->first] = -1;
13849 }
13850 }
13851 // remove any pg_upmap mappings for this pool
13852 for (auto& p : osdmap.pg_upmap) {
13853 if (p.first.pool() == pool) {
13854 dout(10) << __func__ << " " << pool
13855 << " removing obsolete pg_upmap "
13856 << p.first << dendl;
13857 pending_inc.old_pg_upmap.insert(p.first);
13858 }
13859 }
13860 // remove any pending pg_upmap mappings for this pool
13861 {
13862 auto it = pending_inc.new_pg_upmap.begin();
13863 while (it != pending_inc.new_pg_upmap.end()) {
13864 if (it->first.pool() == pool) {
13865 dout(10) << __func__ << " " << pool
13866 << " removing pending pg_upmap "
13867 << it->first << dendl;
13868 it = pending_inc.new_pg_upmap.erase(it);
13869 } else {
13870 it++;
13871 }
13872 }
13873 }
13874 // remove any pg_upmap_items mappings for this pool
13875 for (auto& p : osdmap.pg_upmap_items) {
13876 if (p.first.pool() == pool) {
13877 dout(10) << __func__ << " " << pool
13878 << " removing obsolete pg_upmap_items " << p.first
13879 << dendl;
13880 pending_inc.old_pg_upmap_items.insert(p.first);
13881 }
13882 }
13883 // remove any pending pg_upmap mappings for this pool
13884 {
13885 auto it = pending_inc.new_pg_upmap_items.begin();
13886 while (it != pending_inc.new_pg_upmap_items.end()) {
13887 if (it->first.pool() == pool) {
13888 dout(10) << __func__ << " " << pool
13889 << " removing pending pg_upmap_items "
13890 << it->first << dendl;
13891 it = pending_inc.new_pg_upmap_items.erase(it);
13892 } else {
13893 it++;
13894 }
13895 }
13896 }
13897
13898 // remove any choose_args for this pool
13899 CrushWrapper newcrush;
13900 _get_pending_crush(newcrush);
13901 if (newcrush.have_choose_args(pool)) {
13902 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
13903 newcrush.rm_choose_args(pool);
13904 pending_inc.crush.clear();
13905 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
13906 }
13907 return 0;
13908 }
13909
13910 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13911 {
13912 dout(10) << "_prepare_rename_pool " << pool << dendl;
13913 if (pending_inc.old_pools.count(pool)) {
13914 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13915 return -ENOENT;
13916 }
13917 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13918 p != pending_inc.new_pool_names.end();
13919 ++p) {
13920 if (p->second == newname && p->first != pool) {
13921 return -EEXIST;
13922 }
13923 }
13924
13925 pending_inc.new_pool_names[pool] = newname;
13926 return 0;
13927 }
13928
13929 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13930 {
13931 op->mark_osdmon_event(__func__);
13932 auto m = op->get_req<MPoolOp>();
13933 ostringstream ss;
13934 int ret = _prepare_remove_pool(m->pool, &ss, false);
13935 if (ret == -EAGAIN) {
13936 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13937 return true;
13938 }
13939 if (ret < 0)
13940 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13941 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13942 pending_inc.epoch));
13943 return true;
13944 }
13945
13946 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13947 int ret, epoch_t epoch, bufferlist *blp)
13948 {
13949 op->mark_osdmon_event(__func__);
13950 auto m = op->get_req<MPoolOp>();
13951 dout(20) << "_pool_op_reply " << ret << dendl;
13952 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13953 ret, epoch, get_last_committed(), blp);
13954 mon->send_reply(op, reply);
13955 }
13956
// Rescale every pool's "recovery_priority" option so all values fit within
// [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX].  Positive and negative
// priorities are rescaled independently, each proportionally to the current
// extreme on its side of zero; pools whose side is already in range are
// left untouched.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // First pass: find the extreme priorities currently set on any pool.
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  // Nothing exceeds the allowed range on either side: no rewrite needed.
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // Second pass: rewrite each affected pool into pending_inc.
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    // prio is pre-initialized to 0; presumably opts.get() leaves it
    // unchanged when the option is unset — TODO confirm pool_opts_t::get
    int64_t prio = 0;
    pool.opts.get(key, &prio);
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      // (float result truncates toward zero on assignment to int64_t)
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // prio == 0, or this side of zero is already within range
      continue;
    }
    if (n == 0) {
      // scaled down to nothing: drop the option rather than store 0
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}