]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MOSDScrub.h"
57 #include "messages/MRoute.h"
58
59 #include "common/TextTable.h"
60 #include "common/Timer.h"
61 #include "common/ceph_argparse.h"
62 #include "common/perf_counters.h"
63 #include "common/strtol.h"
64 #include "common/numa.h"
65
66 #include "common/config.h"
67 #include "common/errno.h"
68
69 #include "erasure-code/ErasureCodePlugin.h"
70 #include "compressor/Compressor.h"
71 #include "common/Checksummer.h"
72
73 #include "include/compat.h"
74 #include "include/ceph_assert.h"
75 #include "include/stringify.h"
76 #include "include/util.h"
77 #include "common/cmdparse.h"
78 #include "include/str_list.h"
79 #include "include/str_map.h"
80 #include "include/scope_guard.h"
81
82 #include "auth/cephx/CephxKeyServer.h"
83 #include "osd/OSDCap.h"
84
85 #include "json_spirit/json_spirit_reader.h"
86
87 #include <boost/algorithm/string/predicate.hpp>
88
89 #define dout_subsys ceph_subsys_mon
90 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
91 static const string OSD_METADATA_PREFIX("osd_metadata");
92 static const string OSD_SNAP_PREFIX("osd_snap");
93
94 namespace {
95
96 const uint32_t MAX_POOL_APPLICATIONS = 4;
97 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
98 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
99
100 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
101 // Note: this doesn't include support for the application tag match
102 if ((grant.spec.allow & OSD_CAP_W) != 0) {
103 auto& match = grant.match;
104 if (match.is_match_all()) {
105 return true;
106 } else if (pool_name != nullptr &&
107 !match.pool_namespace.pool_name.empty() &&
108 match.pool_namespace.pool_name == *pool_name) {
109 return true;
110 }
111 }
112 return false;
113 }
114
// Decide whether 'entity_name' may issue unmanaged-snapshot pool ops.
// Two independent paths can grant permission:
//  1. the caller's mon caps allow the "osd pool op unmanaged-snap"
//     command -- restricted to the named pool when pool_name is non-null,
//     otherwise requiring an unrestricted cap (the pool does not exist);
//  2. the entity's OSD caps (fetched from the auth db) give it write
//     access to the pool, or to all pools, per is_osd_writable() above.
// Returns false if the OSD cap data cannot be located, decoded or parsed.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // path 1: explicit mon-cap command match
  if (mon_caps.is_capable(
        cct, CEPH_ENTITY_TYPE_MON,
        entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // path 2: inspect the entity's OSD caps from the auth database
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps (e.g. "profile rbd") expand into a list of concrete
      // grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
183
184 } // anonymous namespace
185
// Record that pg 'ps' of this pool reported 'last_epoch_clean', and keep
// the per-pool invariants up to date:
//  - epoch_by_pg[ps] only ever moves forward (stale reports are dropped)
//  - 'floor' tracks the minimum value stored in epoch_by_pg
//  - 'next_missing' is the first ps slot that has never reported (0)
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow lazily; 0 marks "no report yet"
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // the slot that defined the floor moved forward: rescan the whole
      // vector for the new minimum (unreported slots, still 0, participate)
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this report filled the first missing slot; advance next_missing past
  // the contiguous run of slots that have now reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
216
// Forget all last-epoch-clean tracking for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
221
222 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
223 {
224 auto& lec = report_by_pool[pg.pool()];
225 return lec.report(pg.ps(), last_epoch_clean);
226 }
227
228 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
229 {
230 auto floor = latest.get_epoch();
231 for (auto& pool : latest.get_pools()) {
232 auto reported = report_by_pool.find(pool.first);
233 if (reported == report_by_pool.end()) {
234 return 0;
235 }
236 if (reported->second.next_missing < pool.second.get_pg_num()) {
237 return 0;
238 }
239 if (reported->second.floor < floor) {
240 floor = reported->second.floor;
241 }
242 }
243 return floor;
244 }
245
246
// Completion callback for an osdmap mapping job: when the mapping computed
// against 'epoch' finishes successfully (r >= 0), fold its results into
// the creating-pgs state and notify pg-create subscribers.  A negative r
// (job aborted) does nothing.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the job was queued; used for the duration log line
  epoch_t epoch;  // epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
264
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
272
// Construct the OSD paxos service.  The incremental/full caches hold
// recently encoded maps (capacity mon_osd_cache_size) so they need not be
// re-read from the store; the parallel mapper shares the monitor's CPU
// thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{}
285
// True when a CRUSH map change is already staged in the pending incremental.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
290
// The CRUSH map of the last committed osdmap (ignores any pending change).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
295
296 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
297 {
298 bufferlist bl;
299 if (pending_inc.crush.length())
300 bl = pending_inc.crush;
301 else
302 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
303
304 auto p = bl.cbegin();
305 newcrush.decode(p);
306 }
307
// Build osdmap epoch 1 for a brand-new cluster and stage it in pending_inc
// as a full map.  Starts from an operator-provided "mkfs" osdmap in the
// store if present, otherwise a simple default map; then applies default
// flags, full-ratio settings and release requirements (subject to the
// mon_debug_no_require_* overrides).
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an initial map was injected at mkfs time; use it but force our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (> 1.0); normalize to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
    if (g_conf()->mon_debug_no_require_mimic) {
      derr << __func__ << " mon_debug_no_require_mimic=true and nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    } else {
      derr << __func__ << " mon_debug_no_require_nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_MIMIC;
    }
  } else {
    newmap.require_osd_release = CEPH_RELEASE_NAUTILUS;
    int r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
366
367 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
368 {
369 s.insert(service_name);
370 s.insert(OSD_PG_CREATING_PREFIX);
371 s.insert(OSD_METADATA_PREFIX);
372 s.insert(OSD_SNAP_PREFIX);
373 }
374
// Catch the in-memory osdmap up with what paxos has committed: locate and
// load the newest stashed full map if it is ahead of us, then apply each
// committed incremental in order, writing out full maps (and verifying
// CRCs against the leader's canonical encoding when provided).  Finishes
// by refreshing derived state: down->out tracking, map subscriptions,
// messenger feature requirements, and (on peons) a fresh mapping job.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a still-running mapping job was computed against the old epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // jump straight to the newest stashed full map if it is ahead of us
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());

    dout(7) << "update_from_paxos  applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent.  Reloading here will bring us back into
        // sync with the primary for this and all future maps.  OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // note: each dout(20)..dendl pair below opens its own scope, so the
        // two JSONFormatter locals do not collide
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      // no canonical copy from the primary: persist the one we just encoded
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // the mkfs seed map is no longer needed once epoch 1 is committed
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out pending map with the new osdmap state
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
602
// Kick off an asynchronous job that computes pg -> osd mappings for the
// current osdmap (its results feed pg_temp priming and creating-pg
// updates).  Any previous job is aborted first; a cluster with no pools
// skips the job entirely.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    // completion callback updates creating pgs against this same epoch
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
                                       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
             << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
623
624 void OSDMonitor::update_msgr_features()
625 {
626 set<int> types;
627 types.insert((int)entity_name_t::TYPE_OSD);
628 types.insert((int)entity_name_t::TYPE_CLIENT);
629 types.insert((int)entity_name_t::TYPE_MDS);
630 types.insert((int)entity_name_t::TYPE_MON);
631 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
632 uint64_t mask;
633 uint64_t features = osdmap.get_features(*q, &mask);
634 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
635 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
636 ceph::net::Policy p = mon->messenger->get_policy(*q);
637 p.features_required = (p.features_required & ~mask) | features;
638 mon->messenger->set_policy(*q, p);
639 }
640 }
641 }
642
// Called when this paxos service becomes active.  The leader logs the map
// and performs the one-time pool priority conversion; a peon re-dispatches
// any failure reports that were queued while inactive.  Both roles then
// (re)start the pg mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // peon: replay queued failure ops through dispatch now that we're active
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
666
// Called on service restart: drop the cached per-osd report times so
// they start fresh (presumably to avoid acting on stale timestamps from
// the previous term -- TODO confirm against last_osd_report's consumers).
void OSDMonitor::on_restart()
{
  last_osd_report.clear();
}
671
// Shut the service down: abort any in-flight mapping job and discard all
// queued failure reports and their waiters.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
686
// Refresh the cluster-level perf counters (osd counts and epoch) from the
// current osdmap.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
696
// Initialize pending_inc as the next epoch's empty incremental, then stage
// two kinds of repair work to land with the next map:
//  - re-seed any full/backfillfull/nearfull ratio that is unset (<= 0)
//    from the configured defaults, normalizing percent values > 1.0;
//  - if the CRUSH map still uses legacy "ruleset" IDs, resolve each pool's
//    ruleset to a concrete rule and renumber all rules to match their
//    array positions.
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;  // configured as a percent
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        // start from the committed pool so only crush_rule changes
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
765
// Produce the next creating_pgs state for the pending incremental 'inc'
// (with 'nextmap' being the osdmap after 'inc' is applied):
//  1. scan for newly created pools and drop pools that were deleted,
//  2. retire pgs that OSDs have reported as created,
//  3. filter out pgs that do not exist in nextmap,
//  4. drain the per-pool creation queue into concrete pg entries, bounded
//     by mon_osd_max_creating_pgs.
// Works on a copy of creating_pgs taken under creating_pgs_lock; the
// caller commits the returned value.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the committed pools and the pools this incremental adds
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue: turn queued [start, end) ps ranges into pg entries,
  // keeping the number of in-flight creating pgs under 'max'
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
                                                    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
867
// Pre-populate pg_temp entries for pgs whose mapping will change in the
// pending map.  Depending on how widespread the change is, either remap
// everything ("all": new crush map, newly-up osds, weight increases, or a
// large estimated pg count) via the parallel mapper with a time budget,
// or walk only the pgs on the specifically affected osds, checking the
// mon_osd_prime_pg_temp_max_time budget every 1000 pgs.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds flapping in new_state while currently up are "interesting"
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough estimate: assume every interesting osd hosts about as many
    // pgs as the first one; fall back to "all" if that is too large
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the post-incremental map the priming decisions are made against
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // remap every pg in parallel, but give up after the configured time
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;  // check the clock only every 'chunk' pgs
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;  // already considered via another osd
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
970
// Consider one pg for pg_temp priming against the pending map 'next': if
// its acting set is about to change, stage the *current* acting set as a
// pg_temp entry in pending_inc so the pg stays serviceable while peering
// and backfill catch up.  An empty vector clears the pg_temp entry when
// next_up == next_acting (no temp mapping needed).
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;  // still being created; nothing to preserve
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1018
/**
 * Finalize pending_inc and write the next osdmap epoch — the incremental,
 * the full map, and the associated creating-pgs / snap / metadata / health
 * keys — into the given transaction.
 *
 * @note receiving a transaction in this function gives a fair amount of
 * freedom to the service implementation if it does need it. It shouldn't.
 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
           << dendl;

  // Possibly prune old full osdmaps from the store, in the same
  // transaction as this epoch.
  if (do_prune(t)) {
    dout(1) << __func__ << " osdmap full prune encoded e"
            << pending_inc.epoch << dendl;
  }

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  // mirror snap removals from base pools onto their cache tiers
  int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
  ceph_assert(r == 0);

  // Prime pg_temp entries, but only when the background mapping job both
  // completed and matches the osdmap epoch we are building on.
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
              << mapping_job.get() << " is prior epoch "
              << mapping.get_epoch() << dendl;
    } else {
      if (g_conf()->mon_osd_prime_pg_temp) {
        maybe_prime_pg_temp();
      }
    }
  } else if (g_conf()->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
            << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      if (p->second & CEPH_OSD_UP) {
        pending_inc.new_last_up_change = pending_inc.modified;
      }
      ++p;
    }
  }
  // record timestamps of the most recent up/in state transitions
  if (!pending_inc.new_up_client.empty()) {
    pending_inc.new_last_up_change = pending_inc.modified;
  }
  for (auto& i : pending_inc.new_weight) {
    if (i.first > osdmap.max_osd) {
      if (i.second) {
        // new osd is already marked in
        pending_inc.new_last_in_change = pending_inc.modified;
      }
    } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
      // existing osd marked in or out
      pending_inc.new_last_in_change = pending_inc.modified;
    }
  }

  {
    // Build a scratch copy of what the next map will look like, so we can
    // sanity-check and adjust pending_inc against it.
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // clean pg_temp mappings
    OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);

    // clean inappropriate pg_upmap/pg_upmap_items (if any)
    {
      // check every upmapped pg for now
      // until we could reliably identify certain cases to ignore,
      // which is obviously the hard part TBD..
      vector<pg_t> pgs_to_check;
      tmp.get_upmap_pgs(&pgs_to_check);
      if (pgs_to_check.size() < g_conf()->mon_clean_pg_upmaps_per_chunk * 2) {
        // not enough pgs, do it inline
        tmp.clean_pg_upmaps(cct, &pending_inc);
      } else {
        // farm the check out to the parallel mapper in chunks
        CleanUpmapJob job(cct, tmp, pending_inc);
        mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
        job.wait();
      }
    }

    // update creating pgs first so that we can remove the created pgid and
    // process the pool flag removal below in the same osdmap epoch.
    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
    bufferlist creatings_bl;
    encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);

    // remove any old (or incompat) POOL_CREATING flags
    for (auto& i : tmp.get_pools()) {
      if (tmp.require_osd_release < CEPH_RELEASE_NAUTILUS) {
        // pre-nautilus OSDMaps shouldn't get this flag.
        if (pending_inc.new_pools.count(i.first)) {
          pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
        }
      }
      if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
          !pending_creatings.still_creating_pool(i.first)) {
        dout(10) << __func__ << " done creating pool " << i.first
                 << ", clearing CREATING flag" << dendl;
        if (pending_inc.new_pools.count(i.first) == 0) {
          // load original pool info before editing flags
          pending_inc.new_pools[i.first] = i.second;
        }
        pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
      }
    }

    // remove any legacy osdmap nearfull/full flags
    {
      if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
        dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
                 << dendl;
        remove_flag(CEPH_OSDMAP_NEARFULL);
        remove_flag(CEPH_OSDMAP_FULL);
      }
    }
    // collect which pools are currently affected by
    // the near/backfill/full osd(s),
    // and set per-pool near/backfill/full flag instead
    set<int64_t> full_pool_ids;
    set<int64_t> backfillfull_pool_ids;
    set<int64_t> nearfull_pool_ids;
    tmp.get_full_pools(cct,
                       &full_pool_ids,
                       &backfillfull_pool_ids,
                       &nearfull_pool_ids);
    if (full_pool_ids.empty() ||
        backfillfull_pool_ids.empty() ||
        nearfull_pool_ids.empty()) {
      // normal case - no nearfull, backfillfull or full osds
      // try cancel any improper nearfull/backfillfull/full pool
      // flags first
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
            nearfull_pool_ids.empty()) {
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s nearfull flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            // load original pool info first!
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
            backfillfull_pool_ids.empty()) {
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s backfillfull flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
            full_pool_ids.empty()) {
          if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
            // set by EQUOTA, skipping
            continue;
          }
          dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                   << "'s full flag" << dendl;
          if (pending_inc.new_pools.count(p) == 0) {
            pending_inc.new_pools[p] = pool.second;
          }
          pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
        }
      }
    }
    if (!full_pool_ids.empty()) {
      dout(10) << __func__ << " marking pool(s) " << full_pool_ids
               << " as full" << dendl;
      for (auto &p: full_pool_ids) {
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
          continue;
        }
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = tmp.pools[p];
        }
        // FULL supersedes BACKFILLFULL/NEARFULL
        pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_FULL for pools which are no longer full too
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (full_pool_ids.count(p)) {
          // skip pools we have just marked as full above
          continue;
        }
        if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
            tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
          // don't touch if currently is not full
          // or is running out of quota (and hence considered as full)
          continue;
        }
        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                 << "'s full flag" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = pool.second;
        }
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
      }
    }
    if (!backfillfull_pool_ids.empty()) {
      for (auto &p: backfillfull_pool_ids) {
        if (full_pool_ids.count(p)) {
          // skip pools we have already considered as full above
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
          // make sure FLAG_FULL is truly set, so we are safe not
          // to set a extra (redundant) FLAG_BACKFILLFULL flag
          ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
          // don't bother if pool is already marked as backfillfull
          continue;
        }
        dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
                 << "'s as backfillfull" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = tmp.pools[p];
        }
        pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_BACKFILLFULL for pools
      // which are no longer backfillfull too
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
          // skip pools we have just marked as backfillfull/full above
          continue;
        }
        if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
          // and don't touch if currently is not backfillfull
          continue;
        }
        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                 << "'s backfillfull flag" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = pool.second;
        }
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
      }
    }
    if (!nearfull_pool_ids.empty()) {
      for (auto &p: nearfull_pool_ids) {
        if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
          // make sure FLAG_FULL is truly set, so we are safe not
          // to set a extra (redundant) FLAG_NEARFULL flag
          ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
          continue;
        }
        if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
          // don't bother if pool is already marked as nearfull
          continue;
        }
        dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
                 << "'s as nearfull" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = tmp.pools[p];
        }
        pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_NEARFULL for pools
      // which are no longer nearfull too
      for (auto &pool: tmp.get_pools()) {
        auto p = pool.first;
        if (full_pool_ids.count(p) ||
            backfillfull_pool_ids.count(p) ||
            nearfull_pool_ids.count(p)) {
          // skip pools we have just marked as
          // nearfull/backfillfull/full above
          continue;
        }
        if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
          // and don't touch if currently is not nearfull
          continue;
        }
        dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
                 << "'s nearfull flag" << dendl;
        if (pending_inc.new_pools.count(p) == 0) {
          pending_inc.new_pools[p] = pool.second;
        }
        pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
    }

    // min_compat_client?
    if (tmp.require_min_compat_client == 0) {
      auto mv = tmp.get_min_compat_client();
      dout(1) << __func__ << " setting require_min_compat_client to currently "
              << "required " << ceph_release_name(mv) << dendl;
      mon->clog->info() << "setting require_min_compat_client to currently "
                        << "required " << ceph_release_name(mv);
      pending_inc.new_require_min_compat_client = mv;
    }

    // upgrade to mimic?
    if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC &&
        tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
      dout(10) << __func__ << " first mimic+ epoch" << dendl;
      // record this epoch as the deletion for all legacy removed_snaps
      for (auto& p : tmp.get_pools()) {
        // update every pool
        if (pending_inc.new_pools.count(p.first) == 0) {
          pending_inc.new_pools[p.first] = p.second;
        }
        auto& pi = pending_inc.new_pools[p.first];
        if (pi.snap_seq == 0) {
          // no snaps on this pool
          continue;
        }
        // backfill the snap-mode flag for pools that predate it
        if ((pi.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS |
                         pg_pool_t::FLAG_POOL_SNAPS)) == 0) {
          if (!pi.removed_snaps.empty()) {
            pi.flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
          } else {
            pi.flags |= pg_pool_t::FLAG_POOL_SNAPS;
          }
        }

        // Make all previously removed snaps appear to be removed in this
        // epoch.  this populates removed_snaps_queue.  The OSD will subtract
        // off its purged_snaps, as before, and this set will shrink over the
        // following epochs as the purged snaps are reported back through the
        // mgr.
        OSDMap::snap_interval_set_t removed;
        if (!p.second.removed_snaps.empty()) {
          // different flavor of interval_set :(
          for (auto q = p.second.removed_snaps.begin();
               q != p.second.removed_snaps.end();
               ++q) {
            removed.insert(q.get_start(), q.get_len());
          }
        } else {
          // pool-snap mode: everything <= snap_seq that is not a live
          // snapshot has been removed
          for (snapid_t s = 1; s <= pi.get_snap_seq(); s = s + 1) {
            if (pi.snaps.count(s) == 0) {
              removed.insert(s);
            }
          }
        }
        pending_inc.new_removed_snaps[p.first].union_of(removed);

        dout(10) << __func__ << " converting pool " << p.first
                 << " with " << p.second.removed_snaps.size()
                 << " legacy removed_snaps" << dendl;
        string k = make_snap_epoch_key(p.first, pending_inc.epoch);
        bufferlist v;
        encode(p.second.removed_snaps, v);
        t->put(OSD_SNAP_PREFIX, k, v);
        for (auto q = p.second.removed_snaps.begin();
             q != p.second.removed_snaps.end();
             ++q) {
          bufferlist v;
          string k = make_snap_key_value(p.first, q.get_start(),
                                         q.get_len(), pending_inc.epoch, &v);
          t->put(OSD_SNAP_PREFIX, k, v);
        }
      }
    }
    // upgrade to nautilus?
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS &&
        tmp.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
      dout(10) << __func__ << " first nautilus+ epoch" << dendl;
      // add creating flags?
      for (auto& i : tmp.get_pools()) {
        if (pending_creatings.still_creating_pool(i.first)) {
          dout(10) << __func__ << " adding CREATING flag to pool " << i.first
                   << dendl;
          if (pending_inc.new_pools.count(i.first) == 0) {
            pending_inc.new_pools[i.first] = i.second;
          }
          pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
        }
      }
      // adjust blacklist items to all be TYPE_ANY
      for (auto& i : tmp.blacklist) {
        auto a = i.first;
        a.set_type(entity_addr_t::TYPE_ANY);
        pending_inc.new_blacklist[a] = i.second;
        pending_inc.old_blacklist.push_back(i.first);
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    // an absent state mask means CEPH_OSD_UP (i.e. the osd went down)
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (auto i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features;

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features
    features = tmp.get_encoding_features();
    dout(10) << __func__ << " encoding full map with "
             << ceph_release_name(tmp.require_osd_release)
             << " features " << features << dendl;

    // the features should be a subset of the mon quorum's features!
    ceph_assert((features & ~mon->get_quorum_con_features()) == 0);

    bufferlist fullbl;
    encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn.  note that old monitors will
    // overwrite this.  new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
  bufferlist bl;
  encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
           << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // removed_snaps
  if (tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
    for (auto& i : pending_inc.new_removed_snaps) {
      {
        // all snaps removed this epoch
        string k = make_snap_epoch_key(i.first, pending_inc.epoch);
        bufferlist v;
        encode(i.second, v);
        t->put(OSD_SNAP_PREFIX, k, v);
      }
      // one key per removed interval
      for (auto q = i.second.begin();
           q != i.second.end();
           ++q) {
        bufferlist v;
        string k = make_snap_key_value(i.first, q.get_start(),
                                       q.get_len(), pending_inc.epoch, &v);
        t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
    for (auto& i : pending_inc.new_purged_snaps) {
      // one key per purged interval
      for (auto q = i.second.begin();
           q != i.second.end();
           ++q) {
        bufferlist v;
        string k = make_snap_purged_key_value(i.first, q.get_start(),
                                              q.get_len(), pending_inc.epoch,
                                              &v);
        t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1540
1541 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1542 {
1543 bufferlist bl;
1544 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1545 if (r < 0)
1546 return r;
1547 try {
1548 auto p = bl.cbegin();
1549 decode(m, p);
1550 }
1551 catch (buffer::error& e) {
1552 if (err)
1553 *err << "osd." << osd << " metadata is corrupt";
1554 return -EIO;
1555 }
1556 return 0;
1557 }
1558
1559 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
1560 {
1561 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1562 if (osdmap.is_up(osd)) {
1563 map<string,string> meta;
1564 load_metadata(osd, meta, nullptr);
1565 auto p = meta.find(field);
1566 if (p == meta.end()) {
1567 (*out)["unknown"]++;
1568 } else {
1569 (*out)[p->second]++;
1570 }
1571 }
1572 }
1573 }
1574
1575 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1576 {
1577 map<string,int> by_val;
1578 count_metadata(field, &by_val);
1579 f->open_object_section(field.c_str());
1580 for (auto& p : by_val) {
1581 f->dump_int(p.first.c_str(), p.second);
1582 }
1583 f->close_section();
1584 }
1585
1586 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1587 {
1588 map<string, string> metadata;
1589 int r = load_metadata(osd, metadata, nullptr);
1590 if (r < 0)
1591 return r;
1592
1593 auto it = metadata.find("osd_objectstore");
1594 if (it == metadata.end())
1595 return -ENOENT;
1596 *type = it->second;
1597 return 0;
1598 }
1599
1600 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1601 const pg_pool_t &pool,
1602 ostream *err)
1603 {
1604 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1605 // since filestore osds could always join the pool later
1606 set<int> checked_osds;
1607 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
1608 vector<int> up, acting;
1609 pg_t pgid(ps, pool_id);
1610 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1611 for (int osd : up) {
1612 if (checked_osds.find(osd) != checked_osds.end())
1613 continue;
1614 string objectstore_type;
1615 int r = get_osd_objectstore_type(osd, &objectstore_type);
1616 // allow with missing metadata, e.g. due to an osd never booting yet
1617 if (r < 0 || objectstore_type == "bluestore") {
1618 checked_osds.insert(osd);
1619 continue;
1620 }
1621 *err << "osd." << osd << " uses " << objectstore_type;
1622 return false;
1623 }
1624 }
1625 return true;
1626 }
1627
1628 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1629 {
1630 map<string,string> m;
1631 if (int r = load_metadata(osd, m, err))
1632 return r;
1633 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1634 f->dump_string(p->first.c_str(), p->second);
1635 return 0;
1636 }
1637
1638 void OSDMonitor::print_nodes(Formatter *f)
1639 {
1640 // group OSDs by their hosts
1641 map<string, list<int> > osds; // hostname => osd
1642 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1643 map<string, string> m;
1644 if (load_metadata(osd, m, NULL)) {
1645 continue;
1646 }
1647 map<string, string>::iterator hostname = m.find("hostname");
1648 if (hostname == m.end()) {
1649 // not likely though
1650 continue;
1651 }
1652 osds[hostname->second].push_back(osd);
1653 }
1654
1655 dump_services(f, osds, "osd");
1656 }
1657
1658 void OSDMonitor::share_map_with_random_osd()
1659 {
1660 if (osdmap.get_num_up_osds() == 0) {
1661 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1662 return;
1663 }
1664
1665 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1666 if (!s) {
1667 dout(10) << __func__ << " no up osd on our session map" << dendl;
1668 return;
1669 }
1670
1671 dout(10) << "committed, telling random " << s->name
1672 << " all about it" << dendl;
1673
1674 // get feature of the peer
1675 // use quorum_con_features, if it's an anonymous connection.
1676 uint64_t features = s->con_features ? s->con_features :
1677 mon->get_quorum_con_features();
1678 // whatev, they'll request more if they need it
1679 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
1680 s->con->send_message(m);
1681 // NOTE: do *not* record osd has up to this epoch (as we do
1682 // elsewhere) as they may still need to request older values.
1683 }
1684
1685 version_t OSDMonitor::get_trim_to() const
1686 {
1687 if (mon->get_quorum().empty()) {
1688 dout(10) << __func__ << ": quorum not formed" << dendl;
1689 return 0;
1690 }
1691
1692 {
1693 std::lock_guard<std::mutex> l(creating_pgs_lock);
1694 if (!creating_pgs.pgs.empty()) {
1695 return 0;
1696 }
1697 }
1698
1699 if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
1700 dout(0) << __func__
1701 << " blocking osdmap trim"
1702 " ('mon_debug_block_osdmap_trim' set to 'true')"
1703 << dendl;
1704 return 0;
1705 }
1706
1707 {
1708 epoch_t floor = get_min_last_epoch_clean();
1709 dout(10) << " min_last_epoch_clean " << floor << dendl;
1710 if (g_conf()->mon_osd_force_trim_to > 0 &&
1711 g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
1712 floor = g_conf()->mon_osd_force_trim_to;
1713 dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
1714 }
1715 unsigned min = g_conf()->mon_min_osdmap_epochs;
1716 if (floor + min > get_last_committed()) {
1717 if (min < get_last_committed())
1718 floor = get_last_committed() - min;
1719 else
1720 floor = 0;
1721 }
1722 if (floor > get_first_committed())
1723 return floor;
1724 }
1725 return 0;
1726 }
1727
1728 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1729 {
1730 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1731 // also scan osd epochs
1732 // don't trim past the oldest reported osd epoch
1733 for (auto& osd_epoch : osd_epochs) {
1734 if (osd_epoch.second < floor) {
1735 floor = osd_epoch.second;
1736 }
1737 }
1738 return floor;
1739 }
1740
1741 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1742 version_t first)
1743 {
1744 dout(10) << __func__ << " including full map for e " << first << dendl;
1745 bufferlist bl;
1746 get_version_full(first, bl);
1747 put_version_full(tx, first, bl);
1748
1749 if (has_osdmap_manifest &&
1750 first > osdmap_manifest.get_first_pinned()) {
1751 _prune_update_trimmed(tx, first);
1752 }
1753 }
1754
1755
1756 /* full osdmap prune
1757 *
1758 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
1759 */
1760
1761 void OSDMonitor::load_osdmap_manifest()
1762 {
1763 bool store_has_manifest =
1764 mon->store->exists(get_service_name(), "osdmap_manifest");
1765
1766 if (!store_has_manifest) {
1767 if (!has_osdmap_manifest) {
1768 return;
1769 }
1770
1771 dout(20) << __func__
1772 << " dropping osdmap manifest from memory." << dendl;
1773 osdmap_manifest = osdmap_manifest_t();
1774 has_osdmap_manifest = false;
1775 return;
1776 }
1777
1778 dout(20) << __func__
1779 << " osdmap manifest detected in store; reload." << dendl;
1780
1781 bufferlist manifest_bl;
1782 int r = get_value("osdmap_manifest", manifest_bl);
1783 if (r < 0) {
1784 derr << __func__ << " unable to read osdmap version manifest" << dendl;
1785 ceph_abort_msg("error reading manifest");
1786 }
1787 osdmap_manifest.decode(manifest_bl);
1788 has_osdmap_manifest = true;
1789
1790 dout(10) << __func__ << " store osdmap manifest pinned ("
1791 << osdmap_manifest.get_first_pinned()
1792 << " .. "
1793 << osdmap_manifest.get_last_pinned()
1794 << ")"
1795 << dendl;
1796 }
1797
1798 bool OSDMonitor::should_prune() const
1799 {
1800 version_t first = get_first_committed();
1801 version_t last = get_last_committed();
1802 version_t min_osdmap_epochs =
1803 g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
1804 version_t prune_min =
1805 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
1806 version_t prune_interval =
1807 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
1808 version_t last_pinned = osdmap_manifest.get_last_pinned();
1809 version_t last_to_pin = last - min_osdmap_epochs;
1810
1811 // Make it or break it constraints.
1812 //
1813 // If any of these conditions fails, we will not prune, regardless of
1814 // whether we have an on-disk manifest with an on-going pruning state.
1815 //
1816 if ((last - first) <= min_osdmap_epochs) {
1817 // between the first and last committed epochs, we don't have
1818 // enough epochs to trim, much less to prune.
1819 dout(10) << __func__
1820 << " currently holding only " << (last - first)
1821 << " epochs (min osdmap epochs: " << min_osdmap_epochs
1822 << "); do not prune."
1823 << dendl;
1824 return false;
1825
1826 } else if ((last_to_pin - first) < prune_min) {
1827 // between the first committed epoch and the last epoch we would prune,
1828 // we simply don't have enough versions over the minimum to prune maps.
1829 dout(10) << __func__
1830 << " could only prune " << (last_to_pin - first)
1831 << " epochs (" << first << ".." << last_to_pin << "), which"
1832 " is less than the required minimum (" << prune_min << ")"
1833 << dendl;
1834 return false;
1835
1836 } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
1837 dout(10) << __func__
1838 << " we have pruned as far as we can; do not prune."
1839 << dendl;
1840 return false;
1841
1842 } else if (last_pinned + prune_interval > last_to_pin) {
1843 dout(10) << __func__
1844 << " not enough epochs to form an interval (last pinned: "
1845 << last_pinned << ", last to pin: "
1846 << last_to_pin << ", interval: " << prune_interval << ")"
1847 << dendl;
1848 return false;
1849 }
1850
1851 dout(15) << __func__
1852 << " should prune (" << last_pinned << ".." << last_to_pin << ")"
1853 << " lc (" << first << ".." << last << ")"
1854 << dendl;
1855 return true;
1856 }
1857
1858 void OSDMonitor::_prune_update_trimmed(
1859 MonitorDBStore::TransactionRef tx,
1860 version_t first)
1861 {
1862 dout(10) << __func__
1863 << " first " << first
1864 << " last_pinned " << osdmap_manifest.get_last_pinned()
1865 << " last_pinned " << osdmap_manifest.get_last_pinned()
1866 << dendl;
1867
1868 osdmap_manifest_t manifest = osdmap_manifest;
1869
1870 if (!manifest.is_pinned(first)) {
1871 manifest.pin(first);
1872 }
1873
1874 set<version_t>::iterator p_end = manifest.pinned.find(first);
1875 set<version_t>::iterator p = manifest.pinned.begin();
1876 manifest.pinned.erase(p, p_end);
1877 ceph_assert(manifest.get_first_pinned() == first);
1878
1879 if (manifest.get_last_pinned() == first+1 ||
1880 manifest.pinned.size() == 1) {
1881 // we reached the end of the line, as pinned maps go; clean up our
1882 // manifest, and let `should_prune()` decide whether we should prune
1883 // again.
1884 tx->erase(get_service_name(), "osdmap_manifest");
1885 return;
1886 }
1887
1888 bufferlist bl;
1889 manifest.encode(bl);
1890 tx->put(get_service_name(), "osdmap_manifest", bl);
1891 }
1892
1893 void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
1894 {
1895 dout(1) << __func__ << dendl;
1896
1897 version_t pin_first;
1898
1899 // verify constrainsts on stable in-memory state
1900 if (!has_osdmap_manifest) {
1901 // we must have never pruned, OR if we pruned the state must no longer
1902 // be relevant (i.e., the state must have been removed alongside with
1903 // the trim that *must* have removed past the last pinned map in a
1904 // previous prune).
1905 ceph_assert(osdmap_manifest.pinned.empty());
1906 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
1907 pin_first = get_first_committed();
1908
1909 } else {
1910 // we must have pruned in the past AND its state is still relevant
1911 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
1912 // and thus we still hold a manifest in the store).
1913 ceph_assert(!osdmap_manifest.pinned.empty());
1914 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
1915 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
1916
1917 dout(10) << __func__
1918 << " first_pinned " << osdmap_manifest.get_first_pinned()
1919 << " last_pinned " << osdmap_manifest.get_last_pinned()
1920 << dendl;
1921
1922 pin_first = osdmap_manifest.get_last_pinned();
1923 }
1924
1925 manifest.pin(pin_first);
1926 }
1927
1928 bool OSDMonitor::_prune_sanitize_options() const
1929 {
1930 uint64_t prune_interval =
1931 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
1932 uint64_t prune_min =
1933 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
1934 uint64_t txsize =
1935 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
1936
1937 bool r = true;
1938
1939 if (prune_interval == 0) {
1940 derr << __func__
1941 << " prune is enabled BUT prune interval is zero; abort."
1942 << dendl;
1943 r = false;
1944 } else if (prune_interval == 1) {
1945 derr << __func__
1946 << " prune interval is equal to one, which essentially means"
1947 " no pruning; abort."
1948 << dendl;
1949 r = false;
1950 }
1951 if (prune_min == 0) {
1952 derr << __func__
1953 << " prune is enabled BUT prune min is zero; abort."
1954 << dendl;
1955 r = false;
1956 }
1957 if (prune_interval > prune_min) {
1958 derr << __func__
1959 << " impossible to ascertain proper prune interval because"
1960 << " it is greater than the minimum prune epochs"
1961 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
1962 << dendl;
1963 r = false;
1964 }
1965
1966 if (txsize < prune_interval - 1) {
1967 derr << __func__
1968 << "'mon_osdmap_full_prune_txsize' (" << txsize
1969 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
1970 << "); abort." << dendl;
1971 r = false;
1972 }
1973 return r;
1974 }
1975
1976 bool OSDMonitor::is_prune_enabled() const {
1977 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
1978 }
1979
1980 bool OSDMonitor::is_prune_supported() const {
1981 return mon->get_required_mon_features().contains_any(
1982 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
1983 }
1984
/** do_prune
 *
 * Remove (prune) full osdmap versions from the store while pinning a
 * regularly-spaced subset in the osdmap manifest, so the store does not
 * grow unbounded. Removals and the updated manifest are appended to @p tx;
 * nothing is applied here.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  // bail out early unless pruning is on, sanely configured, and needed
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  // work on a copy; the stable manifest is only replaced via the tx
  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  // never prune into the newest mon_min_osdmap_epochs maps
  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
          << " lc (" << first << " .. " << last << ")"
          << " last_pinned " << last_pinned
          << " interval " << prune_interval
          << " last_to_pin " << last_to_pin
          << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon->store->combine_strings("full", v);
    return mon->store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
            << " setting txsize to removal interval size ("
            << removal_interval << " versions"
            << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  // each pass pins one new map and erases the versions between the previous
  // pin and it; stop once another full pass would exceed the txsize budget.
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
             << " last_pinned " << last_pinned
             << " next_pinned " << next_pinned
             << " num_pruned " << num_pruned
             << " removal interval (" << (last_pinned+1)
             << ".." << (next_pinned-1) << ")"
             << " txsize " << txsize << dendl;

    // both endpoints of the interval must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon->store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // sanity: we only get here when there was pruning work to do
  ceph_assert(num_pruned > 0);

  // persist the updated manifest in the same transaction as the removals
  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2113
2114
2115 // -------------
2116
2117 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2118 {
2119 op->mark_osdmon_event(__func__);
2120 Message *m = op->get_req();
2121 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2122
2123 switch (m->get_type()) {
2124 // READs
2125 case MSG_MON_COMMAND:
2126 try {
2127 return preprocess_command(op);
2128 } catch (const bad_cmd_get& e) {
2129 bufferlist bl;
2130 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2131 return true;
2132 }
2133 case CEPH_MSG_MON_GET_OSDMAP:
2134 return preprocess_get_osdmap(op);
2135
2136 // damp updates
2137 case MSG_OSD_MARK_ME_DOWN:
2138 return preprocess_mark_me_down(op);
2139 case MSG_OSD_FULL:
2140 return preprocess_full(op);
2141 case MSG_OSD_FAILURE:
2142 return preprocess_failure(op);
2143 case MSG_OSD_BOOT:
2144 return preprocess_boot(op);
2145 case MSG_OSD_ALIVE:
2146 return preprocess_alive(op);
2147 case MSG_OSD_PG_CREATED:
2148 return preprocess_pg_created(op);
2149 case MSG_OSD_PG_READY_TO_MERGE:
2150 return preprocess_pg_ready_to_merge(op);
2151 case MSG_OSD_PGTEMP:
2152 return preprocess_pgtemp(op);
2153 case MSG_OSD_BEACON:
2154 return preprocess_beacon(op);
2155
2156 case CEPH_MSG_POOLOP:
2157 return preprocess_pool_op(op);
2158
2159 case MSG_REMOVE_SNAPS:
2160 return preprocess_remove_snaps(op);
2161
2162 default:
2163 ceph_abort();
2164 return true;
2165 }
2166 }
2167
2168 bool OSDMonitor::prepare_update(MonOpRequestRef op)
2169 {
2170 op->mark_osdmon_event(__func__);
2171 Message *m = op->get_req();
2172 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2173
2174 switch (m->get_type()) {
2175 // damp updates
2176 case MSG_OSD_MARK_ME_DOWN:
2177 return prepare_mark_me_down(op);
2178 case MSG_OSD_FULL:
2179 return prepare_full(op);
2180 case MSG_OSD_FAILURE:
2181 return prepare_failure(op);
2182 case MSG_OSD_BOOT:
2183 return prepare_boot(op);
2184 case MSG_OSD_ALIVE:
2185 return prepare_alive(op);
2186 case MSG_OSD_PG_CREATED:
2187 return prepare_pg_created(op);
2188 case MSG_OSD_PGTEMP:
2189 return prepare_pgtemp(op);
2190 case MSG_OSD_PG_READY_TO_MERGE:
2191 return prepare_pg_ready_to_merge(op);
2192 case MSG_OSD_BEACON:
2193 return prepare_beacon(op);
2194
2195 case MSG_MON_COMMAND:
2196 try {
2197 return prepare_command(op);
2198 } catch (const bad_cmd_get& e) {
2199 bufferlist bl;
2200 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2201 return true;
2202 }
2203
2204 case CEPH_MSG_POOLOP:
2205 return prepare_pool_op(op);
2206
2207 case MSG_REMOVE_SNAPS:
2208 return prepare_remove_snaps(op);
2209
2210
2211 default:
2212 ceph_abort();
2213 }
2214
2215 return false;
2216 }
2217
2218 bool OSDMonitor::should_propose(double& delay)
2219 {
2220 dout(10) << "should_propose" << dendl;
2221
2222 // if full map, propose immediately! any subsequent changes will be clobbered.
2223 if (pending_inc.fullmap.length())
2224 return true;
2225
2226 // adjust osd weights?
2227 if (!osd_weight.empty() &&
2228 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2229 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2230 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2231 delay = 0.0;
2232 osd_weight.clear();
2233 return true;
2234 }
2235
2236 return PaxosService::should_propose(delay);
2237 }
2238
2239
2240
2241 // ---------------------------
2242 // READs
2243
// Answer an MMonGetOSDMap request by replying with the requested full and
// incremental map epochs, capped by both a map-count and a byte budget.
// Always handles the request (returns true).
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());

  // encode for the requester's connection features when known,
  // otherwise fall back to the quorum's feature set
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // budgets shared by both loops below: stop once either runs out
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first, clamped to what we actually have committed
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // then incrementals, drawing on the remaining budget
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2280
2281
2282 // ---------------------------
2283 // UPDATEs
2284
2285 // failure --
2286
2287 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2288 // check permissions
2289 MonSession *session = op->get_session();
2290 if (!session)
2291 return true;
2292 if (!session->is_capable("osd", MON_CAP_X)) {
2293 dout(0) << "got MOSDFailure from entity with insufficient caps "
2294 << session->caps << dendl;
2295 return true;
2296 }
2297 if (fsid != mon->monmap->fsid) {
2298 dout(0) << "check_source: on fsid " << fsid
2299 << " != " << mon->monmap->fsid << dendl;
2300 return true;
2301 }
2302 return false;
2303 }
2304
2305
// Filter an MOSDFailure report. Consumes (returns true) reports that are
// unauthorized, stale, duplicate or blocked by nodown/up-ratio policy,
// pushing map updates to lagging senders; returns false to pass a fresh,
// actionable report on to prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
        !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
        (osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, re-addressed, or itself down: ignore, but
      // bring it up to date with the current map
      dout(5) << "preprocess_failure from dead osd." << from
              << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // target's recorded addrs must match what the reporter saw
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << " != map's " << osdmap.get_addrs(badboy)
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    // the failure predates the target's latest boot: it is stale
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
            << " " << m->get_target_addrs()
            << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or up-ratio floor may forbid marking this osd down
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
            << m->get_target_osd() << " " << m->get_target_addrs()
            << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
           << " " << m->get_target_addrs()
           << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2377
// Completion context that acknowledges an MOSDMarkMeDown request by
// echoing a MOSDMarkMeDown message back to the requesting osd once the
// mark-down has been handled (or rejected in preprocess).
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int) override {
    MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
    // reply mirrors the request's identity fields
    osdmon->mon->send_reply(
      op,
      new MOSDMarkMeDown(
        m->fsid,
        m->target_osd,
        m->target_addrs,
        m->get_epoch(),
        false)); // ACK itself does not request an ack
  }
  ~C_AckMarkedDown() override {
  }
};
2400
// Filter an osd's request to mark itself down. Returns true (acking if the
// sender asked for one) when the request needs no map change; returns
// false to forward it to prepare_mark_me_down().
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must still exist, be up, and match its recorded addrs;
  // otherwise just bring it up to date with the current map
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even rejected requests get an ack when one was requested,
  // so the osd does not block waiting for it
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2439
2440 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2441 {
2442 op->mark_osdmon_event(__func__);
2443 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2444 int target_osd = m->target_osd;
2445
2446 ceph_assert(osdmap.is_up(target_osd));
2447 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2448
2449 mon->clog->info() << "osd." << target_osd << " marked itself down";
2450 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2451 if (m->request_ack)
2452 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2453 return true;
2454 }
2455
2456 bool OSDMonitor::can_mark_down(int i)
2457 {
2458 if (osdmap.is_nodown(i)) {
2459 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2460 << "will not mark it down" << dendl;
2461 return false;
2462 }
2463
2464 int num_osds = osdmap.get_num_osds();
2465 if (num_osds == 0) {
2466 dout(5) << __func__ << " no osds" << dendl;
2467 return false;
2468 }
2469 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2470 float up_ratio = (float)up / (float)num_osds;
2471 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
2472 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
2473 << g_conf()->mon_osd_min_up_ratio
2474 << ", will not mark osd." << i << " down" << dendl;
2475 return false;
2476 }
2477 return true;
2478 }
2479
2480 bool OSDMonitor::can_mark_up(int i)
2481 {
2482 if (osdmap.is_noup(i)) {
2483 dout(5) << __func__ << " osd." << i << " is marked as noup, "
2484 << "will not mark it up" << dendl;
2485 return false;
2486 }
2487
2488 return true;
2489 }
2490
2491 /**
2492 * @note the parameter @p i apparently only exists here so we can output the
2493 * osd's id on messages.
2494 */
2495 bool OSDMonitor::can_mark_out(int i)
2496 {
2497 if (osdmap.is_noout(i)) {
2498 dout(5) << __func__ << " osd." << i << " is marked as noout, "
2499 << "will not mark it out" << dendl;
2500 return false;
2501 }
2502
2503 int num_osds = osdmap.get_num_osds();
2504 if (num_osds == 0) {
2505 dout(5) << __func__ << " no osds" << dendl;
2506 return false;
2507 }
2508 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2509 float in_ratio = (float)in / (float)num_osds;
2510 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
2511 if (i >= 0)
2512 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2513 << g_conf()->mon_osd_min_in_ratio
2514 << ", will not mark osd." << i << " out" << dendl;
2515 else
2516 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2517 << g_conf()->mon_osd_min_in_ratio
2518 << ", will not mark osds out" << dendl;
2519 return false;
2520 }
2521
2522 return true;
2523 }
2524
2525 bool OSDMonitor::can_mark_in(int i)
2526 {
2527 if (osdmap.is_noin(i)) {
2528 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2529 << "will not mark it in" << dendl;
2530 return false;
2531 }
2532
2533 return true;
2534 }
2535
2536 bool OSDMonitor::check_failures(utime_t now)
2537 {
2538 bool found_failure = false;
2539 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2540 p != failure_info.end();
2541 ++p) {
2542 if (can_mark_down(p->first)) {
2543 found_failure |= check_failure(now, p->first, p->second);
2544 }
2545 }
2546 return found_failure;
2547 }
2548
// Decide whether the accumulated failure reports for @p target_osd suffice
// to mark it down: the osd must have been failed for longer than a
// (laggy-history-adjusted) grace period AND be reported by enough distinct
// failure domains. Returns true when a mark-down is (now) pending.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    // exponential decay constant derived from the configured halflife
    double halflife = (double)g_conf()->mon_osd_laggy_halflife;
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
             << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  ceph_assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf()->mon_osd_adjust_heartbeat_grace) {
      // accumulate this reporter's laggy-history contribution
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    // average the reporters' contribution into the grace
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
           << fi.reporters.size() << " reporters, "
           << grace << " grace (" << orig_grace << " + " << my_grace
           << " + " << peer_grace << "), max_failed_since " << max_failed_since
           << dendl;

  if (failed_for >= grace &&
      reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
            << " down" << dendl;
    // new_state is applied as an XOR, so CEPH_OSD_UP marks the osd down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
                      << osdmap.crush->get_full_location_ordered_string(
                        target_osd)
                      << ") ("
                      << (int)reporters_by_subtree.size()
                      << " reporters from different "
                      << reporter_subtree_level << " after "
                      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
2636
2637 void OSDMonitor::force_failure(int target_osd, int by)
2638 {
2639 // already pending failure?
2640 if (pending_inc.new_state.count(target_osd) &&
2641 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2642 dout(10) << " already pending failure" << dendl;
2643 return;
2644 }
2645
2646 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2647 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2648
2649 mon->clog->info() << "osd." << target_osd << " failed ("
2650 << osdmap.crush->get_full_location_ordered_string(target_osd)
2651 << ") (connection refused reported by osd." << by << ")";
2652 return;
2653 }
2654
// Apply an MOSDFailure to pending state: either record a new failure
// report (possibly marking the target down, immediately for
// connection-refused reports), or cancel a previously filed report.
// Returns true when a map change is now pending.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() filtered out stale or mismatched targets
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: fail without waiting for more reporters
      mon->clog->debug() << "osd." << m->get_target_osd()
                         << " reported immediately failed by "
                         << m->get_orig_source();
      force_failure(target_osd, reporter);
      mon->no_reply(op);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a newer report from the same reporter supersedes its earlier op
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
                       << " failure report canceled by "
                       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      // drop the whole entry once the last reporter withdraws
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2721
// Drop failure tracking for osds that are no longer up in the current map:
// each queued reporter op gets the latest map sent back and is released.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      // still up: keep accumulating reports for it
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment keeps the iterator valid across the erase
      failure_info.erase(p++);

      while (!ls.empty()) {
        MonOpRequestRef o = ls.front();
        if (o) {
          o->mark_event(__func__);
          MOSDFailure *m = o->get_req<MOSDFailure>();
          // update the reporter, then release its op without a reply
          send_latest(o, m->get_epoch());
          mon->no_reply(o);
        }
        ls.pop_front();
      }
    }
  }
}
2747
2748 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2749 {
2750 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2751
2752 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2753 p != failure_info.end();
2754 ++p) {
2755 p->second.take_report_messages(ls);
2756 }
2757 failure_info.clear();
2758 }
2759
2760
2761 // boot --
2762
// Filter an MOSDBoot message. Returns true when the boot can be answered
// (or must be ignored) without a map change — bad caps/fsid/addr, missing
// feature bits, duplicate boot, fsid clash, stale message, or noup —
// and false to let prepare_boot() actually bring the osd up.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    // collect every mandatory feature bit the booting osd lacks
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
                        << m->get_orig_source_inst()
                        << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure upgrades stop at nautilus
  if (HAVE_FEATURE(m->osd_features, SERVER_O) &&
      osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
    mon->clog->info() << "disallowing boot of post-nautilus OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    // duplicate boot: just re-send the "booted" reply
    _booted(op, false);
    return true;
  }

  // the osd id must not be claimed by a different osd fsid
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // message predating the osd's most recent up_from: stale, catch it up
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2885
// Apply an MOSDBoot to the pending incremental: mark the OSD up (or first
// mark a stale up instance down), and record its addresses, weight, uuid,
// metadata, last-clean interval and laggy statistics.  Always returns true;
// the change, if any, rides the next paxos proposal.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state once any state-bit flips already queued in
  // pending_inc are applied (new_state entries are XOR deltas)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down.  setting the CEPH_OSD_UP bit in the XOR
      // delta flips an up osd to down.
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the mark-down commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd? (sb.newest_map == 0 means it has never seen a map)
    // mark it lost as well -- presumably so the previous incarnation's
    // data is not waited on; TODO confirm against peering docs
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: stage the osd's key/value metadata for commit
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: boot_epoch == 0 is treated as a non-laggy
    // (clean) boot and decays both stats; otherwise the time spent down is
    // folded into the exponentially-weighted moving averages.
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	// clamp the observed down interval if configured
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?  (auto-in is gated by config and by can_mark_in/NOIN)
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight saved when the osd was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait; C_Booted replies with maps after the proposal commits
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3040
3041 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3042 {
3043 op->mark_osdmon_event(__func__);
3044 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
3045 dout(7) << "_booted " << m->get_orig_source_inst()
3046 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3047
3048 if (logit) {
3049 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3050 << " boot";
3051 }
3052
3053 send_latest(op, m->sb.current_epoch+1);
3054 }
3055
3056
3057 // -------------
3058 // full
3059
3060 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3061 {
3062 op->mark_osdmon_event(__func__);
3063 MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
3064 int from = m->get_orig_source().num();
3065 set<string> state;
3066 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3067
3068 // check permissions, ignore if failed
3069 MonSession *session = op->get_session();
3070 if (!session)
3071 goto ignore;
3072 if (!session->is_capable("osd", MON_CAP_X)) {
3073 dout(0) << "MOSDFull from entity with insufficient privileges:"
3074 << session->caps << dendl;
3075 goto ignore;
3076 }
3077
3078 // ignore a full message from the osd instance that already went down
3079 if (!osdmap.exists(from)) {
3080 dout(7) << __func__ << " ignoring full message from nonexistent "
3081 << m->get_orig_source_inst() << dendl;
3082 goto ignore;
3083 }
3084 if ((!osdmap.is_up(from) &&
3085 osdmap.get_most_recent_addrs(from).legacy_equals(
3086 m->get_orig_source_addrs())) ||
3087 (osdmap.is_up(from) &&
3088 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3089 dout(7) << __func__ << " ignoring full message from down "
3090 << m->get_orig_source_inst() << dendl;
3091 goto ignore;
3092 }
3093
3094 OSDMap::calc_state_set(osdmap.get_state(from), state);
3095
3096 if ((osdmap.get_state(from) & mask) == m->state) {
3097 dout(7) << __func__ << " state already " << state << " for osd." << from
3098 << " " << m->get_orig_source_inst() << dendl;
3099 _reply_map(op, m->version);
3100 goto ignore;
3101 }
3102
3103 dout(10) << __func__ << " want state " << state << " for osd." << from
3104 << " " << m->get_orig_source_inst() << dendl;
3105 return false;
3106
3107 ignore:
3108 return true;
3109 }
3110
// Queue the fullness-state change requested by the OSD.  Entries in
// pending_inc.new_state are XOR deltas against the *committed* map, so we
// first compute the effective (committed + pending) state and then rewrite
// the delta for the fullness bits.  Always returns true.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  // only these bits may be changed via MOSDFull
  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective fullness state once pending flips are applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending flips of the fullness bits before re-adding ours
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR delta from the committed state to the wanted state
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    // an equivalent change is already pending; just wait for it to commit
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3148
3149 // -------------
3150 // alive
3151
3152 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3153 {
3154 op->mark_osdmon_event(__func__);
3155 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3156 int from = m->get_orig_source().num();
3157
3158 // check permissions, ignore if failed
3159 MonSession *session = op->get_session();
3160 if (!session)
3161 goto ignore;
3162 if (!session->is_capable("osd", MON_CAP_X)) {
3163 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3164 << session->caps << dendl;
3165 goto ignore;
3166 }
3167
3168 if (!osdmap.is_up(from) ||
3169 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3170 dout(7) << "preprocess_alive ignoring alive message from down "
3171 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3172 << dendl;
3173 goto ignore;
3174 }
3175
3176 if (osdmap.get_up_thru(from) >= m->want) {
3177 // yup.
3178 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3179 _reply_map(op, m->version);
3180 return true;
3181 }
3182
3183 dout(10) << "preprocess_alive want up_thru " << m->want
3184 << " from " << m->get_orig_source_inst() << dendl;
3185 return false;
3186
3187 ignore:
3188 return true;
3189 }
3190
// Advance the osd's up_thru in the pending incremental to the latest map
// the osd has (m->version), and reply with the map once it commits.
bool OSDMonitor::prepare_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
  int from = m->get_orig_source().num();

  if (0) { // we probably don't care much about these
    mon->clog->debug() << m->get_orig_source_inst() << " alive";
  }

  dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
	  << " from " << m->get_orig_source_inst() << dendl;

  update_up_thru(from, m->version); // set to the latest map the OSD has
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3208
3209 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3210 {
3211 op->mark_osdmon_event(__func__);
3212 dout(7) << "_reply_map " << e
3213 << " from " << op->get_req()->get_orig_source_inst()
3214 << dendl;
3215 send_latest(op, e);
3216 }
3217
3218 // pg_created
3219 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3220 {
3221 op->mark_osdmon_event(__func__);
3222 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3223 dout(10) << __func__ << " " << *m << dendl;
3224 auto session = op->get_session();
3225 mon->no_reply(op);
3226 if (!session) {
3227 dout(10) << __func__ << ": no monitor session!" << dendl;
3228 return true;
3229 }
3230 if (!session->is_capable("osd", MON_CAP_X)) {
3231 derr << __func__ << " received from entity "
3232 << "with insufficient privileges " << session->caps << dendl;
3233 return true;
3234 }
3235 // always forward the "created!" to the leader
3236 return false;
3237 }
3238
3239 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3240 {
3241 op->mark_osdmon_event(__func__);
3242 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3243 dout(10) << __func__ << " " << *m << dendl;
3244 auto src = m->get_orig_source();
3245 auto from = src.num();
3246 if (!src.is_osd() ||
3247 !mon->osdmon()->osdmap.is_up(from) ||
3248 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3249 m->get_orig_source_addrs())) {
3250 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3251 return false;
3252 }
3253 pending_created_pgs.push_back(m->pgid);
3254 return true;
3255 }
3256
// Validate an OSD's ready-to-merge report against the committed map.
// Only a report for the pool's current last pg (pg_num == ps + 1) with a
// merge still pending is passed on to prepare.  Returns true to drop.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;  // declared up front: gotos may not skip initializations
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // pool already shrank past this pg -- the merge happened
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the pool's current last pg may be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge of this pg is pending (any more)
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3296
// Apply (or abort) a pg merge for the reported pgid.  Works on a copy of
// the pool (pending version if one exists, else committed), re-validates
// against concurrent pg_num changes, then either dec_pg_num()s the pool or
// backs off the pending merge.  Always returns true.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // the pool may have changed since preprocess; retry after the in-flight
  // proposal if pg_num[_pending] no longer matches this report
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // commit the merge: shrink pg_num and record the merge metadata
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes from a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: occasionally bounce pg_num back up to exercise the
  // merge cancel path
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3354
3355
3356 // -------------
3357 // pg_temp changes
3358
// Filter an MOSDPGTemp.  Returns false (pass to prepare) as soon as one
// mapping would actually change something; returns true to drop or to ack
// a no-op message.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;  // mappings that are stale/irrelevant

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept pg_temp from the osd's current up incarnation
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced message bypasses all the filtering below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every surviving mapping is already in effect; ack without proposing
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
3451
3452 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3453 {
3454 epoch_t old_up_thru = osdmap.get_up_thru(from);
3455 auto ut = pending_inc.new_up_thru.find(from);
3456 if (ut != pending_inc.new_up_thru.end()) {
3457 old_up_thru = ut->second;
3458 }
3459 if (up_thru > old_up_thru) {
3460 // set up_thru too, so the osd doesn't have to ask again
3461 pending_inc.new_up_thru[from] = up_thru;
3462 }
3463 }
3464
// Stage the osd's pg_temp mappings into the pending incremental, skipping
// pools that are gone or pending removal, and clear primary_temp for each
// touched pg.  Also bumps the osd's up_thru.  Always returns true.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    // skip pools queued for deletion in this same proposal
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    // skip pools already removed from the committed map
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
3500
3501
3502 // ---
3503
// Filter an MRemoveSnaps.  Returns false (pass to prepare) if at least one
// requested snap is not yet recorded as removed; true to drop the message.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than snap_seq, or not yet in removed_snaps, still
      // needs to be applied -- hand off to prepare_remove_snaps()
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3546
// Fold the requested snap removals into the pending pools: add each snap
// to removed_snaps, advance snap_seq if needed, bump snap_epoch, and record
// it in pending_inc.new_removed_snaps.  Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // skip snaps already removed in the committed map or in the pending
      // copy of this pool
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	// get (or create) the pending copy of the pool and mutate that
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	// keep snap_seq monotonically >= every removed snap
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	pending_inc.new_removed_snaps[p->first].insert(*q);
      }
    }
  }
  return true;
}
3586
3587 // osd beacon
3588 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3589 {
3590 op->mark_osdmon_event(__func__);
3591 // check caps
3592 auto session = op->get_session();
3593 mon->no_reply(op);
3594 if (!session) {
3595 dout(10) << __func__ << " no monitor session!" << dendl;
3596 return true;
3597 }
3598 if (!session->is_capable("osd", MON_CAP_X)) {
3599 derr << __func__ << " received from entity "
3600 << "with insufficient privileges " << session->caps << dendl;
3601 return true;
3602 }
3603 // Always forward the beacon to the leader, even if they are the same as
3604 // the old one. The leader will mark as down osds that haven't sent
3605 // beacon for a few minutes.
3606 return false;
3607 }
3608
// Process an osd beacon on the leader: refresh last_osd_report and
// osd_epochs, and feed the per-pg last-epoch-clean reports.  Returns false
// in every path -- a beacon never requires a map proposal by itself.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // ignore beacons from anything but the osd's current up incarnation
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // refresh liveness bookkeeping for this osd
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }
  return false;
}
3638
3639 // ---------------
3640 // map helpers
3641
3642 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3643 {
3644 op->mark_osdmon_event(__func__);
3645 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3646 << " start " << start << dendl;
3647 if (start == 0)
3648 send_full(op);
3649 else
3650 send_incremental(op, start);
3651 }
3652
3653
3654 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
3655 {
3656 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3657 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
3658 r->oldest_map = get_first_committed();
3659 r->newest_map = osdmap.get_epoch();
3660 return r;
3661 }
3662
// Build an MOSDMap carrying incremental maps for [from, to], encoded for
// 'features'.  If an incremental is missing for some epoch, the full map
// for that epoch is included instead; aborts if neither exists.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; 'e > 0' guards against epoch_t underflow when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // no incremental stored for this epoch; fall back to the full map
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
3696
3697 void OSDMonitor::send_full(MonOpRequestRef op)
3698 {
3699 op->mark_osdmon_event(__func__);
3700 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
3701 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
3702 }
3703
3704 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
3705 {
3706 op->mark_osdmon_event(__func__);
3707
3708 MonSession *s = op->get_session();
3709 ceph_assert(s);
3710
3711 if (s->proxy_con) {
3712 // oh, we can tell the other mon to do it
3713 dout(10) << __func__ << " asking proxying mon to send_incremental from "
3714 << first << dendl;
3715 MRoute *r = new MRoute(s->proxy_tid, NULL);
3716 r->send_osdmap_first = first;
3717 s->proxy_con->send_message(r);
3718 op->mark_event("reply: send routed send_osdmap_first reply");
3719 } else {
3720 // do it ourselves
3721 send_incremental(first, s, false, op);
3722 }
3723 }
3724
// Send incremental maps [first .. current] to a session, tracking the
// session's osd_epoch so we never resend.  When 'req' is set we send a
// single reply batch and return; otherwise we push message-sized batches
// directly on the session's connection (all of them unless 'onetime').
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  // requested range starts before our oldest stored map: start from a full
  // map at the oldest committed epoch instead
  if (first < get_first_committed()) {
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    // share removed snaps during the gap
    get_removed_snaps_range(first, m->oldest_map, &m->gap_removed_snaps);

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // reply mode: one message only, record progress and stop here
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // cap each message at osd_map_message_max epochs
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3790
3791 void OSDMonitor::get_removed_snaps_range(
3792 epoch_t start, epoch_t end,
3793 mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps)
3794 {
3795 // we only care about pools that exist now.
3796 for (auto& p : osdmap.get_pools()) {
3797 auto& t = (*gap_removed_snaps)[p.first];
3798 for (epoch_t epoch = start; epoch < end; ++epoch) {
3799 string k = make_snap_epoch_key(p.first, epoch);
3800 bufferlist v;
3801 mon->store->get(OSD_SNAP_PREFIX, k, v);
3802 if (v.length()) {
3803 auto q = v.cbegin();
3804 OSDMap::snap_interval_set_t snaps;
3805 decode(snaps, q);
3806 t.union_of(snaps);
3807 }
3808 }
3809 dout(10) << __func__ << " " << p.first << " " << t << dendl;
3810 }
3811 }
3812
// Convenience wrapper: fetch the incremental map for 'ver' encoded with
// the quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
3817
// Re-encode an incremental map (in place in 'bl') for the given feature
// set, also re-encoding any embedded full map or crush map it carries.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
3845
3846 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
3847 {
3848 OSDMap m;
3849 auto q = bl.cbegin();
3850 m.decode(q);
3851 // always encode with subset of osdmap's canonical features
3852 uint64_t f = features & m.get_encoding_features();
3853 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
3854 << dendl;
3855 bl.clear();
3856 m.encode(bl, f | CEPH_FEATURE_RESERVED);
3857 }
3858
// Fetch the incremental map for 'ver' encoded for 'features', consulting
// (and populating) a cache keyed by the significant feature bits.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
3880
3881 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
3882 {
3883 bufferlist inc_bl;
3884 int err = get_version(ver, inc_bl);
3885 ceph_assert(err == 0);
3886 ceph_assert(inc_bl.length());
3887
3888 auto p = inc_bl.cbegin();
3889 inc.decode(p);
3890 dout(10) << __func__ << " "
3891 << " epoch " << inc.epoch
3892 << " inc_crc " << inc.inc_crc
3893 << " full_crc " << inc.full_crc
3894 << " encode_features " << inc.encode_features << dendl;
3895 return 0;
3896 }
3897
// Rebuild the full osdmap for epoch 'ver' when the stored full map has
// been pruned: start from the closest pinned full map at or below 'ver'
// (or a newer cached full map), then replay incrementals up to 'ver'.
// The rebuilt map is encoded into 'bl'.  Returns 0 on success, or -ENOENT
// if no pinned map at or below 'ver' exists in the manifest.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    // should not happen; dump the pinned set before the assert fires
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // scan downward from ver-1 for a cached full map newer than the pinned
  // one so we have fewer incrementals to replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  // remember the last incremental's encode features so the final full map
  // is encoded consistently with it.
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
3997
3998 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
3999 {
4000 return get_version_full(ver, mon->get_quorum_con_features(), bl);
4001 }
4002
// Return (in 'bl') the encoded full map for epoch 'ver', re-encoded if
// necessary so it fits within 'features'.  If the full map has been
// pruned from the store (-ENOENT), rebuild it from the pinned-map
// manifest.  Results are cached per (epoch, significant-feature-mask)
// pair in full_osd_cache.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  // fast path: already cached for this feature mask
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?  the full map may have been pruned; reconstruct it from
    // the closest pinned map plus incrementals.
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
4029
4030 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4031 {
4032 dout(10) << "blacklist " << av << " until " << until << dendl;
4033 for (auto a : av.v) {
4034 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4035 a.set_type(entity_addr_t::TYPE_ANY);
4036 } else {
4037 a.set_type(entity_addr_t::TYPE_LEGACY);
4038 }
4039 pending_inc.new_blacklist[a] = until;
4040 }
4041 return pending_inc.epoch;
4042 }
4043
4044 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4045 {
4046 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4047 a.set_type(entity_addr_t::TYPE_ANY);
4048 } else {
4049 a.set_type(entity_addr_t::TYPE_LEGACY);
4050 }
4051 dout(10) << "blacklist " << a << " until " << until << dendl;
4052 pending_inc.new_blacklist[a] = until;
4053 return pending_inc.epoch;
4054 }
4055
4056
// Walk all "osdmap" subscriptions and send each subscriber any maps it is
// missing.  No-op until we have a committed osdmap.
void OSDMonitor::check_osdmap_subs()
{
  dout(10) << __func__ << dendl;
  if (!osdmap.get_epoch()) {
    return;
  }
  auto osdmap_subs = mon->session_map.subs.find("osdmap");
  if (osdmap_subs == mon->session_map.subs.end()) {
    return;
  }
  auto p = osdmap_subs->second->begin();
  while (!p.end()) {
    auto sub = *p;
    // advance the iterator before servicing the sub: check_osdmap_sub()
    // may remove a onetime sub from this list, which would otherwise
    // invalidate 'p'.
    ++p;
    check_osdmap_sub(sub);
  }
}
4074
// Service a single "osdmap" subscription: if the subscriber is behind the
// current epoch, send it either incrementals from sub->next onward or
// (for next == 0) the latest full map.  Onetime subs are then removed;
// ongoing subs are advanced past the current epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      // next == 0 means "just give me the latest"
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon->session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4090
// Walk all "osd_pg_creates" subscriptions and push pending pg-create
// messages to the subscribed (up) OSDs.  No-op if no OSDs are up.
void OSDMonitor::check_pg_creates_subs()
{
  if (!osdmap.get_num_up_osds()) {
    return;
  }
  // stateful subscriptions are a prerequisite for this mechanism
  ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
  mon->with_session_map([this](const MonSessionMap& session_map) {
      auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
      if (pg_creates_subs == session_map.subs.end()) {
	return;
      }
      for (auto sub : *pg_creates_subs->second) {
	check_pg_creates_sub(sub);
      }
    });
}
4107
// Service a single "osd_pg_creates" subscription: send any queued
// pg-create messages to the subscribing OSD and advance sub->next to the
// epoch it is now current through.
void OSDMonitor::check_pg_creates_sub(Subscription *sub)
{
  dout(20) << __func__ << " .. " << sub->session->name << dendl;
  ceph_assert(sub->type == "osd_pg_creates");
  // only send these if the OSD is up. we will check_subs() when they do
  // come up so they will get the creates then.
  if (sub->session->name.is_osd() &&
      mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
    sub->next = send_pg_creates(sub->session->name.num(),
				sub->session->con.get(),
				sub->next);
  }
}
4121
// Stage enabling an application on a pool in the pending incremental,
// optionally recording one key/value of application metadata.  Requires
// paxos to be plugged and the service writeable (caller's responsibility),
// and a luminous+ cluster.
//
// Note: std::map::insert() is used, so an already-present app_name entry
// (including its existing key/value metadata) is left untouched.
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value)
{
  ceph_assert(paxos->is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from any already-staged pending copy of the pool, else from the
  // committed map, so we do not clobber other pending changes.
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    p.application_metadata.insert({app_name, {}});
  } else {
    p.application_metadata.insert({app_name, {{app_key, app_value}}});
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4150
4151 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4152 pool_opts_t::key_t opt,
4153 pool_opts_t::value_t val)
4154 {
4155 auto p = pending_inc.new_pools.try_emplace(
4156 pool_id, *osdmap.get_pg_pool(pool_id));
4157 p.first->second.opts.set(opt, val);
4158 }
4159
// Scan 'pools' for pools whose PGs still need to be created and queue
// them on 'creating_pgs'.  Pools already created, unchanged since the
// last scan, with a missing crush rule, or queued for removal are
// skipped.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    // pools whose crush rule cannot be resolved cannot be mapped, so
    // creating their PGs would be futile; skip them.
    int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
					 pool.get_type(), pool.get_size());
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
			      created, modified);
    queued++;
  }
  return queued;
}
4199
// Rebuild creating_pgs_by_osd_epoch from creating_pgs: map each creating
// PG to its current acting primary, keyed by the epoch the create message
// should reference.  The epoch is bumped to the current mapping epoch
// whenever the PG's target OSD changed (or the PG is new), so the OSD
// sees a create message it has not already processed.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we already sent for
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  // NOTE(review): this branch runs for every epoch-set that does
	  // not contain spgid, repeatedly resetting 'mapped'; apparently
	  // harmless since a later match overwrites it — confirm intent.
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4247
// Send pending pg-create messages for osd.'osd' over 'con', starting from
// epoch 'next'.  Uses MOSDPGCreate2 for nautilus+ OSDs and the legacy
// MOSDPGCreate otherwise.  Returns the epoch the subscriber is current
// through (last sent + 1), or 'next' unchanged if nothing was sent.
//
// NOTE(review): the dout(30) below streams creating_pgs_by_osd_epoch
// before creating_pgs_lock is taken; presumably safe under the monitor's
// dispatch model — confirm if this can race with update_creating_pgs().
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  bool old = osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS;

  epoch_t last = 0;
  // only epochs >= 'next' are of interest; earlier ones were already sent
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.first, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.second);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, create->second);
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4312
4313 // TICK
4314
4315
// Periodic maintenance, driven by the monitor's tick timer.  On every
// monitor: refresh the osdmap manifest.  On the leader only: mark
// unresponsive OSDs down, auto-mark long-down OSDs out, expire blacklist
// entries, prune purged snaps and maps, update pool status, and propose
// a new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending state, which only the leader owns
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long osd.o has been down
      // advance the iterator now: the erase(o) at the bottom of the loop
      // removes the element we just looked at, which would otherwise
      // invalidate 'i'.
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;  // not out yet; keep it in down_pending_out
      }

      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
4458
// Mark down any up OSD that has not reported (beaconed) within
// mon_osd_report_timeout.  'last_osd_report' tracks the last time each
// OSD was heard from; entries for nonexistent OSDs are pruned.  Returns
// true if at least one OSD was newly marked down (caller should propose).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago.  marking down" << dendl;
	// NOTE(review): setting CEPH_OSD_UP in new_state marks the osd
	// DOWN — incremental state bits appear to be XORed into the
	// current state on apply; confirm against OSDMap::Incremental.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
4497
4498 static void dump_cpu_list(Formatter *f, const char *name,
4499 const string& strlist)
4500 {
4501 cpu_set_t cpu_set;
4502 size_t cpu_set_size;
4503 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
4504 return;
4505 }
4506 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
4507 f->open_array_section(name);
4508 for (auto cpu : cpus) {
4509 f->dump_int("cpu", cpu);
4510 }
4511 f->close_section();
4512 }
4513
// Dump monitor-side OSD service state for debugging/inspection: the full
// osdmap, per-OSD metadata, committed version range, the crush map, and
// (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      // errors are ignored here (nullptr err stream): missing metadata
      // for one osd should not abort the dump
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
4544
namespace {
  // All properties understood by "osd pool get"; used to validate and
  // iterate requested pool attributes in preprocess_command().
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of 'first' that are not present in 'second'
  // (plain set subtraction).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
4578
4579
4580 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4581 {
4582 op->mark_osdmon_event(__func__);
4583 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4584 int r = 0;
4585 bufferlist rdata;
4586 stringstream ss, ds;
4587
4588 cmdmap_t cmdmap;
4589 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4590 string rs = ss.str();
4591 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4592 return true;
4593 }
4594
4595 MonSession *session = op->get_session();
4596 if (!session) {
4597 derr << __func__ << " no session" << dendl;
4598 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4599 return true;
4600 }
4601
4602 string prefix;
4603 cmd_getval(cct, cmdmap, "prefix", prefix);
4604
4605 string format;
4606 cmd_getval(cct, cmdmap, "format", format, string("plain"));
4607 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4608
4609 if (prefix == "osd stat") {
4610 osdmap.print_summary(f.get(), ds, "", true);
4611 if (f)
4612 f->flush(rdata);
4613 else
4614 rdata.append(ds);
4615 }
4616 else if (prefix == "osd dump" ||
4617 prefix == "osd tree" ||
4618 prefix == "osd tree-from" ||
4619 prefix == "osd ls" ||
4620 prefix == "osd getmap" ||
4621 prefix == "osd getcrushmap" ||
4622 prefix == "osd ls-tree") {
4623 string val;
4624
4625 epoch_t epoch = 0;
4626 int64_t epochnum;
4627 cmd_getval(cct, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4628 epoch = epochnum;
4629
4630 bufferlist osdmap_bl;
4631 int err = get_version_full(epoch, osdmap_bl);
4632 if (err == -ENOENT) {
4633 r = -ENOENT;
4634 ss << "there is no map for epoch " << epoch;
4635 goto reply;
4636 }
4637 ceph_assert(err == 0);
4638 ceph_assert(osdmap_bl.length());
4639
4640 OSDMap *p;
4641 if (epoch == osdmap.get_epoch()) {
4642 p = &osdmap;
4643 } else {
4644 p = new OSDMap;
4645 p->decode(osdmap_bl);
4646 }
4647
4648 auto sg = make_scope_guard([&] {
4649 if (p != &osdmap) {
4650 delete p;
4651 }
4652 });
4653
4654 if (prefix == "osd dump") {
4655 stringstream ds;
4656 if (f) {
4657 f->open_object_section("osdmap");
4658 p->dump(f.get());
4659 f->close_section();
4660 f->flush(ds);
4661 } else {
4662 p->print(ds);
4663 }
4664 rdata.append(ds);
4665 if (!f)
4666 ds << " ";
4667 } else if (prefix == "osd ls") {
4668 if (f) {
4669 f->open_array_section("osds");
4670 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4671 if (osdmap.exists(i)) {
4672 f->dump_int("osd", i);
4673 }
4674 }
4675 f->close_section();
4676 f->flush(ds);
4677 } else {
4678 bool first = true;
4679 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4680 if (osdmap.exists(i)) {
4681 if (!first)
4682 ds << "\n";
4683 first = false;
4684 ds << i;
4685 }
4686 }
4687 }
4688 rdata.append(ds);
4689 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
4690 string bucket;
4691 if (prefix == "osd tree-from") {
4692 cmd_getval(cct, cmdmap, "bucket", bucket);
4693 if (!osdmap.crush->name_exists(bucket)) {
4694 ss << "bucket '" << bucket << "' does not exist";
4695 r = -ENOENT;
4696 goto reply;
4697 }
4698 int id = osdmap.crush->get_item_id(bucket);
4699 if (id >= 0) {
4700 ss << "\"" << bucket << "\" is not a bucket";
4701 r = -EINVAL;
4702 goto reply;
4703 }
4704 }
4705
4706 vector<string> states;
4707 cmd_getval(cct, cmdmap, "states", states);
4708 unsigned filter = 0;
4709 for (auto& s : states) {
4710 if (s == "up") {
4711 filter |= OSDMap::DUMP_UP;
4712 } else if (s == "down") {
4713 filter |= OSDMap::DUMP_DOWN;
4714 } else if (s == "in") {
4715 filter |= OSDMap::DUMP_IN;
4716 } else if (s == "out") {
4717 filter |= OSDMap::DUMP_OUT;
4718 } else if (s == "destroyed") {
4719 filter |= OSDMap::DUMP_DESTROYED;
4720 } else {
4721 ss << "unrecognized state '" << s << "'";
4722 r = -EINVAL;
4723 goto reply;
4724 }
4725 }
4726 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
4727 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4728 ss << "cannot specify both 'in' and 'out'";
4729 r = -EINVAL;
4730 goto reply;
4731 }
4732 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4733 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4734 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4735 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4736 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4737 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4738 ss << "can specify only one of 'up', 'down' and 'destroyed'";
4739 r = -EINVAL;
4740 goto reply;
4741 }
4742 if (f) {
4743 f->open_object_section("tree");
4744 p->print_tree(f.get(), NULL, filter, bucket);
4745 f->close_section();
4746 f->flush(ds);
4747 } else {
4748 p->print_tree(NULL, &ds, filter, bucket);
4749 }
4750 rdata.append(ds);
4751 } else if (prefix == "osd getmap") {
4752 rdata.append(osdmap_bl);
4753 ss << "got osdmap epoch " << p->get_epoch();
4754 } else if (prefix == "osd getcrushmap") {
4755 p->crush->encode(rdata, mon->get_quorum_con_features());
4756 ss << p->get_crush_version();
4757 } else if (prefix == "osd ls-tree") {
4758 string bucket_name;
4759 cmd_getval(cct, cmdmap, "name", bucket_name);
4760 set<int> osds;
4761 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4762 if (r == -ENOENT) {
4763 ss << "\"" << bucket_name << "\" does not exist";
4764 goto reply;
4765 } else if (r < 0) {
4766 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4767 goto reply;
4768 }
4769
4770 if (f) {
4771 f->open_array_section("osds");
4772 for (auto &i : osds) {
4773 if (osdmap.exists(i)) {
4774 f->dump_int("osd", i);
4775 }
4776 }
4777 f->close_section();
4778 f->flush(ds);
4779 } else {
4780 bool first = true;
4781 for (auto &i : osds) {
4782 if (osdmap.exists(i)) {
4783 if (!first)
4784 ds << "\n";
4785 first = false;
4786 ds << i;
4787 }
4788 }
4789 }
4790
4791 rdata.append(ds);
4792 }
4793 } else if (prefix == "osd getmaxosd") {
4794 if (f) {
4795 f->open_object_section("getmaxosd");
4796 f->dump_unsigned("epoch", osdmap.get_epoch());
4797 f->dump_int("max_osd", osdmap.get_max_osd());
4798 f->close_section();
4799 f->flush(rdata);
4800 } else {
4801 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4802 rdata.append(ds);
4803 }
4804 } else if (prefix == "osd utilization") {
4805 string out;
4806 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4807 if (f)
4808 f->flush(rdata);
4809 else
4810 rdata.append(out);
4811 r = 0;
4812 goto reply;
4813 } else if (prefix == "osd find") {
4814 int64_t osd;
4815 if (!cmd_getval(cct, cmdmap, "id", osd)) {
4816 ss << "unable to parse osd id value '"
4817 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4818 r = -EINVAL;
4819 goto reply;
4820 }
4821 if (!osdmap.exists(osd)) {
4822 ss << "osd." << osd << " does not exist";
4823 r = -ENOENT;
4824 goto reply;
4825 }
4826 string format;
4827 cmd_getval(cct, cmdmap, "format", format);
4828 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4829 f->open_object_section("osd_location");
4830 f->dump_int("osd", osd);
4831 f->dump_object("addrs", osdmap.get_addrs(osd));
4832 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
4833
4834 // try to identify host, pod/container name, etc.
4835 map<string,string> m;
4836 load_metadata(osd, m, nullptr);
4837 if (auto p = m.find("hostname"); p != m.end()) {
4838 f->dump_string("host", p->second);
4839 }
4840 for (auto& k : {
4841 "pod_name", "pod_namespace", // set by rook
4842 "container_name" // set by ceph-ansible
4843 }) {
4844 if (auto p = m.find(k); p != m.end()) {
4845 f->dump_string(k, p->second);
4846 }
4847 }
4848
4849 // crush is helpful too
4850 f->open_object_section("crush_location");
4851 map<string,string> loc = osdmap.crush->get_full_location(osd);
4852 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4853 f->dump_string(p->first.c_str(), p->second);
4854 f->close_section();
4855 f->close_section();
4856 f->flush(rdata);
} else if (prefix == "osd metadata") {
  // Dump collected daemon metadata, either for a single OSD (when an
  // "id" argument is supplied) or for every OSD in the map.
  int64_t osd = -1;
  if (cmd_vartype_stringify(cmdmap["id"]).size() &&
      !cmd_getval(cct, cmdmap, "id", osd)) {
    ss << "unable to parse osd id value '"
       << cmd_vartype_stringify(cmdmap["id"]) << "'";
    r = -EINVAL;
    goto reply;
  }
  if (osd >= 0 && !osdmap.exists(osd)) {
    ss << "osd." << osd << " does not exist";
    r = -ENOENT;
    goto reply;
  }
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  if (osd >= 0) {
    // single OSD: a metadata error is returned to the caller
    f->open_object_section("osd_metadata");
    f->dump_unsigned("id", osd);
    r = dump_osd_metadata(osd, f.get(), &ss);
    if (r < 0)
      goto reply;
    f->close_section();
  } else {
    // all OSDs: missing/invalid metadata for a single daemon is logged
    // and skipped so the remaining daemons are still reported
    r = 0;
    f->open_array_section("osd_metadata");
    for (int i=0; i<osdmap.get_max_osd(); ++i) {
      if (osdmap.exists(i)) {
        f->open_object_section("osd");
        f->dump_unsigned("id", i);
        r = dump_osd_metadata(i, f.get(), NULL);
        if (r == -EINVAL || r == -ENOENT) {
          // Drop error, continue to get other daemons' metadata
          dout(4) << "No metadata for osd." << i << dendl;
          r = 0;
        } else if (r < 0) {
          // Unexpected error
          goto reply;
        }
        f->close_section();
      }
    }
    f->close_section();
  }
  f->flush(rdata);
} else if (prefix == "osd versions") {
  // Histogram of the "ceph_version" metadata value across all OSDs.
  if (!f)
    f.reset(Formatter::create("json-pretty"));
  count_metadata("ceph_version", f.get());
  f->flush(rdata);
  r = 0;
} else if (prefix == "osd count-metadata") {
  // Histogram of an arbitrary metadata key ("property") across all OSDs.
  if (!f)
    f.reset(Formatter::create("json-pretty"));
  string field;
  cmd_getval(cct, cmdmap, "property", field);
  count_metadata(field, f.get());
  f->flush(rdata);
  r = 0;
} else if (prefix == "osd numa-status") {
  // Report per-OSD NUMA placement (network node, storage node, CPU
  // affinity) as a table by default or structured output via 'f'.
  TextTable tbl;
  if (f) {
    f->open_array_section("osds");
  } else {
    tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
  }
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      map<string,string> m;
      ostringstream err;
      if (load_metadata(i, m, &err) < 0) {
        // no stored metadata for this osd; skip it
        continue;
      }
      string host;
      auto p = m.find("hostname");
      if (p != m.end()) {
        host = p->second;
      }
      if (f) {
        f->open_object_section("osd");
        f->dump_int("osd", i);
        f->dump_string("host", host);
        // single-valued numa fields
        for (auto n : { "network_numa_node", "objectstore_numa_node",
                        "numa_node" }) {
          p = m.find(n);
          if (p != m.end()) {
            f->dump_int(n, atoi(p->second.c_str()));
          }
        }
        // comma-separated multi-node fields
        for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
          p = m.find(n);
          if (p != m.end()) {
            list<string> ls = get_str_list(p->second, ",");
            f->open_array_section(n);
            for (auto node : ls) {
              f->dump_int("node", atoi(node.c_str()));
            }
            f->close_section();
          }
        }
        for (auto n : { "numa_node_cpus" }) {
          p = m.find(n);
          if (p != m.end()) {
            dump_cpu_list(f.get(), n, p->second);
          }
        }
        f->close_section();
      } else {
        tbl << i;
        tbl << host;
        p = m.find("network_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        p = m.find("objectstore_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        // AFFINITY and CPUS are only shown when both values are known
        p = m.find("numa_node");
        auto q = m.find("numa_node_cpus");
        if (p != m.end() && q != m.end()) {
          tbl << p->second;
          tbl << q->second;
        } else {
          tbl << "-";
          tbl << "-";
        }
        tbl << TextTable::endrow;
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(stringify(tbl));
  }
} else if (prefix == "osd map") {
  // Compute placement of a (pool, object[, namespace]) triple: raw
  // pgid, mapped pgid, and the up/acting OSD sets with primaries.
  string poolstr, objstr, namespacestr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  cmd_getval(cct, cmdmap, "object", objstr);
  cmd_getval(cct, cmdmap, "nspace", namespacestr);

  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "pool " << poolstr << " does not exist";
    r = -ENOENT;
    goto reply;
  }
  object_locator_t oloc(pool, namespacestr);
  object_t oid(objstr);
  pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  vector<int> up, acting;
  int up_p, acting_p;
  osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

  // user-facing object name includes the namespace prefix, if any
  string fullobjname;
  if (!namespacestr.empty())
    fullobjname = namespacestr + string("/") + oid.name;
  else
    fullobjname = oid.name;
  if (f) {
    f->open_object_section("osd_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_string("pool", poolstr);
    f->dump_int("pool_id", pool);
    f->dump_stream("objname") << fullobjname;
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("up_primary", up_p);
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("acting_primary", acting_p);
    f->close_section(); // osd_map
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pool '" << poolstr << "' (" << pool << ")"
       << " object '" << fullobjname << "' ->"
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
       << pg_vector_string(acting) << ", p" << acting_p << ")";
    rdata.append(ds);
  }

} else if (prefix == "pg map") {
  // Map an explicit pgid to its up/acting OSD sets.
  pg_t pgid;
  string pgidstr;
  cmd_getval(cct, cmdmap, "pgid", pgidstr);
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    r = -EINVAL;
    goto reply;
  }
  vector<int> up, acting;
  if (!osdmap.have_pg_pool(pgid.pool())) {
    ss << "pg '" << pgidstr << "' does not exist";
    r = -ENOENT;
    goto reply;
  }
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  // NOTE(review): the lookup uses the raw pgid, not mpgid, whereas the
  // "osd map" branch above maps mpgid — confirm the asymmetry is
  // intentional.
  osdmap.pg_to_up_acting_osds(pgid, up, acting);
  if (f) {
    f->open_object_section("pg_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (auto osd : up) {
      f->dump_int("up_osd", osd);
    }
    f->close_section();
    f->open_array_section("acting");
    for (auto osd : acting) {
      f->dump_int("acting_osd", osd);
    }
    f->close_section();
    f->close_section();
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up " << up << " acting " << acting;
    rdata.append(ds);
  }
  goto reply;

} else if (prefix == "osd lspools") {
  // List pool ids and names; one "<id> <name>" per line in plain text.
  if (f)
    f->open_array_section("pools");
  for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
       p != osdmap.pools.end();
       ++p) {
    if (f) {
      f->open_object_section("pool");
      f->dump_int("poolnum", p->first);
      f->dump_string("poolname", osdmap.pool_name[p->first]);
      f->close_section();
    } else {
      ds << p->first << ' ' << osdmap.pool_name[p->first];
      // no trailing newline after the last entry
      if (next(p) != osdmap.pools.end()) {
        ds << '\n';
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(ds);
  }
  rdata.append(ds);
} else if (prefix == "osd blacklist ls") {
  // List blacklisted client addresses with their expiry times.
  if (f)
    f->open_array_section("blacklist");

  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (f) {
      f->open_object_section("entry");
      f->dump_string("addr", p->first.get_legacy_str());
      f->dump_stream("until") << p->second;
      f->close_section();
    } else {
      // note: this local ss shadows the outer status stream
      stringstream ss;
      string s;
      ss << p->first << " " << p->second;
      getline(ss, s);
      s += "\n";
      rdata.append(s);
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  }
  // summary line goes to the (outer) status stream in both modes
  ss << "listed " << osdmap.blacklist.size() << " entries";

} else if (prefix == "osd pool ls") {
  // List pool names; with "detail", dump full pool definitions.
  string detail;
  cmd_getval(cct, cmdmap, "detail", detail);
  if (!f && detail == "detail") {
    // plain-text detail has its own pretty-printer
    ostringstream ss;
    osdmap.print_pools(ss);
    rdata.append(ss.str());
  } else {
    if (f)
      f->open_array_section("pools");
    for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
         it != osdmap.get_pools().end();
         ++it) {
      if (f) {
        if (detail == "detail") {
          f->open_object_section("pool");
          f->dump_string("pool_name", osdmap.get_pool_name(it->first));
          it->second.dump(f.get());
          f->close_section();
        } else {
          f->dump_string("pool_name", osdmap.get_pool_name(it->first));
        }
      } else {
        rdata.append(osdmap.get_pool_name(it->first) + "\n");
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    }
  }

} else if (prefix == "osd crush get-tunable") {
  // Report the value of a single CRUSH tunable.  Only
  // "straw_calc_version" is readable through this command today.
  string tunable;
  cmd_getval(cct, cmdmap, "tunable", tunable);
  ostringstream rss;
  if (f)
    f->open_object_section("tunable");
  if (tunable == "straw_calc_version") {
    if (f)
      f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
    else
      rss << osdmap.crush->get_straw_calc_version() << "\n";
  } else {
    // Previously this returned EINVAL with an empty message; name the
    // tunable that was not recognized so the caller can tell why.
    ss << "unrecognized tunable '" << tunable << "'";
    r = -EINVAL;
    goto reply;
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(rss.str());
  }
  r = 0;

} else if (prefix == "osd pool get") {
  // Report one pool property ("var") or every applicable property
  // ("all") for the named pool.
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    r = -ENOENT;
    goto reply;
  }

  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // map from user-visible property name to the internal enum value
  typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
  const choices_map_t ALL_CHOICES = {
    {"size", SIZE},
    {"min_size", MIN_SIZE},
    {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
    {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
    {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
    {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
    {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
    {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
    {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
    {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
    {"use_gmt_hitset", USE_GMT_HITSET},
    {"target_max_objects", TARGET_MAX_OBJECTS},
    {"target_max_bytes", TARGET_MAX_BYTES},
    {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
    {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
    {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
    {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
    {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
    {"erasure_code_profile", ERASURE_CODE_PROFILE},
    {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
    {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
    {"fast_read", FAST_READ},
    {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
    {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
    {"scrub_min_interval", SCRUB_MIN_INTERVAL},
    {"scrub_max_interval", SCRUB_MAX_INTERVAL},
    {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
    {"recovery_priority", RECOVERY_PRIORITY},
    {"recovery_op_priority", RECOVERY_OP_PRIORITY},
    {"scrub_priority", SCRUB_PRIORITY},
    {"compression_mode", COMPRESSION_MODE},
    {"compression_algorithm", COMPRESSION_ALGORITHM},
    {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
    {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
    {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
    {"csum_type", CSUM_TYPE},
    {"csum_max_block", CSUM_MAX_BLOCK},
    {"csum_min_block", CSUM_MIN_BLOCK},
    {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
    {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
    {"pg_num_min", PG_NUM_MIN},
    {"target_size_bytes", TARGET_SIZE_BYTES},
    {"target_size_ratio", TARGET_SIZE_RATIO},
    {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
  };

  typedef std::set<osd_pool_get_choices> choices_set_t;

  // properties that only apply to cache-tier pools ...
  const choices_set_t ONLY_TIER_CHOICES = {
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
  };
  // ... and only to erasure-coded pools
  const choices_set_t ONLY_ERASURE_CHOICES = {
    EC_OVERWRITES, ERASURE_CODE_PROFILE
  };

  choices_set_t selected_choices;
  if (var == "all") {
    // start with everything, then strip what does not apply to this pool
    for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
        it != ALL_CHOICES.end(); ++it) {
      selected_choices.insert(it->second);
    }

    if(!p->is_tier()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_TIER_CHOICES);
    }

    if(!p->is_erasure()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_ERASURE_CHOICES);
    }
} else /* var != "all" */ {
  // Single property: validate the name before dereferencing the lookup
  // result.  The command schema is expected to restrict "var", but
  // dereferencing end() here would be undefined behavior if an
  // unrecognized name ever slipped through.
  choices_map_t::const_iterator found = ALL_CHOICES.find(var);
  if (found == ALL_CHOICES.end()) {
    ss << "invalid variable '" << var << "'";
    r = -EINVAL;
    goto reply;
  }
  osd_pool_get_choices selected = found->second;

  // tier-only properties are rejected on non-tier pools
  if (!p->is_tier() &&
      ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
    ss << "pool '" << poolstr
       << "' is not a tier pool: variable not applicable";
    r = -EACCES;
    goto reply;
  }

  // erasure-only properties are rejected on non-erasure pools
  if (!p->is_erasure() &&
      ONLY_ERASURE_CHOICES.find(selected)
      != ONLY_ERASURE_CHOICES.end()) {
    ss << "pool '" << poolstr
       << "' is not a erasure pool: variable not applicable";
    r = -EACCES;
    goto reply;
  }

  // an optional pool option that is simply unset is reported as ENOENT
  if (pool_opts_t::is_opt_name(var) &&
      !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
    ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
    r = -ENOENT;
    goto reply;
  }

  selected_choices.insert(selected);
}
5329
if (f) {
  // structured output: one section per pool, one entry per selected
  // property
  f->open_object_section("pool");
  f->dump_string("pool", poolstr);
  f->dump_int("pool_id", pool);
  for(choices_set_t::const_iterator it = selected_choices.begin();
      it != selected_choices.end(); ++it) {
    // reverse-map the enum back to its user-visible name for dump keys
    choices_map_t::const_iterator i;
    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
      if (i->second == *it) {
        break;
      }
    }
    ceph_assert(i != ALL_CHOICES.end());
    switch(*it) {
      case PG_NUM:
        f->dump_int("pg_num", p->get_pg_num());
        break;
      case PGP_NUM:
        f->dump_int("pgp_num", p->get_pgp_num());
        break;
      case SIZE:
        f->dump_int("size", p->get_size());
        break;
      case MIN_SIZE:
        f->dump_int("min_size", p->get_min_size());
        break;
      case CRUSH_RULE:
        // fall back to the numeric id if the rule has no name
        if (osdmap.crush->rule_exists(p->get_crush_rule())) {
          f->dump_string("crush_rule", osdmap.crush->get_rule_name(
                           p->get_crush_rule()));
        } else {
          f->dump_string("crush_rule", stringify(p->get_crush_rule()));
        }
        break;
      case EC_OVERWRITES:
        f->dump_bool("allow_ec_overwrites",
                     p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
        break;
      case PG_AUTOSCALE_MODE:
        f->dump_string("pg_autoscale_mode",
                       pg_pool_t::get_pg_autoscale_mode_name(
                         p->pg_autoscale_mode));
        break;
      // boolean pool flags share one dump path keyed by the name
      case HASHPSPOOL:
      case NODELETE:
      case NOPGCHANGE:
      case NOSIZECHANGE:
      case WRITE_FADVISE_DONTNEED:
      case NOSCRUB:
      case NODEEP_SCRUB:
        f->dump_bool(i->first.c_str(),
                     p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
        break;
      case HIT_SET_PERIOD:
        f->dump_int("hit_set_period", p->hit_set_period);
        break;
      case HIT_SET_COUNT:
        f->dump_int("hit_set_count", p->hit_set_count);
        break;
      case HIT_SET_TYPE:
        f->dump_string("hit_set_type",
                       HitSet::get_type_name(p->hit_set_params.get_type()));
        break;
      case HIT_SET_FPP:
        {
          // fpp only exists for bloom hit sets; when explicitly
          // requested on another type, that is an error (for "all" it
          // is silently skipped)
          if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
            BloomHitSet::Params *bloomp =
              static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
            f->dump_float("hit_set_fpp", bloomp->get_fpp());
          } else if(var != "all") {
            f->close_section();
            ss << "hit set is not of type Bloom; " <<
              "invalid to get a false positive rate!";
            r = -EINVAL;
            goto reply;
          }
        }
        break;
      case USE_GMT_HITSET:
        f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
        break;
      case TARGET_MAX_OBJECTS:
        f->dump_unsigned("target_max_objects", p->target_max_objects);
        break;
      case TARGET_MAX_BYTES:
        f->dump_unsigned("target_max_bytes", p->target_max_bytes);
        break;
      // ratios are stored in micro units; dump both representations
      case CACHE_TARGET_DIRTY_RATIO:
        f->dump_unsigned("cache_target_dirty_ratio_micro",
                         p->cache_target_dirty_ratio_micro);
        f->dump_float("cache_target_dirty_ratio",
                      ((float)p->cache_target_dirty_ratio_micro/1000000));
        break;
      case CACHE_TARGET_DIRTY_HIGH_RATIO:
        f->dump_unsigned("cache_target_dirty_high_ratio_micro",
                         p->cache_target_dirty_high_ratio_micro);
        f->dump_float("cache_target_dirty_high_ratio",
                      ((float)p->cache_target_dirty_high_ratio_micro/1000000));
        break;
      case CACHE_TARGET_FULL_RATIO:
        f->dump_unsigned("cache_target_full_ratio_micro",
                         p->cache_target_full_ratio_micro);
        f->dump_float("cache_target_full_ratio",
                      ((float)p->cache_target_full_ratio_micro/1000000));
        break;
      case CACHE_MIN_FLUSH_AGE:
        f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
        break;
      case CACHE_MIN_EVICT_AGE:
        f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
        break;
      case ERASURE_CODE_PROFILE:
        f->dump_string("erasure_code_profile", p->erasure_code_profile);
        break;
      case MIN_READ_RECENCY_FOR_PROMOTE:
        f->dump_int("min_read_recency_for_promote",
                    p->min_read_recency_for_promote);
        break;
      case MIN_WRITE_RECENCY_FOR_PROMOTE:
        f->dump_int("min_write_recency_for_promote",
                    p->min_write_recency_for_promote);
        break;
      case FAST_READ:
        f->dump_int("fast_read", p->fast_read);
        break;
      case HIT_SET_GRADE_DECAY_RATE:
        f->dump_int("hit_set_grade_decay_rate",
                    p->hit_set_grade_decay_rate);
        break;
      case HIT_SET_SEARCH_LAST_N:
        f->dump_int("hit_set_search_last_n",
                    p->hit_set_search_last_n);
        break;
      // optional pool options: dumped only when actually set
      case SCRUB_MIN_INTERVAL:
      case SCRUB_MAX_INTERVAL:
      case DEEP_SCRUB_INTERVAL:
      case RECOVERY_PRIORITY:
      case RECOVERY_OP_PRIORITY:
      case SCRUB_PRIORITY:
      case COMPRESSION_MODE:
      case COMPRESSION_ALGORITHM:
      case COMPRESSION_REQUIRED_RATIO:
      case COMPRESSION_MAX_BLOB_SIZE:
      case COMPRESSION_MIN_BLOB_SIZE:
      case CSUM_TYPE:
      case CSUM_MAX_BLOCK:
      case CSUM_MIN_BLOCK:
      case FINGERPRINT_ALGORITHM:
      case PG_NUM_MIN:
      case TARGET_SIZE_BYTES:
      case TARGET_SIZE_RATIO:
      case PG_AUTOSCALE_BIAS:
        pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
        if (p->opts.is_set(key)) {
          if(*it == CSUM_TYPE) {
            // csum_type is stored as an int but shown symbolically
            int64_t val;
            p->opts.get(pool_opts_t::CSUM_TYPE, &val);
            f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
          } else {
            p->opts.dump(i->first, f.get());
          }
        }
        break;
    }
  }
  f->close_section();
  f->flush(rdata);
} else /* !f */ {
  // plain-text output: one "name: value" line per selected property,
  // appended to rdata via the status stream
  for(choices_set_t::const_iterator it = selected_choices.begin();
      it != selected_choices.end(); ++it) {
    choices_map_t::const_iterator i;
    switch(*it) {
      case PG_NUM:
        ss << "pg_num: " << p->get_pg_num() << "\n";
        break;
      case PGP_NUM:
        ss << "pgp_num: " << p->get_pgp_num() << "\n";
        break;
      case SIZE:
        ss << "size: " << p->get_size() << "\n";
        break;
      case MIN_SIZE:
        ss << "min_size: " << p->get_min_size() << "\n";
        break;
      case CRUSH_RULE:
        // fall back to the numeric id if the rule has no name
        if (osdmap.crush->rule_exists(p->get_crush_rule())) {
          ss << "crush_rule: " << osdmap.crush->get_rule_name(
            p->get_crush_rule()) << "\n";
        } else {
          ss << "crush_rule: " << p->get_crush_rule() << "\n";
        }
        break;
      case PG_AUTOSCALE_MODE:
        ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
          p->pg_autoscale_mode) <<"\n";
        break;
      case HIT_SET_PERIOD:
        ss << "hit_set_period: " << p->hit_set_period << "\n";
        break;
      case HIT_SET_COUNT:
        ss << "hit_set_count: " << p->hit_set_count << "\n";
        break;
      case HIT_SET_TYPE:
        ss << "hit_set_type: " <<
          HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
        break;
      case HIT_SET_FPP:
        {
          // fpp only exists for bloom hit sets; explicit requests on
          // another type are an error ("all" silently skips it)
          if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
            BloomHitSet::Params *bloomp =
              static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
            ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
          } else if(var != "all") {
            ss << "hit set is not of type Bloom; " <<
              "invalid to get a false positive rate!";
            r = -EINVAL;
            goto reply;
          }
        }
        break;
      case USE_GMT_HITSET:
        ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
        break;
      case TARGET_MAX_OBJECTS:
        ss << "target_max_objects: " << p->target_max_objects << "\n";
        break;
      case TARGET_MAX_BYTES:
        ss << "target_max_bytes: " << p->target_max_bytes << "\n";
        break;
      // micro-unit ratios are shown as plain fractions here
      case CACHE_TARGET_DIRTY_RATIO:
        ss << "cache_target_dirty_ratio: "
           << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
        break;
      case CACHE_TARGET_DIRTY_HIGH_RATIO:
        ss << "cache_target_dirty_high_ratio: "
           << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
        break;
      case CACHE_TARGET_FULL_RATIO:
        ss << "cache_target_full_ratio: "
           << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
        break;
      case CACHE_MIN_FLUSH_AGE:
        ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
        break;
      case CACHE_MIN_EVICT_AGE:
        ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
        break;
      case ERASURE_CODE_PROFILE:
        ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
        break;
      case MIN_READ_RECENCY_FOR_PROMOTE:
        ss << "min_read_recency_for_promote: " <<
          p->min_read_recency_for_promote << "\n";
        break;
      case HIT_SET_GRADE_DECAY_RATE:
        ss << "hit_set_grade_decay_rate: " <<
          p->hit_set_grade_decay_rate << "\n";
        break;
      case HIT_SET_SEARCH_LAST_N:
        ss << "hit_set_search_last_n: " <<
          p->hit_set_search_last_n << "\n";
        break;
      case EC_OVERWRITES:
        ss << "allow_ec_overwrites: " <<
          (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
          "\n";
        break;
      // boolean pool flags: reverse-map the enum to its name first
      case HASHPSPOOL:
      case NODELETE:
      case NOPGCHANGE:
      case NOSIZECHANGE:
      case WRITE_FADVISE_DONTNEED:
      case NOSCRUB:
      case NODEEP_SCRUB:
        for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
          if (i->second == *it)
            break;
        }
        ceph_assert(i != ALL_CHOICES.end());
        ss << i->first << ": " <<
          (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
           "true" : "false") << "\n";
        break;
      case MIN_WRITE_RECENCY_FOR_PROMOTE:
        ss << "min_write_recency_for_promote: " <<
          p->min_write_recency_for_promote << "\n";
        break;
      case FAST_READ:
        ss << "fast_read: " << p->fast_read << "\n";
        break;
      // optional pool options: printed only when actually set
      case SCRUB_MIN_INTERVAL:
      case SCRUB_MAX_INTERVAL:
      case DEEP_SCRUB_INTERVAL:
      case RECOVERY_PRIORITY:
      case RECOVERY_OP_PRIORITY:
      case SCRUB_PRIORITY:
      case COMPRESSION_MODE:
      case COMPRESSION_ALGORITHM:
      case COMPRESSION_REQUIRED_RATIO:
      case COMPRESSION_MAX_BLOB_SIZE:
      case COMPRESSION_MIN_BLOB_SIZE:
      case CSUM_TYPE:
      case CSUM_MAX_BLOCK:
      case CSUM_MIN_BLOCK:
      case FINGERPRINT_ALGORITHM:
      case PG_NUM_MIN:
      case TARGET_SIZE_BYTES:
      case TARGET_SIZE_RATIO:
      case PG_AUTOSCALE_BIAS:
        for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
          if (i->second == *it)
            break;
        }
        ceph_assert(i != ALL_CHOICES.end());
        {
          pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
          if (p->opts.is_set(key)) {
            if(key == pool_opts_t::CSUM_TYPE) {
              // csum_type is stored as an int but shown symbolically
              int64_t val;
              p->opts.get(key, &val);
              ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
            } else {
              ss << i->first << ": " << p->opts.get(key) << "\n";
            }
          }
        }
        break;
    }
    // flush each line to rdata and reuse the status stream
    rdata.append(ss.str());
    ss.str("");
  }
}
r = 0;
} else if (prefix == "osd pool get-quota") {
  // Report a pool's object and byte quotas; 0 means unlimited and is
  // shown as "N/A" in the plain-text form.
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);

  int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
  if (poolid < 0) {
    ceph_assert(poolid == -ENOENT);
    ss << "unrecognized pool '" << pool_name << "'";
    r = -ENOENT;
    goto reply;
  }
  const pg_pool_t *p = osdmap.get_pg_pool(poolid);

  if (f) {
    f->open_object_section("pool_quotas");
    f->dump_string("pool_name", pool_name);
    f->dump_unsigned("pool_id", poolid);
    f->dump_unsigned("quota_max_objects", p->quota_max_objects);
    f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
    f->close_section();
    f->flush(rdata);
  } else {
    stringstream rs;
    rs << "quotas for pool '" << pool_name << "':\n"
       << " max objects: ";
    if (p->quota_max_objects == 0)
      rs << "N/A";
    else
      rs << si_u_t(p->quota_max_objects) << " objects";
    rs << "\n"
       << " max bytes : ";
    if (p->quota_max_bytes == 0)
      rs << "N/A";
    else
      rs << byte_u_t(p->quota_max_bytes);
    rdata.append(rs.str());
  }
  rdata.append("\n");
  r = 0;
} else if (prefix == "osd crush rule list" ||
           prefix == "osd crush rule ls") {
  // List crush rule names; both command spellings are equivalent.
  if (f) {
    f->open_array_section("rules");
    osdmap.crush->list_rules(f.get());
    f->close_section();
    f->flush(rdata);
  } else {
    // note: this local ss shadows the outer status stream
    ostringstream ss;
    osdmap.crush->list_rules(&ss);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush rule ls-by-class") {
  // List the names of crush rules that reference a device class.
  string class_name;
  cmd_getval(cct, cmdmap, "class", class_name);
  if (class_name.empty()) {
    ss << "no class specified";
    r = -EINVAL;
    goto reply;
  }
  set<int> rules;
  r = osdmap.crush->get_rules_by_class(class_name, &rules);
  if (r < 0) {
    ss << "failed to get rules by class '" << class_name << "'";
    goto reply;
  }
  if (f) {
    f->open_array_section("rules");
    for (auto &rule: rules) {
      f->dump_string("name", osdmap.crush->get_rule_name(rule));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream rs;
    for (auto &rule: rules) {
      rs << osdmap.crush->get_rule_name(rule) << "\n";
    }
    rdata.append(rs.str());
  }
} else if (prefix == "osd crush rule dump") {
  // Dump one named crush rule, or all rules when no name is given.
  string name;
  cmd_getval(cct, cmdmap, "name", name);
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  if (name == "") {
    f->open_array_section("rules");
    osdmap.crush->dump_rules(f.get());
    f->close_section();
  } else {
    int ruleno = osdmap.crush->get_rule_id(name);
    if (ruleno < 0) {
      ss << "unknown crush rule '" << name << "'";
      r = ruleno;
      goto reply;
    }
    osdmap.crush->dump_rule(ruleno, f.get());
  }
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush dump") {
  // Dump the entire crush map (always formatted; default json-pretty).
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map");
  osdmap.crush->dump(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush show-tunables") {
  // Dump the current crush tunables (always formatted).
  string format;
  cmd_getval(cct, cmdmap, "format", format);
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map_tunables");
  osdmap.crush->dump_tunables(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush tree") {
  // Render the crush hierarchy; --show-shadow includes shadow (device
  // class) trees.
  string shadow;
  cmd_getval(cct, cmdmap, "shadow", shadow);
  bool show_shadow = shadow == "--show-shadow";
  // no fallback format here: a null formatter selects the text dump
  boost::scoped_ptr<Formatter> f(Formatter::create(format));
  if (f) {
    f->open_object_section("crush_tree");
    osdmap.crush->dump_tree(nullptr,
                            f.get(),
                            osdmap.get_pool_names(),
                            show_shadow);
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    osdmap.crush->dump_tree(&ss,
                            nullptr,
                            osdmap.get_pool_names(),
                            show_shadow);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush ls") {
  // List the direct children of a crush node; for a device (id >= 0)
  // the result is the device itself.
  string name;
  if (!cmd_getval(cct, cmdmap, "node", name)) {
    ss << "no node specified";
    r = -EINVAL;
    goto reply;
  }
  if (!osdmap.crush->name_exists(name)) {
    ss << "node '" << name << "' does not exist";
    r = -ENOENT;
    goto reply;
  }
  int id = osdmap.crush->get_item_id(name);
  list<int> result;
  if (id >= 0) {
    // devices have non-negative ids and no children
    result.push_back(id);
  } else {
    // buckets have negative ids; enumerate their items
    int num = osdmap.crush->get_bucket_size(id);
    for (int i = 0; i < num; ++i) {
      result.push_back(osdmap.crush->get_bucket_item(id, i));
    }
  }
  if (f) {
    f->open_array_section("items");
    for (auto i : result) {
      f->dump_string("item", osdmap.crush->get_item_name(i));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    for (auto i : result) {
      ss << osdmap.crush->get_item_name(i) << "\n";
    }
    rdata.append(ss.str());
  }
  r = 0;
} else if (prefix == "osd crush class ls") {
  // List all known device class names (always formatted).
  // note: this local formatter shadows the function-level 'f'
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_array_section("crush_classes");
  for (auto i : osdmap.crush->class_name)
    f->dump_string("class", i.second);
  f->close_section();
  f->flush(rdata);
} else if (prefix == "osd crush class ls-osd") {
  // List the ids of OSDs that belong to a device class.
  string name;
  cmd_getval(cct, cmdmap, "class", name);
  set<int> osds;
  osdmap.crush->get_devices_by_class(name, &osds);
  if (f) {
    f->open_array_section("osds");
    for (auto &osd: osds)
      f->dump_int("osd", osd);
    f->close_section();
    f->flush(rdata);
  } else {
    // newline-separated ids, with no trailing newline
    bool first = true;
    for (auto &osd : osds) {
      if (!first)
        ds << "\n";
      first = false;
      ds << osd;
    }
    rdata.append(ds);
  }
} else if (prefix == "osd crush get-device-class") {
  // Report the device class of each listed OSD id (empty string when
  // the OSD has no class).
  vector<string> idvec;
  cmd_getval(cct, cmdmap, "ids", idvec);
  map<int, string> class_by_osd;
  for (auto& id : idvec) {
    ostringstream ts;
    long osd = parse_osd_id(id.c_str(), &ts);
    if (osd < 0) {
      ss << "unable to parse osd id:'" << id << "'";
      r = -EINVAL;
      goto reply;
    }
    auto device_class = osdmap.crush->get_item_class(osd);
    if (device_class)
      class_by_osd[osd] = device_class;
    else
      class_by_osd[osd] = ""; // no class
  }
  if (f) {
    f->open_array_section("osd_device_classes");
    for (auto& i : class_by_osd) {
      f->open_object_section("osd_device_class");
      f->dump_int("osd", i.first);
      f->dump_string("device_class", i.second);
      f->close_section();
    }
    f->close_section();
    f->flush(rdata);
  } else {
    if (class_by_osd.size() == 1) {
      // for single input, make a clean output
      ds << class_by_osd.begin()->second;
    } else {
      // note that we do not group osds by class here
      for (auto it = class_by_osd.begin();
           it != class_by_osd.end();
           it++) {
        ds << "osd." << it->first << ' ' << it->second;
        if (next(it) != class_by_osd.end())
          ds << '\n';
      }
    }
    rdata.append(ds);
  }
5917 } else if (prefix == "osd erasure-code-profile ls") {
5918 const auto &profiles = osdmap.get_erasure_code_profiles();
5919 if (f)
5920 f->open_array_section("erasure-code-profiles");
5921 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5922 if (f)
5923 f->dump_string("profile", i->first.c_str());
5924 else
5925 rdata.append(i->first + "\n");
5926 }
5927 if (f) {
5928 f->close_section();
5929 ostringstream rs;
5930 f->flush(rs);
5931 rs << "\n";
5932 rdata.append(rs.str());
5933 }
5934 } else if (prefix == "osd crush weight-set ls") {
5935 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5936 if (f) {
5937 f->open_array_section("weight_sets");
5938 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5939 f->dump_string("pool", "(compat)");
5940 }
5941 for (auto& i : osdmap.crush->choose_args) {
5942 if (i.first >= 0) {
5943 f->dump_string("pool", osdmap.get_pool_name(i.first));
5944 }
5945 }
5946 f->close_section();
5947 f->flush(rdata);
5948 } else {
5949 ostringstream rs;
5950 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5951 rs << "(compat)\n";
5952 }
5953 for (auto& i : osdmap.crush->choose_args) {
5954 if (i.first >= 0) {
5955 rs << osdmap.get_pool_name(i.first) << "\n";
5956 }
5957 }
5958 rdata.append(rs.str());
5959 }
5960 } else if (prefix == "osd crush weight-set dump") {
5961 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5962 "json-pretty"));
5963 osdmap.crush->dump_choose_args(f.get());
5964 f->flush(rdata);
5965 } else if (prefix == "osd erasure-code-profile get") {
5966 string name;
5967 cmd_getval(cct, cmdmap, "name", name);
5968 if (!osdmap.has_erasure_code_profile(name)) {
5969 ss << "unknown erasure code profile '" << name << "'";
5970 r = -ENOENT;
5971 goto reply;
5972 }
5973 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5974 if (f)
5975 f->open_object_section("profile");
5976 for (map<string,string>::const_iterator i = profile.begin();
5977 i != profile.end();
5978 ++i) {
5979 if (f)
5980 f->dump_string(i->first.c_str(), i->second.c_str());
5981 else
5982 rdata.append(i->first + "=" + i->second + "\n");
5983 }
5984 if (f) {
5985 f->close_section();
5986 ostringstream rs;
5987 f->flush(rs);
5988 rs << "\n";
5989 rdata.append(rs.str());
5990 }
5991 } else if (prefix == "osd pool application get") {
5992 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5993 "json-pretty"));
5994 string pool_name;
5995 cmd_getval(cct, cmdmap, "pool", pool_name);
5996 string app;
5997 cmd_getval(cct, cmdmap, "app", app);
5998 string key;
5999 cmd_getval(cct, cmdmap, "key", key);
6000
6001 if (pool_name.empty()) {
6002 // all
6003 f->open_object_section("pools");
6004 for (const auto &pool : osdmap.pools) {
6005 std::string name("<unknown>");
6006 const auto &pni = osdmap.pool_name.find(pool.first);
6007 if (pni != osdmap.pool_name.end())
6008 name = pni->second;
6009 f->open_object_section(name.c_str());
6010 for (auto &app_pair : pool.second.application_metadata) {
6011 f->open_object_section(app_pair.first.c_str());
6012 for (auto &kv_pair : app_pair.second) {
6013 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6014 }
6015 f->close_section();
6016 }
6017 f->close_section(); // name
6018 }
6019 f->close_section(); // pools
6020 f->flush(rdata);
6021 } else {
6022 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6023 if (pool < 0) {
6024 ss << "unrecognized pool '" << pool_name << "'";
6025 r = -ENOENT;
6026 goto reply;
6027 }
6028 auto p = osdmap.get_pg_pool(pool);
6029 // filter by pool
6030 if (app.empty()) {
6031 f->open_object_section(pool_name.c_str());
6032 for (auto &app_pair : p->application_metadata) {
6033 f->open_object_section(app_pair.first.c_str());
6034 for (auto &kv_pair : app_pair.second) {
6035 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6036 }
6037 f->close_section(); // application
6038 }
6039 f->close_section(); // pool_name
6040 f->flush(rdata);
6041 goto reply;
6042 }
6043
6044 auto app_it = p->application_metadata.find(app);
6045 if (app_it == p->application_metadata.end()) {
6046 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6047 r = -ENOENT;
6048 goto reply;
6049 }
6050 // filter by pool + app
6051 if (key.empty()) {
6052 f->open_object_section(app_it->first.c_str());
6053 for (auto &kv_pair : app_it->second) {
6054 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6055 }
6056 f->close_section(); // application
6057 f->flush(rdata);
6058 goto reply;
6059 }
6060 // filter by pool + app + key
6061 auto key_it = app_it->second.find(key);
6062 if (key_it == app_it->second.end()) {
6063 ss << "application '" << app << "' on pool '" << pool_name
6064 << "' does not have key '" << key << "'";
6065 r = -ENOENT;
6066 goto reply;
6067 }
6068 ss << key_it->second << "\n";
6069 rdata.append(ss.str());
6070 ss.str("");
6071 }
6072 } else if (prefix == "osd get-require-min-compat-client") {
6073 ss << ceph_release_name(osdmap.require_min_compat_client) << std::endl;
6074 rdata.append(ss.str());
6075 ss.str("");
6076 goto reply;
6077 } else if (prefix == "osd pool application enable" ||
6078 prefix == "osd pool application disable" ||
6079 prefix == "osd pool application set" ||
6080 prefix == "osd pool application rm") {
6081 bool changed = false;
6082 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6083 if (r != 0) {
6084 // Error, reply.
6085 goto reply;
6086 } else if (changed) {
6087 // Valid mutation, proceed to prepare phase
6088 return false;
6089 } else {
6090 // Idempotent case, reply
6091 goto reply;
6092 }
6093 } else {
6094 // try prepare update
6095 return false;
6096 }
6097
6098 reply:
6099 string rs;
6100 getline(ss, rs);
6101 mon->reply_command(op, r, rs, rdata, get_last_committed());
6102 return true;
6103 }
6104
6105 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6106 {
6107 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6108 osdmap.get_pg_pool(pool_id));
6109 ceph_assert(pool);
6110 pool->set_flag(flags);
6111 }
6112
6113 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6114 {
6115 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6116 osdmap.get_pg_pool(pool_id));
6117 ceph_assert(pool);
6118 pool->unset_flag(flags);
6119 }
6120
6121 string OSDMonitor::make_snap_epoch_key(int64_t pool, epoch_t epoch)
6122 {
6123 char k[80];
6124 snprintf(k, sizeof(k), "removed_epoch_%llu_%08lx",
6125 (unsigned long long)pool, (unsigned long)epoch);
6126 return k;
6127 }
6128
6129 string OSDMonitor::make_snap_key(int64_t pool, snapid_t snap)
6130 {
6131 char k[80];
6132 snprintf(k, sizeof(k), "removed_snap_%llu_%016llx",
6133 (unsigned long long)pool, (unsigned long long)snap);
6134 return k;
6135 }
6136
6137
// Encode a removed-snap interval [snap, snap+num) plus the epoch it was
// removed in into *v, and return the store key under which the value
// should be written.  Note that the returned key embeds the *last* snap
// of the interval (snap + num - 1), not the first.
string OSDMonitor::make_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);       // interval begin
  encode(snap + num, *v); // interval end (exclusive)
  encode(epoch, *v);
  return make_snap_key(pool, snap + num - 1);
}
6149
6150 string OSDMonitor::make_snap_purged_key(int64_t pool, snapid_t snap)
6151 {
6152 char k[80];
6153 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6154 (unsigned long long)pool, (unsigned long long)snap);
6155 return k;
6156 }
// Encode a purged-snap interval [snap, snap+num) plus the epoch it was
// purged in into *v, and return the store key under which the value
// should be written.  As with make_snap_key_value, the key embeds the
// *last* snap of the interval.
string OSDMonitor::make_snap_purged_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);       // interval begin
  encode(snap + num, *v); // interval end (exclusive)
  encode(epoch, *v);
  return make_snap_purged_key(pool, snap + num - 1);
}
6168
// Check whether `snap` in `pool` lies inside a recorded removed-snap
// interval.  On success returns 0 and fills *begin/*end with the
// interval bounds; returns -ENOENT when no covering interval exists.
// Note: on -ENOENT, *begin/*end may still hold the decoded bounds of
// the nearest following interval (callers rely on this; see
// try_prune_purged_snaps).
int OSDMonitor::lookup_pruned_snap(int64_t pool, snapid_t snap,
				   snapid_t *begin, snapid_t *end)
{
  string k = make_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  // Keys embed the *last* snap of each interval (make_snap_key_value),
  // so the first key >= k is the only candidate covering interval.
  it->lower_bound(k);
  if (!it->valid()) {
    return -ENOENT;
  }
  if (it->key().find(OSD_SNAP_PREFIX) != 0) {
    // NOTE(review): assumes iterator keys still carry the prefix here —
    // confirm against MonitorDBStore iterator semantics.
    return -ENOENT;
  }
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  // end is exclusive: interval covers [begin, end)
  if (snap < *begin || snap >= *end) {
    return -ENOENT;
  }
  return 0;
}
6190
// Migrate snap intervals that the OSDs (via the mgr stat digest) report
// as fully purged into pending_inc.new_purged_snaps, bounded by
// mon_max_snap_prune_per_epoch per map epoch.  Returns true iff the
// pending incremental was modified (i.e. something was actually pruned).
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a fresh digest from the mgr to know what the OSDs purged
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false;  // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "no limit" in config; pick a large cap instead
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    OSDMap::snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_pruned_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
        // already purged.
        // be a bit aggressive about backing off here, because the mon may
        // do a lot of work going through this set, and if we know the
        // purged set from the OSDs is at least *partly* stale we may as
        // well wait for it to be fresh.
        dout(20) << __func__ << " we've already pruned " << pbegin
                 << "~" << (pend - pbegin) << dendl;
        break;  // next pool
      }
      // on -ENOENT, pbegin/pend may hold the bounds of the next recorded
      // interval after `begin` (see lookup_pruned_snap)
      if (pbegin && pbegin < end) {
        // the tail of [begin,end) is purged; shorten the range
        ceph_assert(pbegin > begin);
        end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
        break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      OSDMap::snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
        actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
               << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
        pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
6273
// Reconcile per-pool FULL_QUOTA/FULL flags with the usage stats reported
// by the mgr: clear the flags on pools that dropped back under quota, and
// set them (warning in the cluster log) on pools that exceeded
// quota_max_bytes or quota_max_objects.  Returns true iff any pool's
// flags were changed (i.e. pending_inc was modified).
bool OSDMonitor::update_pools_status()
{
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "no quota"
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // flag currently set: clear it only if the pool is back under quota
      if (pool_is_full)
        continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
                       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // flag currently clear: set it only if the pool just went over quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
6334
6335 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
6336 {
6337 op->mark_osdmon_event(__func__);
6338 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
6339 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
6340 MonSession *session = op->get_session();
6341 if (!session)
6342 return -EPERM;
6343 string erasure_code_profile;
6344 stringstream ss;
6345 string rule_name;
6346 int ret = 0;
6347 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
6348 0, 0, 0, 0, 0, 0.0,
6349 erasure_code_profile,
6350 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
6351
6352 if (ret < 0) {
6353 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
6354 }
6355 return ret;
6356 }
6357
6358 int OSDMonitor::crush_rename_bucket(const string& srcname,
6359 const string& dstname,
6360 ostream *ss)
6361 {
6362 int ret;
6363 //
6364 // Avoid creating a pending crush if it does not already exists and
6365 // the rename would fail.
6366 //
6367 if (!_have_pending_crush()) {
6368 ret = _get_stable_crush().can_rename_bucket(srcname,
6369 dstname,
6370 ss);
6371 if (ret)
6372 return ret;
6373 }
6374
6375 CrushWrapper newcrush;
6376 _get_pending_crush(newcrush);
6377
6378 ret = newcrush.rename_bucket(srcname,
6379 dstname,
6380 ss);
6381 if (ret)
6382 return ret;
6383
6384 pending_inc.crush.clear();
6385 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6386 *ss << "renamed bucket " << srcname << " into " << dstname;
6387 return 0;
6388 }
6389
6390 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
6391 {
6392 string replacement = "";
6393
6394 if (plugin == "jerasure_generic" ||
6395 plugin == "jerasure_sse3" ||
6396 plugin == "jerasure_sse4" ||
6397 plugin == "jerasure_neon") {
6398 replacement = "jerasure";
6399 } else if (plugin == "shec_generic" ||
6400 plugin == "shec_sse3" ||
6401 plugin == "shec_sse4" ||
6402 plugin == "shec_neon") {
6403 replacement = "shec";
6404 }
6405
6406 if (replacement != "") {
6407 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
6408 << plugin << " that has been deprecated. Please use "
6409 << replacement << " instead." << dendl;
6410 }
6411 }
6412
6413 int OSDMonitor::normalize_profile(const string& profilename,
6414 ErasureCodeProfile &profile,
6415 bool force,
6416 ostream *ss)
6417 {
6418 ErasureCodeInterfaceRef erasure_code;
6419 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6420 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
6421 check_legacy_ec_plugin(plugin->second, profilename);
6422 int err = instance.factory(plugin->second,
6423 g_conf().get_val<std::string>("erasure_code_dir"),
6424 profile, &erasure_code, ss);
6425 if (err) {
6426 return err;
6427 }
6428
6429 err = erasure_code->init(profile, ss);
6430 if (err) {
6431 return err;
6432 }
6433
6434 auto it = profile.find("stripe_unit");
6435 if (it != profile.end()) {
6436 string err_str;
6437 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
6438 if (!err_str.empty()) {
6439 *ss << "could not parse stripe_unit '" << it->second
6440 << "': " << err_str << std::endl;
6441 return -EINVAL;
6442 }
6443 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6444 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
6445 if (chunk_size != stripe_unit) {
6446 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
6447 << "alignment. Would be padded to " << chunk_size
6448 << std::endl;
6449 return -EINVAL;
6450 }
6451 if ((stripe_unit % 4096) != 0 && !force) {
6452 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
6453 << "use --force to override this check" << std::endl;
6454 return -EINVAL;
6455 }
6456 }
6457 return 0;
6458 }
6459
// Create a crush rule `name` for an erasure-coded pool using `profile`.
// Return value contract (consumed by prepare_pool_crush_rule):
//   -EEXIST   rule already exists in the committed map; *rule is set
//   -EALREADY rule exists only in the pending map; caller should retry
//   0         rule created in the pending map; caller must wait for proposal
//   other <0  plugin load or rule creation failure
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already committed
    *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
    return -EEXIST;
  }

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    // already pending, not yet proposed
    *rule = newcrush.get_rule_mask_ruleset(ruleid);
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // create_rule returns the new rule id (>= 0) on success
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    return 0;
  }
}
6496
6497 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
6498 ErasureCodeInterfaceRef *erasure_code,
6499 ostream *ss) const
6500 {
6501 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
6502 return -EAGAIN;
6503 ErasureCodeProfile profile =
6504 osdmap.get_erasure_code_profile(erasure_code_profile);
6505 ErasureCodeProfile::const_iterator plugin =
6506 profile.find("plugin");
6507 if (plugin == profile.end()) {
6508 *ss << "cannot determine the erasure code plugin"
6509 << " because there is no 'plugin' entry in the erasure_code_profile "
6510 << profile << std::endl;
6511 return -EINVAL;
6512 }
6513 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
6514 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6515 return instance.factory(plugin->second,
6516 g_conf().get_val<std::string>("erasure_code_dir"),
6517 profile, erasure_code, ss);
6518 }
6519
6520 int OSDMonitor::check_cluster_features(uint64_t features,
6521 stringstream &ss)
6522 {
6523 stringstream unsupported_ss;
6524 int unsupported_count = 0;
6525 if ((mon->get_quorum_con_features() & features) != features) {
6526 unsupported_ss << "the monitor cluster";
6527 ++unsupported_count;
6528 }
6529
6530 set<int32_t> up_osds;
6531 osdmap.get_up_osds(up_osds);
6532 for (set<int32_t>::iterator it = up_osds.begin();
6533 it != up_osds.end(); ++it) {
6534 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
6535 if ((xi.features & features) != features) {
6536 if (unsupported_count > 0)
6537 unsupported_ss << ", ";
6538 unsupported_ss << "osd." << *it;
6539 unsupported_count ++;
6540 }
6541 }
6542
6543 if (unsupported_count > 0) {
6544 ss << "features " << features << " unsupported by: "
6545 << unsupported_ss.str();
6546 return -ENOTSUP;
6547 }
6548
6549 // check pending osd state, too!
6550 for (map<int32_t,osd_xinfo_t>::const_iterator p =
6551 pending_inc.new_xinfo.begin();
6552 p != pending_inc.new_xinfo.end(); ++p) {
6553 const osd_xinfo_t &xi = p->second;
6554 if ((xi.features & features) != features) {
6555 dout(10) << __func__ << " pending osd." << p->first
6556 << " features are insufficient; retry" << dendl;
6557 return -EAGAIN;
6558 }
6559 }
6560
6561 return 0;
6562 }
6563
6564 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
6565 stringstream& ss)
6566 {
6567 OSDMap::Incremental new_pending = pending_inc;
6568 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
6569 OSDMap newmap;
6570 newmap.deepish_copy_from(osdmap);
6571 newmap.apply_incremental(new_pending);
6572
6573 // client compat
6574 if (newmap.require_min_compat_client > 0) {
6575 auto mv = newmap.get_min_compat_client();
6576 if (mv > newmap.require_min_compat_client) {
6577 ss << "new crush map requires client version " << ceph_release_name(mv)
6578 << " but require_min_compat_client is "
6579 << ceph_release_name(newmap.require_min_compat_client);
6580 return false;
6581 }
6582 }
6583
6584 // osd compat
6585 uint64_t features =
6586 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
6587 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
6588 stringstream features_ss;
6589 int r = check_cluster_features(features, features_ss);
6590 if (r) {
6591 ss << "Could not change CRUSH: " << features_ss.str();
6592 return false;
6593 }
6594
6595 return true;
6596 }
6597
6598 bool OSDMonitor::erasure_code_profile_in_use(
6599 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
6600 const string &profile,
6601 ostream *ss)
6602 {
6603 bool found = false;
6604 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
6605 p != pools.end();
6606 ++p) {
6607 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
6608 *ss << osdmap.pool_name[p->first] << " ";
6609 found = true;
6610 }
6611 }
6612 if (found) {
6613 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
6614 }
6615 return found;
6616 }
6617
// Build *erasure_code_profile_map from the configured default profile
// (osd_pool_default_erasure_code_profile) overlaid with the user-supplied
// "key=value" entries in `erasure_code_profile`.  If the user selects a
// different plugin than the default, the defaults are discarded entirely
// and only the user-supplied entries remain.  Returns 0 on success,
// -EINVAL for legacy "ruleset-*" keys, or the json parse error.
int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
					   map<string,string> *erasure_code_profile_map,
					   ostream *ss)
{
  // seed the map with the configured default profile (json)
  int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
				    get_json_str_map,
				    *ss,
				    erasure_code_profile_map,
				    true);
  if (r)
    return r;
  ceph_assert((*erasure_code_profile_map).count("plugin"));
  string default_plugin = (*erasure_code_profile_map)["plugin"];
  // user_map collects only what the user explicitly provided, so the
  // defaults can be thrown away if the plugin differs (see below)
  map<string,string> user_map;
  for (vector<string>::const_iterator i = erasure_code_profile.begin();
       i != erasure_code_profile.end();
       ++i) {
    size_t equal = i->find('=');
    if (equal == string::npos) {
      // bare key: record it with an empty value
      user_map[*i] = string();
      (*erasure_code_profile_map)[*i] = string();
    } else {
      const string key = i->substr(0, equal);
      equal++;
      const string value = i->substr(equal);
      if (key.find("ruleset-") == 0) {
	*ss << "property '" << key << "' is no longer supported; try "
	    << "'crush-" << key.substr(8) << "' instead";
	return -EINVAL;
      }
      user_map[key] = value;
      (*erasure_code_profile_map)[key] = value;
    }
  }

  // a different plugin invalidates the default profile's other settings
  if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
    (*erasure_code_profile_map) = user_map;

  return 0;
}
6658
6659 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
6660 const string &erasure_code_profile,
6661 uint8_t repl_size,
6662 unsigned *size, unsigned *min_size,
6663 ostream *ss)
6664 {
6665 int err = 0;
6666 switch (pool_type) {
6667 case pg_pool_t::TYPE_REPLICATED:
6668 if (repl_size == 0) {
6669 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
6670 }
6671 *size = repl_size;
6672 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
6673 break;
6674 case pg_pool_t::TYPE_ERASURE:
6675 {
6676 ErasureCodeInterfaceRef erasure_code;
6677 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6678 if (err == 0) {
6679 *size = erasure_code->get_chunk_count();
6680 *min_size =
6681 erasure_code->get_data_chunk_count() +
6682 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
6683 assert(*min_size <= *size);
6684 assert(*min_size >= erasure_code->get_data_chunk_count());
6685 }
6686 }
6687 break;
6688 default:
6689 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
6690 err = -EINVAL;
6691 break;
6692 }
6693 return err;
6694 }
6695
6696 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
6697 const string &erasure_code_profile,
6698 uint32_t *stripe_width,
6699 ostream *ss)
6700 {
6701 int err = 0;
6702 switch (pool_type) {
6703 case pg_pool_t::TYPE_REPLICATED:
6704 // ignored
6705 break;
6706 case pg_pool_t::TYPE_ERASURE:
6707 {
6708 ErasureCodeProfile profile =
6709 osdmap.get_erasure_code_profile(erasure_code_profile);
6710 ErasureCodeInterfaceRef erasure_code;
6711 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6712 if (err)
6713 break;
6714 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6715 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
6716 auto it = profile.find("stripe_unit");
6717 if (it != profile.end()) {
6718 string err_str;
6719 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
6720 ceph_assert(err_str.empty());
6721 }
6722 *stripe_width = data_chunks *
6723 erasure_code->get_chunk_size(stripe_unit * data_chunks);
6724 }
6725 break;
6726 default:
6727 *ss << "prepare_pool_stripe_width: "
6728 << pool_type << " is not a known pool type";
6729 err = -EINVAL;
6730 break;
6731 }
6732 return err;
6733 }
6734
// Resolve the crush rule for a new pool.  When *crush_rule >= 0 it is
// simply validated; otherwise it is looked up (replicated) or created
// (erasure) from rule_name.  Returns 0 on success, -EAGAIN when the
// caller must wait for a pending crush proposal and retry, or another
// negative errno on failure.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// remap crush_rule_create_erasure's result codes: a rule that is
	// only pending (created now, or -EALREADY) means the caller must
	// wait for the proposal (-EAGAIN); a committed rule (-EEXIST) is
	// success.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // explicit rule id supplied: just validate it
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
6796
6797 int OSDMonitor::get_crush_rule(const string &rule_name,
6798 int *crush_rule,
6799 ostream *ss)
6800 {
6801 int ret;
6802 ret = osdmap.crush->get_rule_id(rule_name);
6803 if (ret != -ENOENT) {
6804 // found it, use it
6805 *crush_rule = ret;
6806 } else {
6807 CrushWrapper newcrush;
6808 _get_pending_crush(newcrush);
6809
6810 ret = newcrush.get_rule_id(rule_name);
6811 if (ret != -ENOENT) {
6812 // found it, wait for it to be proposed
6813 dout(20) << __func__ << ": rule " << rule_name
6814 << " try again" << dendl;
6815 return -EAGAIN;
6816 } else {
6817 // Cannot find it , return error
6818 *ss << "specified rule " << rule_name << " doesn't exist";
6819 return ret;
6820 }
6821 }
6822 return 0;
6823 }
6824
6825 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6826 {
6827 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
6828 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6829 auto max_pgs = max_pgs_per_osd * num_osds;
6830 uint64_t projected = 0;
6831 if (pool < 0) {
6832 projected += pg_num * size;
6833 }
6834 for (const auto& i : osdmap.get_pools()) {
6835 if (i.first == pool) {
6836 projected += pg_num * size;
6837 } else {
6838 projected += i.second.get_pg_num_target() * i.second.get_size();
6839 }
6840 }
6841 if (projected > max_pgs) {
6842 if (pool >= 0) {
6843 *ss << "pool id " << pool;
6844 }
6845 *ss << " pg_num " << pg_num << " size " << size
6846 << " would mean " << projected
6847 << " total pgs, which exceeds max " << max_pgs
6848 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6849 << " * num_in_osds " << num_osds << ")";
6850 return -ERANGE;
6851 }
6852 return 0;
6853 }
6854
6855 /**
6856 * @param name The name of the new pool
6857 * @param crush_rule The crush rule to use. If <0, will use the system default
6858 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6859 * @param pg_num The pg_num to use. If set to 0, will use the system default
6860 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6861 * @param repl_size Replication factor, or 0 for default
6862 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6863 * @param pool_type TYPE_ERASURE, or TYPE_REP
6864 * @param expected_num_objects expected number of objects on the pool
6865 * @param fast_read fast read type.
6866 * @param ss human readable error message, if any.
6867 *
6868 * @return 0 on success, negative errno on failure.
6869 */
6870 int OSDMonitor::prepare_new_pool(string& name,
6871 int crush_rule,
6872 const string &crush_rule_name,
6873 unsigned pg_num, unsigned pgp_num,
6874 unsigned pg_num_min,
6875 const uint64_t repl_size,
6876 const uint64_t target_size_bytes,
6877 const float target_size_ratio,
6878 const string &erasure_code_profile,
6879 const unsigned pool_type,
6880 const uint64_t expected_num_objects,
6881 FastReadType fast_read,
6882 ostream *ss)
6883 {
6884 if (name.length() == 0)
6885 return -EINVAL;
6886 if (pg_num == 0)
6887 pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
6888 if (pgp_num == 0)
6889 pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
6890 if (!pgp_num)
6891 pgp_num = pg_num;
6892 if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
6893 *ss << "'pg_num' must be greater than 0 and less than or equal to "
6894 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
6895 << " (you may adjust 'mon max pool pg num' for higher values)";
6896 return -ERANGE;
6897 }
6898 if (pgp_num > pg_num) {
6899 *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
6900 << ", which in this case is " << pg_num;
6901 return -ERANGE;
6902 }
6903 if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
6904 *ss << "'fast_read' can only apply to erasure coding pool";
6905 return -EINVAL;
6906 }
6907 int r;
6908 r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
6909 crush_rule_name, &crush_rule, ss);
6910 if (r) {
6911 dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
6912 return r;
6913 }
6914 if (g_conf()->mon_osd_crush_smoke_test) {
6915 CrushWrapper newcrush;
6916 _get_pending_crush(newcrush);
6917 ostringstream err;
6918 CrushTester tester(newcrush, err);
6919 tester.set_min_x(0);
6920 tester.set_max_x(50);
6921 tester.set_rule(crush_rule);
6922 auto start = ceph::coarse_mono_clock::now();
6923 r = tester.test_with_fork(g_conf()->mon_lease);
6924 auto duration = ceph::coarse_mono_clock::now() - start;
6925 if (r < 0) {
6926 dout(10) << "tester.test_with_fork returns " << r
6927 << ": " << err.str() << dendl;
6928 *ss << "crush test failed with " << r << ": " << err.str();
6929 return r;
6930 }
6931 dout(10) << __func__ << " crush smoke test duration: "
6932 << duration << dendl;
6933 }
6934 unsigned size, min_size;
6935 r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
6936 &size, &min_size, ss);
6937 if (r) {
6938 dout(10) << "prepare_pool_size returns " << r << dendl;
6939 return r;
6940 }
6941 r = check_pg_num(-1, pg_num, size, ss);
6942 if (r) {
6943 dout(10) << "check_pg_num returns " << r << dendl;
6944 return r;
6945 }
6946
6947 if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
6948 return -EINVAL;
6949 }
6950
6951 uint32_t stripe_width = 0;
6952 r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
6953 if (r) {
6954 dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
6955 return r;
6956 }
6957
6958 bool fread = false;
6959 if (pool_type == pg_pool_t::TYPE_ERASURE) {
6960 switch (fast_read) {
6961 case FAST_READ_OFF:
6962 fread = false;
6963 break;
6964 case FAST_READ_ON:
6965 fread = true;
6966 break;
6967 case FAST_READ_DEFAULT:
6968 fread = g_conf()->osd_pool_default_ec_fast_read;
6969 break;
6970 default:
6971 *ss << "invalid fast_read setting: " << fast_read;
6972 return -EINVAL;
6973 }
6974 }
6975
6976 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
6977 p != pending_inc.new_pool_names.end();
6978 ++p) {
6979 if (p->second == name)
6980 return 0;
6981 }
6982
6983 if (-1 == pending_inc.new_pool_max)
6984 pending_inc.new_pool_max = osdmap.pool_max;
6985 int64_t pool = ++pending_inc.new_pool_max;
6986 pg_pool_t empty;
6987 pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
6988 pi->create_time = ceph_clock_now();
6989 pi->type = pool_type;
6990 pi->fast_read = fread;
6991 pi->flags = g_conf()->osd_pool_default_flags;
6992 if (g_conf()->osd_pool_default_flag_hashpspool)
6993 pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
6994 if (g_conf()->osd_pool_default_flag_nodelete)
6995 pi->set_flag(pg_pool_t::FLAG_NODELETE);
6996 if (g_conf()->osd_pool_default_flag_nopgchange)
6997 pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
6998 if (g_conf()->osd_pool_default_flag_nosizechange)
6999 pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
7000 pi->set_flag(pg_pool_t::FLAG_CREATING);
7001 if (g_conf()->osd_pool_use_gmt_hitset)
7002 pi->use_gmt_hitset = true;
7003 else
7004 pi->use_gmt_hitset = false;
7005
7006 pi->size = size;
7007 pi->min_size = min_size;
7008 pi->crush_rule = crush_rule;
7009 pi->expected_num_objects = expected_num_objects;
7010 pi->object_hash = CEPH_STR_HASH_RJENKINS;
7011
7012 {
7013 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
7014 g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
7015 pi->pg_autoscale_mode = m >= 0 ? m : 0;
7016 }
7017 auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
7018 pi->set_pg_num(
7019 max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
7020 : pg_num);
7021 pi->set_pg_num_pending(pi->get_pg_num());
7022 pi->set_pg_num_target(pg_num);
7023 pi->set_pgp_num(pi->get_pg_num());
7024 pi->set_pgp_num_target(pgp_num);
7025 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
7026 pg_num_min) {
7027 pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
7028 }
7029
7030 pi->last_change = pending_inc.epoch;
7031 pi->auid = 0;
7032
7033 if (pool_type == pg_pool_t::TYPE_ERASURE) {
7034 pi->erasure_code_profile = erasure_code_profile;
7035 } else {
7036 pi->erasure_code_profile = "";
7037 }
7038 pi->stripe_width = stripe_width;
7039
7040 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
7041 target_size_bytes) {
7042 // only store for nautilus+ because TARGET_SIZE_BYTES may be
7043 // larger than int32_t max.
7044 pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
7045 }
7046 if (target_size_ratio > 0.0 &&
7047 osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
7048 // only store for nautilus+, just to be consistent and tidy.
7049 pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
7050 }
7051
7052 pi->cache_target_dirty_ratio_micro =
7053 g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
7054 pi->cache_target_dirty_high_ratio_micro =
7055 g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
7056 pi->cache_target_full_ratio_micro =
7057 g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
7058 pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
7059 pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;
7060
7061 pending_inc.new_pool_names[pool] = name;
7062 return 0;
7063 }
7064
7065 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7066 {
7067 op->mark_osdmon_event(__func__);
7068 ostringstream ss;
7069 if (pending_inc.new_flags < 0)
7070 pending_inc.new_flags = osdmap.get_flags();
7071 pending_inc.new_flags |= flag;
7072 ss << OSDMap::get_flag_string(flag) << " is set";
7073 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7074 get_last_committed() + 1));
7075 return true;
7076 }
7077
7078 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7079 {
7080 op->mark_osdmon_event(__func__);
7081 ostringstream ss;
7082 if (pending_inc.new_flags < 0)
7083 pending_inc.new_flags = osdmap.get_flags();
7084 pending_inc.new_flags &= ~flag;
7085 ss << OSDMap::get_flag_string(flag) << " is unset";
7086 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7087 get_last_committed() + 1));
7088 return true;
7089 }
7090
/**
 * Handle "osd pool set <pool> <var> <val>": validate the requested property
 * change and stage it in pending_inc.new_pools.
 *
 * The value is parsed as both integer and float up front; each property
 * branch consults whichever parse result (and parse-error string) it needs.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", ...)
 * @param ss human readable success/error message
 * @return 0 on success (or no-op), negative errno on failure
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // work on a copy; start from any already-pending version of the pool so
  // multiple changes in one proposal compose
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0; // micro-f
  cmd_getval(cct, cmdmap, "val", val);

  // parse string as both int and float; different fields use different types.
  n = strict_strtoll(val.c_str(), 10, &interr);
  f = strict_strtod(val.c_str(), &floaterr);
  uf = llrintl(f * (double)1000000.0);

  // the cache-tier tunables below may only be set on a tier pool
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    // growing size raises per-OSD pg load; re-check the limit
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // keep the invariant min_size <= size
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // for EC pools the lower bound is k (data chunks), not 1
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // directly adjust the actual pg_num (normally the mgr drives this
    // toward pg_num_target); decreases are staged via pg_num_pending
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
                  g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cct,cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches. if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    n = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (n < 0) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = n;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
             var == "nosizechange" || var == "write_fadvise_dontneed" ||
             var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // like the flags above, but requires confirmation because toggling it
    // remaps every PG in the pool
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE? this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // stored in parts-per-million
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // generic pool options: validate per-option below, then apply via the
    // typed opt descriptor at the end of this branch
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // apply the option using its declared type; value 0 / empty unsets it
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending increment
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
7707
7708 int OSDMonitor::prepare_command_pool_application(const string &prefix,
7709 const cmdmap_t& cmdmap,
7710 stringstream& ss)
7711 {
7712 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
7713 }
7714
7715 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
7716 const cmdmap_t& cmdmap,
7717 stringstream& ss,
7718 bool *modified)
7719 {
7720 return _command_pool_application(prefix, cmdmap, ss, modified, false);
7721 }
7722
7723
7724 /**
7725 * Common logic for preprocess and prepare phases of pool application
7726 * tag commands. In preprocess mode we're only detecting invalid
7727 * commands, and determining whether it was a modification or a no-op.
7728 * In prepare mode we're actually updating the pending state.
7729 */
7730 int OSDMonitor::_command_pool_application(const string &prefix,
7731 const cmdmap_t& cmdmap,
7732 stringstream& ss,
7733 bool *modified,
7734 bool preparing)
7735 {
7736 string pool_name;
7737 cmd_getval(cct, cmdmap, "pool", pool_name);
7738 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
7739 if (pool < 0) {
7740 ss << "unrecognized pool '" << pool_name << "'";
7741 return -ENOENT;
7742 }
7743
7744 pg_pool_t p = *osdmap.get_pg_pool(pool);
7745 if (preparing) {
7746 if (pending_inc.new_pools.count(pool)) {
7747 p = pending_inc.new_pools[pool];
7748 }
7749 }
7750
7751 string app;
7752 cmd_getval(cct, cmdmap, "app", app);
7753 bool app_exists = (p.application_metadata.count(app) > 0);
7754
7755 string key;
7756 cmd_getval(cct, cmdmap, "key", key);
7757 if (key == "all") {
7758 ss << "key cannot be 'all'";
7759 return -EINVAL;
7760 }
7761
7762 string value;
7763 cmd_getval(cct, cmdmap, "value", value);
7764 if (value == "all") {
7765 ss << "value cannot be 'all'";
7766 return -EINVAL;
7767 }
7768
7769 if (boost::algorithm::ends_with(prefix, "enable")) {
7770 if (app.empty()) {
7771 ss << "application name must be provided";
7772 return -EINVAL;
7773 }
7774
7775 if (p.is_tier()) {
7776 ss << "application must be enabled on base tier";
7777 return -EINVAL;
7778 }
7779
7780 bool force = false;
7781 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7782
7783 if (!app_exists && !p.application_metadata.empty() && !force) {
7784 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
7785 << "application; pass --yes-i-really-mean-it to proceed anyway";
7786 return -EPERM;
7787 }
7788
7789 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
7790 ss << "too many enabled applications on pool '" << pool_name << "'; "
7791 << "max " << MAX_POOL_APPLICATIONS;
7792 return -EINVAL;
7793 }
7794
7795 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
7796 ss << "application name '" << app << "' too long; max length "
7797 << MAX_POOL_APPLICATION_LENGTH;
7798 return -EINVAL;
7799 }
7800
7801 if (!app_exists) {
7802 p.application_metadata[app] = {};
7803 }
7804 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
7805
7806 } else if (boost::algorithm::ends_with(prefix, "disable")) {
7807 bool force = false;
7808 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7809
7810 if (!force) {
7811 ss << "Are you SURE? Disabling an application within a pool might result "
7812 << "in loss of application functionality; pass "
7813 << "--yes-i-really-mean-it to proceed anyway";
7814 return -EPERM;
7815 }
7816
7817 if (!app_exists) {
7818 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7819 << "'";
7820 return 0; // idempotent
7821 }
7822
7823 p.application_metadata.erase(app);
7824 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
7825
7826 } else if (boost::algorithm::ends_with(prefix, "set")) {
7827 if (p.is_tier()) {
7828 ss << "application metadata must be set on base tier";
7829 return -EINVAL;
7830 }
7831
7832 if (!app_exists) {
7833 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7834 << "'";
7835 return -ENOENT;
7836 }
7837
7838 string key;
7839 cmd_getval(cct, cmdmap, "key", key);
7840
7841 if (key.empty()) {
7842 ss << "key must be provided";
7843 return -EINVAL;
7844 }
7845
7846 auto &app_keys = p.application_metadata[app];
7847 if (app_keys.count(key) == 0 &&
7848 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
7849 ss << "too many keys set for application '" << app << "' on pool '"
7850 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
7851 return -EINVAL;
7852 }
7853
7854 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
7855 ss << "key '" << app << "' too long; max length "
7856 << MAX_POOL_APPLICATION_LENGTH;
7857 return -EINVAL;
7858 }
7859
7860 string value;
7861 cmd_getval(cct, cmdmap, "value", value);
7862 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
7863 ss << "value '" << value << "' too long; max length "
7864 << MAX_POOL_APPLICATION_LENGTH;
7865 return -EINVAL;
7866 }
7867
7868 p.application_metadata[app][key] = value;
7869 ss << "set application '" << app << "' key '" << key << "' to '"
7870 << value << "' on pool '" << pool_name << "'";
7871 } else if (boost::algorithm::ends_with(prefix, "rm")) {
7872 if (!app_exists) {
7873 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7874 << "'";
7875 return -ENOENT;
7876 }
7877
7878 string key;
7879 cmd_getval(cct, cmdmap, "key", key);
7880 auto it = p.application_metadata[app].find(key);
7881 if (it == p.application_metadata[app].end()) {
7882 ss << "application '" << app << "' on pool '" << pool_name
7883 << "' does not have key '" << key << "'";
7884 return 0; // idempotent
7885 }
7886
7887 p.application_metadata[app].erase(it);
7888 ss << "removed application '" << app << "' key '" << key << "' on pool '"
7889 << pool_name << "'";
7890 } else {
7891 ceph_abort();
7892 }
7893
7894 if (preparing) {
7895 p.last_change = pending_inc.epoch;
7896 pending_inc.new_pools[pool] = p;
7897 }
7898
7899 // Because we fell through this far, we didn't hit no-op cases,
7900 // so pool was definitely modified
7901 if (modified != nullptr) {
7902 *modified = true;
7903 }
7904
7905 return 0;
7906 }
7907
7908 int OSDMonitor::_prepare_command_osd_crush_remove(
7909 CrushWrapper &newcrush,
7910 int32_t id,
7911 int32_t ancestor,
7912 bool has_ancestor,
7913 bool unlink_only)
7914 {
7915 int err = 0;
7916
7917 if (has_ancestor) {
7918 err = newcrush.remove_item_under(cct, id, ancestor,
7919 unlink_only);
7920 } else {
7921 err = newcrush.remove_item(cct, id, unlink_only);
7922 }
7923 return err;
7924 }
7925
// Stage the (already modified) scratch crush map into the pending
// incremental so it is committed with the next proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
7931
7932 int OSDMonitor::prepare_command_osd_crush_remove(
7933 CrushWrapper &newcrush,
7934 int32_t id,
7935 int32_t ancestor,
7936 bool has_ancestor,
7937 bool unlink_only)
7938 {
7939 int err = _prepare_command_osd_crush_remove(
7940 newcrush, id, ancestor,
7941 has_ancestor, unlink_only);
7942
7943 if (err < 0)
7944 return err;
7945
7946 ceph_assert(err == 0);
7947 do_osd_crush_remove(newcrush);
7948
7949 return 0;
7950 }
7951
7952 int OSDMonitor::prepare_command_osd_remove(int32_t id)
7953 {
7954 if (osdmap.is_up(id)) {
7955 return -EBUSY;
7956 }
7957
7958 pending_inc.new_state[id] = osdmap.get_state(id);
7959 pending_inc.new_uuid[id] = uuid_d();
7960 pending_metadata_rm.insert(id);
7961 pending_metadata.erase(id);
7962
7963 return 0;
7964 }
7965
7966 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
7967 {
7968 ceph_assert(existing_id);
7969 *existing_id = -1;
7970
7971 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
7972 if (!osdmap.exists(i) &&
7973 pending_inc.new_up_client.count(i) == 0 &&
7974 (pending_inc.new_state.count(i) == 0 ||
7975 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
7976 *existing_id = i;
7977 return -1;
7978 }
7979 }
7980
7981 if (pending_inc.new_max_osd < 0) {
7982 return osdmap.get_max_osd();
7983 }
7984 return pending_inc.new_max_osd;
7985 }
7986
// Assign an id for a new osd and stage its creation in pending_inc,
// optionally setting its crush device class.
//
// Preconditions: validation (validate_osd_create or equivalent) has
// already approved this (id, uuid) pair -- we assert rather than return
// errors.
//
// @param id            requested osd id, or -1 to pick one
// @param uuid          osd uuid; may be zero for legacy `osd create`
// @param device_class  crush device class to set, or "" for none
// @param new_id [out]  the id actually used
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; a supplied id must agree with it
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a hole in the id space (_allocate_osd_id returned -1)
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the device class on a scratch crush map, creating the crush
    // item for this osd if it does not exist yet
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd (committed or pending) covers the id we settled on
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8075
8076 int OSDMonitor::validate_osd_create(
8077 const int32_t id,
8078 const uuid_d& uuid,
8079 const bool check_osd_exists,
8080 int32_t* existing_id,
8081 stringstream& ss)
8082 {
8083
8084 dout(10) << __func__ << " id " << id << " uuid " << uuid
8085 << " check_osd_exists " << check_osd_exists << dendl;
8086
8087 ceph_assert(existing_id);
8088
8089 if (id < 0 && uuid.is_zero()) {
8090 // we have nothing to validate
8091 *existing_id = -1;
8092 return 0;
8093 } else if (uuid.is_zero()) {
8094 // we have an id but we will ignore it - because that's what
8095 // `osd create` does.
8096 return 0;
8097 }
8098
8099 /*
8100 * This function will be used to validate whether we are able to
8101 * create a new osd when the `uuid` is specified.
8102 *
8103 * It will be used by both `osd create` and `osd new`, as the checks
8104 * are basically the same when it pertains to osd id and uuid validation.
8105 * However, `osd create` presumes an `uuid` is optional, for legacy
8106 * reasons, while `osd new` requires the `uuid` to be provided. This
8107 * means that `osd create` will not be idempotent if an `uuid` is not
8108 * provided, but we will always guarantee the idempotency of `osd new`.
8109 */
8110
8111 ceph_assert(!uuid.is_zero());
8112 if (pending_inc.identify_osd(uuid) >= 0) {
8113 // osd is about to exist
8114 return -EAGAIN;
8115 }
8116
8117 int32_t i = osdmap.identify_osd(uuid);
8118 if (i >= 0) {
8119 // osd already exists
8120 if (id >= 0 && i != id) {
8121 ss << "uuid " << uuid << " already in use for different id " << i;
8122 return -EEXIST;
8123 }
8124 // return a positive errno to distinguish between a blocking error
8125 // and an error we consider to not be a problem (i.e., this would be
8126 // an idempotent operation).
8127 *existing_id = i;
8128 return EEXIST;
8129 }
8130 // i < 0
8131 if (id >= 0) {
8132 if (pending_inc.new_state.count(id)) {
8133 // osd is about to exist
8134 return -EAGAIN;
8135 }
8136 // we may not care if an osd exists if we are recreating a previously
8137 // destroyed osd.
8138 if (check_osd_exists && osdmap.exists(id)) {
8139 ss << "id " << id << " already in use and does not match uuid "
8140 << uuid;
8141 return -EINVAL;
8142 }
8143 }
8144 return 0;
8145 }
8146
8147 int OSDMonitor::prepare_command_osd_create(
8148 const int32_t id,
8149 const uuid_d& uuid,
8150 int32_t* existing_id,
8151 stringstream& ss)
8152 {
8153 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8154 ceph_assert(existing_id);
8155 if (osdmap.is_destroyed(id)) {
8156 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8157 "instead.";
8158 return -EINVAL;
8159 }
8160
8161 if (uuid.is_zero()) {
8162 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8163 }
8164
8165 return validate_osd_create(id, uuid, true, existing_id, ss);
8166 }
8167
// Handle `osd new`: create a brand-new osd (optionally registering cephx /
// dm-crypt secrets and a crush device class) or recreate a previously
// destroyed one under the same id.
//
// @param op      monitor op; the caller must have plugged paxos
// @param cmdmap  parsed command args ("uuid" required, "id" optional)
// @param params  secrets/options from the command payload: cephx_secret,
//                cephx_lockbox_secret, dmcrypt_key, crush_device_class
// @param ss      human-readable output (error text, or the id on success
//                in plain format)
// @param f       optional formatter for structured output
// @return 0 when an update was staged, *positive* EEXIST when the request
//         was fully idempotent and nothing was staged, negative errno on
//         failure.
int OSDMonitor::prepare_command_osd_new(
  MonOpRequestRef op,
  const cmdmap_t& cmdmap,
  const map<string,string>& params,
  stringstream &ss,
  Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cct, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    //       `osd create`, and we must honor it. So this means checking if
    //       the `id` is destroyed, and if so assume the destroy; otherwise,
    //       check if it `exists` - in which case we complain about not being
    //       `destroyed`. In the end, if nothing fails, we must allow the
    //       creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // negative return means "recycle the hole in *existing_id"
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // lockbox secret and dm-crypt key must come as a pair
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to be *xor*-applied at commit
    // time (that is why setting a bit that is currently set "clears" it,
    // as the CEPH_OSD_UP branch below relies on) -- confirm against
    // OSDMap::Incremental before reading these as plain bit-sets.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
8432
8433 bool OSDMonitor::prepare_command(MonOpRequestRef op)
8434 {
8435 op->mark_osdmon_event(__func__);
8436 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8437 stringstream ss;
8438 cmdmap_t cmdmap;
8439 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
8440 string rs = ss.str();
8441 mon->reply_command(op, -EINVAL, rs, get_last_committed());
8442 return true;
8443 }
8444
8445 MonSession *session = op->get_session();
8446 if (!session) {
8447 derr << __func__ << " no session" << dendl;
8448 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
8449 return true;
8450 }
8451
8452 return prepare_command_impl(op, cmdmap);
8453 }
8454
8455 static int parse_reweights(CephContext *cct,
8456 const cmdmap_t& cmdmap,
8457 const OSDMap& osdmap,
8458 map<int32_t, uint32_t>* weights)
8459 {
8460 string weights_str;
8461 if (!cmd_getval(cct, cmdmap, "weights", weights_str)) {
8462 return -EINVAL;
8463 }
8464 std::replace(begin(weights_str), end(weights_str), '\'', '"');
8465 json_spirit::mValue json_value;
8466 if (!json_spirit::read(weights_str, json_value)) {
8467 return -EINVAL;
8468 }
8469 if (json_value.type() != json_spirit::obj_type) {
8470 return -EINVAL;
8471 }
8472 const auto obj = json_value.get_obj();
8473 try {
8474 for (auto& osd_weight : obj) {
8475 auto osd_id = std::stoi(osd_weight.first);
8476 if (!osdmap.exists(osd_id)) {
8477 return -ENOENT;
8478 }
8479 if (osd_weight.second.type() != json_spirit::str_type) {
8480 return -EINVAL;
8481 }
8482 auto weight = std::stoul(osd_weight.second.get_str());
8483 weights->insert({osd_id, weight});
8484 }
8485 } catch (const std::logic_error& e) {
8486 return -EINVAL;
8487 }
8488 return 0;
8489 }
8490
// Stage destruction of osd.<id>: remove its cephx keys and config-keys and
// mark it DESTROYED in the pending map, keeping the id so `osd new` can
// later recycle it. The caller is responsible for proposing the pending
// changes (we may be running inside `osd purge`, and a service may only
// propose once).
//
// @return 0 on success or when already destroyed (idempotent), -ENOENT if
//         the osd does not exist at all, negative errno on auth failure.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // resolve the auth entities to remove; -ENOENT just means the keys are
  // already gone (replay), which we treat as idempotent.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same treatment for the dm-crypt key held by the config-key service
  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // validation is done; from here on every step must succeed
  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // mark destroyed and clear the uuid so the id can be reused.
  // NOTE(review): new_state entries appear to be xor-applied at commit --
  // confirm against OSDMap::Incremental before treating this as a plain
  // assignment of state bits.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
8562
// Stage a full purge of osd.<id>: remove it from crush, destroy its keys,
// and remove it from the osdmap. Paxos must be plugged by the caller, who
// is also responsible for proposing.
//
// @return 0 on success, -ENOENT when the osd is already completely gone
//         (idempotent), negative errno if a step fails before any update
//         has been staged.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal on a scratch map; -ENOENT means the
  // osd is already gone from crush (possibly a replay of this command).
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been
  // marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy staged an update, so we are definitely not idempotent
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // fully idempotent: nothing left in crush, nothing in the osdmap
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the crush update we validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
8631
8632 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
8633 const cmdmap_t& cmdmap)
8634 {
8635 op->mark_osdmon_event(__func__);
8636 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8637 bool ret = false;
8638 stringstream ss;
8639 string rs;
8640 bufferlist rdata;
8641 int err = 0;
8642
8643 string format;
8644 cmd_getval(cct, cmdmap, "format", format, string("plain"));
8645 boost::scoped_ptr<Formatter> f(Formatter::create(format));
8646
8647 string prefix;
8648 cmd_getval(cct, cmdmap, "prefix", prefix);
8649
8650 int64_t osdid;
8651 string osd_name;
8652 bool osdid_present = false;
8653 if (prefix != "osd pg-temp" &&
8654 prefix != "osd pg-upmap" &&
8655 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
8656 osdid_present = cmd_getval(cct, cmdmap, "id", osdid);
8657 }
8658 if (osdid_present) {
8659 ostringstream oss;
8660 oss << "osd." << osdid;
8661 osd_name = oss.str();
8662 }
8663
8664 // Even if there's a pending state with changes that could affect
8665 // a command, considering that said state isn't yet committed, we
8666 // just don't care about those changes if the command currently being
8667 // handled acts as a no-op against the current committed state.
8668 // In a nutshell, we assume this command happens *before*.
8669 //
8670 // Let me make this clearer:
8671 //
8672 // - If we have only one client, and that client issues some
8673 // operation that would conflict with this operation but is
8674 // still on the pending state, then we would be sure that said
8675 // operation wouldn't have returned yet, so the client wouldn't
8676 // issue this operation (unless the client didn't wait for the
8677 // operation to finish, and that would be the client's own fault).
8678 //
8679 // - If we have more than one client, each client will observe
8680 // whatever is the state at the moment of the commit. So, if we
8681 // have two clients, one issuing an unlink and another issuing a
8682 // link, and if the link happens while the unlink is still on the
8683 // pending state, from the link's point-of-view this is a no-op.
8684 // If different clients are issuing conflicting operations and
8685 // they care about that, then the clients should make sure they
8686 // enforce some kind of concurrency mechanism -- from our
8687 // perspective that's what Douglas Adams would call an SEP.
8688 //
8689 // This should be used as a general guideline for most commands handled
8690 // in this function. Adapt as you see fit, but please bear in mind that
8691 // this is the expected behavior.
8692
8693
8694 if (prefix == "osd setcrushmap" ||
8695 (prefix == "osd crush set" && !osdid_present)) {
8696 if (pending_inc.crush.length()) {
8697 dout(10) << __func__ << " waiting for pending crush update " << dendl;
8698 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8699 return true;
8700 }
8701 dout(10) << "prepare_command setting new crush map" << dendl;
8702 bufferlist data(m->get_data());
8703 CrushWrapper crush;
8704 try {
8705 auto bl = data.cbegin();
8706 crush.decode(bl);
8707 }
8708 catch (const std::exception &e) {
8709 err = -EINVAL;
8710 ss << "Failed to parse crushmap: " << e.what();
8711 goto reply;
8712 }
8713
8714 int64_t prior_version = 0;
8715 if (cmd_getval(cct, cmdmap, "prior_version", prior_version)) {
8716 if (prior_version == osdmap.get_crush_version() - 1) {
8717 // see if we are a resend of the last update. this is imperfect
8718 // (multiple racing updaters may not both get reliable success)
8719 // but we expect crush updaters (via this interface) to be rare-ish.
8720 bufferlist current, proposed;
8721 osdmap.crush->encode(current, mon->get_quorum_con_features());
8722 crush.encode(proposed, mon->get_quorum_con_features());
8723 if (current.contents_equal(proposed)) {
8724 dout(10) << __func__
8725 << " proposed matches current and version equals previous"
8726 << dendl;
8727 err = 0;
8728 ss << osdmap.get_crush_version();
8729 goto reply;
8730 }
8731 }
8732 if (prior_version != osdmap.get_crush_version()) {
8733 err = -EPERM;
8734 ss << "prior_version " << prior_version << " != crush version "
8735 << osdmap.get_crush_version();
8736 goto reply;
8737 }
8738 }
8739
8740 if (crush.has_legacy_rule_ids()) {
8741 err = -EINVAL;
8742 ss << "crush maps with ruleset != ruleid are no longer allowed";
8743 goto reply;
8744 }
8745 if (!validate_crush_against_features(&crush, ss)) {
8746 err = -EINVAL;
8747 goto reply;
8748 }
8749
8750 err = osdmap.validate_crush_rules(&crush, &ss);
8751 if (err < 0) {
8752 goto reply;
8753 }
8754
8755 if (g_conf()->mon_osd_crush_smoke_test) {
8756 // sanity check: test some inputs to make sure this map isn't
8757 // totally broken
8758 dout(10) << " testing map" << dendl;
8759 stringstream ess;
8760 CrushTester tester(crush, ess);
8761 tester.set_min_x(0);
8762 tester.set_max_x(50);
8763 auto start = ceph::coarse_mono_clock::now();
8764 int r = tester.test_with_fork(g_conf()->mon_lease);
8765 auto duration = ceph::coarse_mono_clock::now() - start;
8766 if (r < 0) {
8767 dout(10) << " tester.test_with_fork returns " << r
8768 << ": " << ess.str() << dendl;
8769 ss << "crush smoke test failed with " << r << ": " << ess.str();
8770 err = r;
8771 goto reply;
8772 }
8773 dout(10) << __func__ << " crush somke test duration: "
8774 << duration << ", result: " << ess.str() << dendl;
8775 }
8776
8777 pending_inc.crush = data;
8778 ss << osdmap.get_crush_version() + 1;
8779 goto update;
8780
8781 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
8782 CrushWrapper newcrush;
8783 _get_pending_crush(newcrush);
8784 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
8785 int bid = -1 - b;
8786 if (newcrush.bucket_exists(bid) &&
8787 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
8788 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
8789 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
8790 }
8791 }
8792 if (!validate_crush_against_features(&newcrush, ss)) {
8793 err = -EINVAL;
8794 goto reply;
8795 }
8796 pending_inc.crush.clear();
8797 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8798 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8799 get_last_committed() + 1));
8800 return true;
8801 } else if (prefix == "osd crush set-device-class") {
8802 string device_class;
8803 if (!cmd_getval(cct, cmdmap, "class", device_class)) {
8804 err = -EINVAL; // no value!
8805 goto reply;
8806 }
8807
8808 bool stop = false;
8809 vector<string> idvec;
8810 cmd_getval(cct, cmdmap, "ids", idvec);
8811 CrushWrapper newcrush;
8812 _get_pending_crush(newcrush);
8813 set<int> updated;
8814 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8815 set<int> osds;
8816 // wildcard?
8817 if (j == 0 &&
8818 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8819 osdmap.get_all_osds(osds);
8820 stop = true;
8821 } else {
8822 // try traditional single osd way
8823 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8824 if (osd < 0) {
8825 // ss has reason for failure
8826 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8827 err = -EINVAL;
8828 continue;
8829 }
8830 osds.insert(osd);
8831 }
8832
8833 for (auto &osd : osds) {
8834 if (!osdmap.exists(osd)) {
8835 ss << "osd." << osd << " does not exist. ";
8836 continue;
8837 }
8838
8839 ostringstream oss;
8840 oss << "osd." << osd;
8841 string name = oss.str();
8842
8843 if (newcrush.get_max_devices() < osd + 1) {
8844 newcrush.set_max_devices(osd + 1);
8845 }
8846 string action;
8847 if (newcrush.item_exists(osd)) {
8848 action = "updating";
8849 } else {
8850 action = "creating";
8851 newcrush.set_item_name(osd, name);
8852 }
8853
8854 dout(5) << action << " crush item id " << osd << " name '" << name
8855 << "' device_class '" << device_class << "'"
8856 << dendl;
8857 err = newcrush.update_device_class(osd, device_class, name, &ss);
8858 if (err < 0) {
8859 goto reply;
8860 }
8861 if (err == 0 && !_have_pending_crush()) {
8862 if (!stop) {
8863 // for single osd only, wildcard makes too much noise
8864 ss << "set-device-class item id " << osd << " name '" << name
8865 << "' device_class '" << device_class << "': no change. ";
8866 }
8867 } else {
8868 updated.insert(osd);
8869 }
8870 }
8871 }
8872
8873 if (!updated.empty()) {
8874 pending_inc.crush.clear();
8875 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8876 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
8877 getline(ss, rs);
8878 wait_for_finished_proposal(op,
8879 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8880 return true;
8881 }
8882
8883 } else if (prefix == "osd crush rm-device-class") {
8884 bool stop = false;
8885 vector<string> idvec;
8886 cmd_getval(cct, cmdmap, "ids", idvec);
8887 CrushWrapper newcrush;
8888 _get_pending_crush(newcrush);
8889 set<int> updated;
8890
8891 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8892 set<int> osds;
8893
8894 // wildcard?
8895 if (j == 0 &&
8896 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8897 osdmap.get_all_osds(osds);
8898 stop = true;
8899 } else {
8900 // try traditional single osd way
8901 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8902 if (osd < 0) {
8903 // ss has reason for failure
8904 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8905 err = -EINVAL;
8906 goto reply;
8907 }
8908 osds.insert(osd);
8909 }
8910
8911 for (auto &osd : osds) {
8912 if (!osdmap.exists(osd)) {
8913 ss << "osd." << osd << " does not exist. ";
8914 continue;
8915 }
8916
8917 auto class_name = newcrush.get_item_class(osd);
8918 if (!class_name) {
8919 ss << "osd." << osd << " belongs to no class, ";
8920 continue;
8921 }
8922 // note that we do not verify if class_is_in_use here
8923 // in case the device is misclassified and the user wants
8924 // to forcibly reset it...
8925
8926 err = newcrush.remove_device_class(cct, osd, &ss);
8927 if (err < 0) {
8928 // ss has reason for failure
8929 goto reply;
8930 }
8931 updated.insert(osd);
8932 }
8933 }
8934
8935 if (!updated.empty()) {
8936 pending_inc.crush.clear();
8937 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8938 ss << "done removing class of osd(s): " << updated;
8939 getline(ss, rs);
8940 wait_for_finished_proposal(op,
8941 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8942 return true;
8943 }
8944 } else if (prefix == "osd crush class create") {
8945 string device_class;
8946 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8947 err = -EINVAL; // no value!
8948 goto reply;
8949 }
8950 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8951 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8952 << "luminous' before using crush device classes";
8953 err = -EPERM;
8954 goto reply;
8955 }
8956 if (!_have_pending_crush() &&
8957 _get_stable_crush().class_exists(device_class)) {
8958 ss << "class '" << device_class << "' already exists";
8959 goto reply;
8960 }
8961 CrushWrapper newcrush;
8962 _get_pending_crush(newcrush);
8963 if (newcrush.class_exists(device_class)) {
8964 ss << "class '" << device_class << "' already exists";
8965 goto update;
8966 }
8967 int class_id = newcrush.get_or_create_class_id(device_class);
8968 pending_inc.crush.clear();
8969 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8970 ss << "created class " << device_class << " with id " << class_id
8971 << " to crush map";
8972 goto update;
8973 } else if (prefix == "osd crush class rm") {
8974 string device_class;
8975 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8976 err = -EINVAL; // no value!
8977 goto reply;
8978 }
8979 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8980 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8981 << "luminous' before using crush device classes";
8982 err = -EPERM;
8983 goto reply;
8984 }
8985
8986 if (!osdmap.crush->class_exists(device_class)) {
8987 err = 0;
8988 goto reply;
8989 }
8990
8991 CrushWrapper newcrush;
8992 _get_pending_crush(newcrush);
8993 if (!newcrush.class_exists(device_class)) {
8994 err = 0; // make command idempotent
8995 goto wait;
8996 }
8997 int class_id = newcrush.get_class_id(device_class);
8998 stringstream ts;
8999 if (newcrush.class_is_in_use(class_id, &ts)) {
9000 err = -EBUSY;
9001 ss << "class '" << device_class << "' " << ts.str();
9002 goto reply;
9003 }
9004
9005 // check if class is used by any erasure-code-profiles
9006 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
9007 osdmap.get_erasure_code_profiles();
9008 auto ec_profiles = pending_inc.get_erasure_code_profiles();
9009 #ifdef HAVE_STDLIB_MAP_SPLICING
9010 ec_profiles.merge(old_ec_profiles);
9011 #else
9012 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
9013 make_move_iterator(end(old_ec_profiles)));
9014 #endif
9015 list<string> referenced_by;
9016 for (auto &i: ec_profiles) {
9017 for (auto &j: i.second) {
9018 if ("crush-device-class" == j.first && device_class == j.second) {
9019 referenced_by.push_back(i.first);
9020 }
9021 }
9022 }
9023 if (!referenced_by.empty()) {
9024 err = -EBUSY;
9025 ss << "class '" << device_class
9026 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
9027 goto reply;
9028 }
9029
9030 set<int> osds;
9031 newcrush.get_devices_by_class(device_class, &osds);
9032 for (auto& p: osds) {
9033 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
9034 if (err < 0) {
9035 // ss has reason for failure
9036 goto reply;
9037 }
9038 }
9039
9040 if (osds.empty()) {
9041 // empty class, remove directly
9042 err = newcrush.remove_class_name(device_class);
9043 if (err < 0) {
9044 ss << "class '" << device_class << "' cannot be removed '"
9045 << cpp_strerror(err) << "'";
9046 goto reply;
9047 }
9048 }
9049
9050 pending_inc.crush.clear();
9051 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9052 ss << "removed class " << device_class << " with id " << class_id
9053 << " from crush map";
9054 goto update;
9055 } else if (prefix == "osd crush class rename") {
9056 string srcname, dstname;
9057 if (!cmd_getval(cct, cmdmap, "srcname", srcname)) {
9058 err = -EINVAL;
9059 goto reply;
9060 }
9061 if (!cmd_getval(cct, cmdmap, "dstname", dstname)) {
9062 err = -EINVAL;
9063 goto reply;
9064 }
9065
9066 CrushWrapper newcrush;
9067 _get_pending_crush(newcrush);
9068 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9069 // suppose this is a replay and return success
9070 // so command is idempotent
9071 ss << "already renamed to '" << dstname << "'";
9072 err = 0;
9073 goto reply;
9074 }
9075
9076 err = newcrush.rename_class(srcname, dstname);
9077 if (err < 0) {
9078 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9079 << cpp_strerror(err);
9080 goto reply;
9081 }
9082
9083 pending_inc.crush.clear();
9084 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9085 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9086 goto update;
9087 } else if (prefix == "osd crush add-bucket") {
9088 // osd crush add-bucket <name> <type>
9089 string name, typestr;
9090 vector<string> argvec;
9091 cmd_getval(cct, cmdmap, "name", name);
9092 cmd_getval(cct, cmdmap, "type", typestr);
9093 cmd_getval(cct, cmdmap, "args", argvec);
9094 map<string,string> loc;
9095 if (!argvec.empty()) {
9096 CrushWrapper::parse_loc_map(argvec, &loc);
9097 dout(0) << "will create and move bucket '" << name
9098 << "' to location " << loc << dendl;
9099 }
9100
9101 if (!_have_pending_crush() &&
9102 _get_stable_crush().name_exists(name)) {
9103 ss << "bucket '" << name << "' already exists";
9104 goto reply;
9105 }
9106
9107 CrushWrapper newcrush;
9108 _get_pending_crush(newcrush);
9109
9110 if (newcrush.name_exists(name)) {
9111 ss << "bucket '" << name << "' already exists";
9112 goto update;
9113 }
9114 int type = newcrush.get_type_id(typestr);
9115 if (type < 0) {
9116 ss << "type '" << typestr << "' does not exist";
9117 err = -EINVAL;
9118 goto reply;
9119 }
9120 if (type == 0) {
9121 ss << "type '" << typestr << "' is for devices, not buckets";
9122 err = -EINVAL;
9123 goto reply;
9124 }
9125 int bucketno;
9126 err = newcrush.add_bucket(0, 0,
9127 CRUSH_HASH_DEFAULT, type, 0, NULL,
9128 NULL, &bucketno);
9129 if (err < 0) {
9130 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
9131 goto reply;
9132 }
9133 err = newcrush.set_item_name(bucketno, name);
9134 if (err < 0) {
9135 ss << "error setting bucket name to '" << name << "'";
9136 goto reply;
9137 }
9138
9139 if (!loc.empty()) {
9140 if (!newcrush.check_item_loc(cct, bucketno, loc,
9141 (int *)NULL)) {
9142 err = newcrush.move_bucket(cct, bucketno, loc);
9143 if (err < 0) {
9144 ss << "error moving bucket '" << name << "' to location " << loc;
9145 goto reply;
9146 }
9147 } else {
9148 ss << "no need to move item id " << bucketno << " name '" << name
9149 << "' to location " << loc << " in crush map";
9150 }
9151 }
9152
9153 pending_inc.crush.clear();
9154 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9155 if (loc.empty()) {
9156 ss << "added bucket " << name << " type " << typestr
9157 << " to crush map";
9158 } else {
9159 ss << "added bucket " << name << " type " << typestr
9160 << " to location " << loc;
9161 }
9162 goto update;
9163 } else if (prefix == "osd crush rename-bucket") {
9164 string srcname, dstname;
9165 cmd_getval(cct, cmdmap, "srcname", srcname);
9166 cmd_getval(cct, cmdmap, "dstname", dstname);
9167
9168 err = crush_rename_bucket(srcname, dstname, &ss);
9169 if (err == -EALREADY) // equivalent to success for idempotency
9170 err = 0;
9171 if (err)
9172 goto reply;
9173 else
9174 goto update;
9175 } else if (prefix == "osd crush weight-set create" ||
9176 prefix == "osd crush weight-set create-compat") {
9177 CrushWrapper newcrush;
9178 _get_pending_crush(newcrush);
9179 int64_t pool;
9180 int positions;
9181 if (newcrush.has_non_straw2_buckets()) {
9182 ss << "crush map contains one or more bucket(s) that are not straw2";
9183 err = -EPERM;
9184 goto reply;
9185 }
9186 if (prefix == "osd crush weight-set create") {
9187 if (osdmap.require_min_compat_client > 0 &&
9188 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9189 ss << "require_min_compat_client "
9190 << ceph_release_name(osdmap.require_min_compat_client)
9191 << " < luminous, which is required for per-pool weight-sets. "
9192 << "Try 'ceph osd set-require-min-compat-client luminous' "
9193 << "before using the new interface";
9194 err = -EPERM;
9195 goto reply;
9196 }
9197 string poolname, mode;
9198 cmd_getval(cct, cmdmap, "pool", poolname);
9199 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9200 if (pool < 0) {
9201 ss << "pool '" << poolname << "' not found";
9202 err = -ENOENT;
9203 goto reply;
9204 }
9205 cmd_getval(cct, cmdmap, "mode", mode);
9206 if (mode != "flat" && mode != "positional") {
9207 ss << "unrecognized weight-set mode '" << mode << "'";
9208 err = -EINVAL;
9209 goto reply;
9210 }
9211 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
9212 } else {
9213 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9214 positions = 1;
9215 }
9216 if (!newcrush.create_choose_args(pool, positions)) {
9217 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
9218 ss << "compat weight-set already created";
9219 } else {
9220 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
9221 << "' already created";
9222 }
9223 goto reply;
9224 }
9225 pending_inc.crush.clear();
9226 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9227 goto update;
9228
9229 } else if (prefix == "osd crush weight-set rm" ||
9230 prefix == "osd crush weight-set rm-compat") {
9231 CrushWrapper newcrush;
9232 _get_pending_crush(newcrush);
9233 int64_t pool;
9234 if (prefix == "osd crush weight-set rm") {
9235 string poolname;
9236 cmd_getval(cct, cmdmap, "pool", poolname);
9237 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9238 if (pool < 0) {
9239 ss << "pool '" << poolname << "' not found";
9240 err = -ENOENT;
9241 goto reply;
9242 }
9243 } else {
9244 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9245 }
9246 newcrush.rm_choose_args(pool);
9247 pending_inc.crush.clear();
9248 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9249 goto update;
9250
9251 } else if (prefix == "osd crush weight-set reweight" ||
9252 prefix == "osd crush weight-set reweight-compat") {
9253 string poolname, item;
9254 vector<double> weight;
9255 cmd_getval(cct, cmdmap, "pool", poolname);
9256 cmd_getval(cct, cmdmap, "item", item);
9257 cmd_getval(cct, cmdmap, "weight", weight);
9258 CrushWrapper newcrush;
9259 _get_pending_crush(newcrush);
9260 int64_t pool;
9261 if (prefix == "osd crush weight-set reweight") {
9262 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9263 if (pool < 0) {
9264 ss << "pool '" << poolname << "' not found";
9265 err = -ENOENT;
9266 goto reply;
9267 }
9268 if (!newcrush.have_choose_args(pool)) {
9269 ss << "no weight-set for pool '" << poolname << "'";
9270 err = -ENOENT;
9271 goto reply;
9272 }
9273 auto arg_map = newcrush.choose_args_get(pool);
9274 int positions = newcrush.get_choose_args_positions(arg_map);
9275 if (weight.size() != (size_t)positions) {
9276 ss << "must specify exact " << positions << " weight values";
9277 err = -EINVAL;
9278 goto reply;
9279 }
9280 } else {
9281 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9282 if (!newcrush.have_choose_args(pool)) {
9283 ss << "no backward-compatible weight-set";
9284 err = -ENOENT;
9285 goto reply;
9286 }
9287 }
9288 if (!newcrush.name_exists(item)) {
9289 ss << "item '" << item << "' does not exist";
9290 err = -ENOENT;
9291 goto reply;
9292 }
9293 err = newcrush.choose_args_adjust_item_weightf(
9294 cct,
9295 newcrush.choose_args_get(pool),
9296 newcrush.get_item_id(item),
9297 weight,
9298 &ss);
9299 if (err < 0) {
9300 goto reply;
9301 }
9302 err = 0;
9303 pending_inc.crush.clear();
9304 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9305 goto update;
9306 } else if (osdid_present &&
9307 (prefix == "osd crush set" || prefix == "osd crush add")) {
9308 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9309 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9310 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9311
9312 if (!osdmap.exists(osdid)) {
9313 err = -ENOENT;
9314 ss << osd_name
9315 << " does not exist. Create it before updating the crush map";
9316 goto reply;
9317 }
9318
9319 double weight;
9320 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9321 ss << "unable to parse weight value '"
9322 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9323 err = -EINVAL;
9324 goto reply;
9325 }
9326
9327 string args;
9328 vector<string> argvec;
9329 cmd_getval(cct, cmdmap, "args", argvec);
9330 map<string,string> loc;
9331 CrushWrapper::parse_loc_map(argvec, &loc);
9332
9333 if (prefix == "osd crush set"
9334 && !_get_stable_crush().item_exists(osdid)) {
9335 err = -ENOENT;
9336 ss << "unable to set item id " << osdid << " name '" << osd_name
9337 << "' weight " << weight << " at location " << loc
9338 << ": does not exist";
9339 goto reply;
9340 }
9341
9342 dout(5) << "adding/updating crush item id " << osdid << " name '"
9343 << osd_name << "' weight " << weight << " at location "
9344 << loc << dendl;
9345 CrushWrapper newcrush;
9346 _get_pending_crush(newcrush);
9347
9348 string action;
9349 if (prefix == "osd crush set" ||
9350 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
9351 action = "set";
9352 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
9353 } else {
9354 action = "add";
9355 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
9356 if (err == 0)
9357 err = 1;
9358 }
9359
9360 if (err < 0)
9361 goto reply;
9362
9363 if (err == 0 && !_have_pending_crush()) {
9364 ss << action << " item id " << osdid << " name '" << osd_name
9365 << "' weight " << weight << " at location " << loc << ": no change";
9366 goto reply;
9367 }
9368
9369 pending_inc.crush.clear();
9370 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9371 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
9372 << weight << " at location " << loc << " to crush map";
9373 getline(ss, rs);
9374 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9375 get_last_committed() + 1));
9376 return true;
9377
9378 } else if (prefix == "osd crush create-or-move") {
9379 do {
9380 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9381 if (!osdmap.exists(osdid)) {
9382 err = -ENOENT;
9383 ss << osd_name
9384 << " does not exist. create it before updating the crush map";
9385 goto reply;
9386 }
9387
9388 double weight;
9389 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9390 ss << "unable to parse weight value '"
9391 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9392 err = -EINVAL;
9393 goto reply;
9394 }
9395
9396 string args;
9397 vector<string> argvec;
9398 cmd_getval(cct, cmdmap, "args", argvec);
9399 map<string,string> loc;
9400 CrushWrapper::parse_loc_map(argvec, &loc);
9401
9402 dout(0) << "create-or-move crush item name '" << osd_name
9403 << "' initial_weight " << weight << " at location " << loc
9404 << dendl;
9405
9406 CrushWrapper newcrush;
9407 _get_pending_crush(newcrush);
9408
9409 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
9410 g_conf()->osd_crush_update_weight_set);
9411 if (err == 0) {
9412 ss << "create-or-move updated item name '" << osd_name
9413 << "' weight " << weight
9414 << " at location " << loc << " to crush map";
9415 break;
9416 }
9417 if (err > 0) {
9418 pending_inc.crush.clear();
9419 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9420 ss << "create-or-move updating item name '" << osd_name
9421 << "' weight " << weight
9422 << " at location " << loc << " to crush map";
9423 getline(ss, rs);
9424 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9425 get_last_committed() + 1));
9426 return true;
9427 }
9428 } while (false);
9429
9430 } else if (prefix == "osd crush move") {
9431 do {
9432 // osd crush move <name> <loc1> [<loc2> ...]
9433 string name;
9434 vector<string> argvec;
9435 cmd_getval(cct, cmdmap, "name", name);
9436 cmd_getval(cct, cmdmap, "args", argvec);
9437 map<string,string> loc;
9438 CrushWrapper::parse_loc_map(argvec, &loc);
9439
9440 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
9441 CrushWrapper newcrush;
9442 _get_pending_crush(newcrush);
9443
9444 if (!newcrush.name_exists(name)) {
9445 err = -ENOENT;
9446 ss << "item " << name << " does not exist";
9447 break;
9448 }
9449 int id = newcrush.get_item_id(name);
9450
9451 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9452 if (id >= 0) {
9453 err = newcrush.create_or_move_item(
9454 cct, id, 0, name, loc,
9455 g_conf()->osd_crush_update_weight_set);
9456 } else {
9457 err = newcrush.move_bucket(cct, id, loc);
9458 }
9459 if (err >= 0) {
9460 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9461 pending_inc.crush.clear();
9462 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9463 getline(ss, rs);
9464 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9465 get_last_committed() + 1));
9466 return true;
9467 }
9468 } else {
9469 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9470 err = 0;
9471 }
9472 } while (false);
9473 } else if (prefix == "osd crush swap-bucket") {
9474 string source, dest;
9475 cmd_getval(cct, cmdmap, "source", source);
9476 cmd_getval(cct, cmdmap, "dest", dest);
9477
9478 bool force = false;
9479 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
9480
9481 CrushWrapper newcrush;
9482 _get_pending_crush(newcrush);
9483 if (!newcrush.name_exists(source)) {
9484 ss << "source item " << source << " does not exist";
9485 err = -ENOENT;
9486 goto reply;
9487 }
9488 if (!newcrush.name_exists(dest)) {
9489 ss << "dest item " << dest << " does not exist";
9490 err = -ENOENT;
9491 goto reply;
9492 }
9493 int sid = newcrush.get_item_id(source);
9494 int did = newcrush.get_item_id(dest);
9495 int sparent;
9496 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
9497 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9498 err = -EPERM;
9499 goto reply;
9500 }
9501 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
9502 !force) {
9503 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
9504 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
9505 << "; pass --yes-i-really-mean-it to proceed anyway";
9506 err = -EPERM;
9507 goto reply;
9508 }
9509 int r = newcrush.swap_bucket(cct, sid, did);
9510 if (r < 0) {
9511 ss << "failed to swap bucket contents: " << cpp_strerror(r);
9512 err = r;
9513 goto reply;
9514 }
9515 ss << "swapped bucket of " << source << " to " << dest;
9516 pending_inc.crush.clear();
9517 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9518 wait_for_finished_proposal(op,
9519 new Monitor::C_Command(mon, op, err, ss.str(),
9520 get_last_committed() + 1));
9521 return true;
9522 } else if (prefix == "osd crush link") {
9523 // osd crush link <name> <loc1> [<loc2> ...]
9524 string name;
9525 cmd_getval(cct, cmdmap, "name", name);
9526 vector<string> argvec;
9527 cmd_getval(cct, cmdmap, "args", argvec);
9528 map<string,string> loc;
9529 CrushWrapper::parse_loc_map(argvec, &loc);
9530
9531 // Need an explicit check for name_exists because get_item_id returns
9532 // 0 on unfound.
9533 int id = osdmap.crush->get_item_id(name);
9534 if (!osdmap.crush->name_exists(name)) {
9535 err = -ENOENT;
9536 ss << "item " << name << " does not exist";
9537 goto reply;
9538 } else {
9539 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
9540 }
9541 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
9542 ss << "no need to move item id " << id << " name '" << name
9543 << "' to location " << loc << " in crush map";
9544 err = 0;
9545 goto reply;
9546 }
9547
9548 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
9549 CrushWrapper newcrush;
9550 _get_pending_crush(newcrush);
9551
9552 if (!newcrush.name_exists(name)) {
9553 err = -ENOENT;
9554 ss << "item " << name << " does not exist";
9555 goto reply;
9556 } else {
9557 int id = newcrush.get_item_id(name);
9558 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9559 err = newcrush.link_bucket(cct, id, loc);
9560 if (err >= 0) {
9561 ss << "linked item id " << id << " name '" << name
9562 << "' to location " << loc << " in crush map";
9563 pending_inc.crush.clear();
9564 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9565 } else {
9566 ss << "cannot link item id " << id << " name '" << name
9567 << "' to location " << loc;
9568 goto reply;
9569 }
9570 } else {
9571 ss << "no need to move item id " << id << " name '" << name
9572 << "' to location " << loc << " in crush map";
9573 err = 0;
9574 }
9575 }
9576 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
9577 get_last_committed() + 1));
9578 return true;
9579 } else if (prefix == "osd crush rm" ||
9580 prefix == "osd crush remove" ||
9581 prefix == "osd crush unlink") {
9582 do {
9583 // osd crush rm <id> [ancestor]
9584 CrushWrapper newcrush;
9585 _get_pending_crush(newcrush);
9586
9587 string name;
9588 cmd_getval(cct, cmdmap, "name", name);
9589
9590 if (!osdmap.crush->name_exists(name)) {
9591 err = 0;
9592 ss << "device '" << name << "' does not appear in the crush map";
9593 break;
9594 }
9595 if (!newcrush.name_exists(name)) {
9596 err = 0;
9597 ss << "device '" << name << "' does not appear in the crush map";
9598 getline(ss, rs);
9599 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9600 get_last_committed() + 1));
9601 return true;
9602 }
9603 int id = newcrush.get_item_id(name);
9604 int ancestor = 0;
9605
9606 bool unlink_only = prefix == "osd crush unlink";
9607 string ancestor_str;
9608 if (cmd_getval(cct, cmdmap, "ancestor", ancestor_str)) {
9609 if (!newcrush.name_exists(ancestor_str)) {
9610 err = -ENOENT;
9611 ss << "ancestor item '" << ancestor_str
9612 << "' does not appear in the crush map";
9613 break;
9614 }
9615 ancestor = newcrush.get_item_id(ancestor_str);
9616 }
9617
9618 err = prepare_command_osd_crush_remove(
9619 newcrush,
9620 id, ancestor,
9621 (ancestor < 0), unlink_only);
9622
9623 if (err == -ENOENT) {
9624 ss << "item " << id << " does not appear in that position";
9625 err = 0;
9626 break;
9627 }
9628 if (err == 0) {
9629 if (!unlink_only)
9630 pending_inc.new_crush_node_flags[id] = 0;
9631 ss << "removed item id " << id << " name '" << name << "' from crush map";
9632 getline(ss, rs);
9633 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9634 get_last_committed() + 1));
9635 return true;
9636 }
9637 } while (false);
9638
9639 } else if (prefix == "osd crush reweight-all") {
9640 CrushWrapper newcrush;
9641 _get_pending_crush(newcrush);
9642
9643 newcrush.reweight(cct);
9644 pending_inc.crush.clear();
9645 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9646 ss << "reweighted crush hierarchy";
9647 getline(ss, rs);
9648 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9649 get_last_committed() + 1));
9650 return true;
9651 } else if (prefix == "osd crush reweight") {
9652 // osd crush reweight <name> <weight>
9653 CrushWrapper newcrush;
9654 _get_pending_crush(newcrush);
9655
9656 string name;
9657 cmd_getval(cct, cmdmap, "name", name);
9658 if (!newcrush.name_exists(name)) {
9659 err = -ENOENT;
9660 ss << "device '" << name << "' does not appear in the crush map";
9661 goto reply;
9662 }
9663
9664 int id = newcrush.get_item_id(name);
9665 if (id < 0) {
9666 ss << "device '" << name << "' is not a leaf in the crush map";
9667 err = -EINVAL;
9668 goto reply;
9669 }
9670 double w;
9671 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9672 ss << "unable to parse weight value '"
9673 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9674 err = -EINVAL;
9675 goto reply;
9676 }
9677
9678 err = newcrush.adjust_item_weightf(cct, id, w,
9679 g_conf()->osd_crush_update_weight_set);
9680 if (err < 0)
9681 goto reply;
9682 pending_inc.crush.clear();
9683 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9684 ss << "reweighted item id " << id << " name '" << name << "' to " << w
9685 << " in crush map";
9686 getline(ss, rs);
9687 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9688 get_last_committed() + 1));
9689 return true;
9690 } else if (prefix == "osd crush reweight-subtree") {
9691 // osd crush reweight-subtree <name> <weight>
9692 CrushWrapper newcrush;
9693 _get_pending_crush(newcrush);
9694
9695 string name;
9696 cmd_getval(cct, cmdmap, "name", name);
9697 if (!newcrush.name_exists(name)) {
9698 err = -ENOENT;
9699 ss << "device '" << name << "' does not appear in the crush map";
9700 goto reply;
9701 }
9702
9703 int id = newcrush.get_item_id(name);
9704 if (id >= 0) {
9705 ss << "device '" << name << "' is not a subtree in the crush map";
9706 err = -EINVAL;
9707 goto reply;
9708 }
9709 double w;
9710 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9711 ss << "unable to parse weight value '"
9712 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9713 err = -EINVAL;
9714 goto reply;
9715 }
9716
9717 err = newcrush.adjust_subtree_weightf(cct, id, w,
9718 g_conf()->osd_crush_update_weight_set);
9719 if (err < 0)
9720 goto reply;
9721 pending_inc.crush.clear();
9722 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9723 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
9724 << " in crush map";
9725 getline(ss, rs);
9726 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9727 get_last_committed() + 1));
9728 return true;
9729 } else if (prefix == "osd crush tunables") {
9730 CrushWrapper newcrush;
9731 _get_pending_crush(newcrush);
9732
9733 err = 0;
9734 string profile;
9735 cmd_getval(cct, cmdmap, "profile", profile);
9736 if (profile == "legacy" || profile == "argonaut") {
9737 newcrush.set_tunables_legacy();
9738 } else if (profile == "bobtail") {
9739 newcrush.set_tunables_bobtail();
9740 } else if (profile == "firefly") {
9741 newcrush.set_tunables_firefly();
9742 } else if (profile == "hammer") {
9743 newcrush.set_tunables_hammer();
9744 } else if (profile == "jewel") {
9745 newcrush.set_tunables_jewel();
9746 } else if (profile == "optimal") {
9747 newcrush.set_tunables_optimal();
9748 } else if (profile == "default") {
9749 newcrush.set_tunables_default();
9750 } else {
9751 ss << "unrecognized profile '" << profile << "'";
9752 err = -EINVAL;
9753 goto reply;
9754 }
9755
9756 if (!validate_crush_against_features(&newcrush, ss)) {
9757 err = -EINVAL;
9758 goto reply;
9759 }
9760
9761 pending_inc.crush.clear();
9762 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9763 ss << "adjusted tunables profile to " << profile;
9764 getline(ss, rs);
9765 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9766 get_last_committed() + 1));
9767 return true;
9768 } else if (prefix == "osd crush set-tunable") {
9769 CrushWrapper newcrush;
9770 _get_pending_crush(newcrush);
9771
9772 err = 0;
9773 string tunable;
9774 cmd_getval(cct, cmdmap, "tunable", tunable);
9775
9776 int64_t value = -1;
9777 if (!cmd_getval(cct, cmdmap, "value", value)) {
9778 err = -EINVAL;
9779 ss << "failed to parse integer value "
9780 << cmd_vartype_stringify(cmdmap.at("value"));
9781 goto reply;
9782 }
9783
9784 if (tunable == "straw_calc_version") {
9785 if (value != 0 && value != 1) {
9786 ss << "value must be 0 or 1; got " << value;
9787 err = -EINVAL;
9788 goto reply;
9789 }
9790 newcrush.set_straw_calc_version(value);
9791 } else {
9792 ss << "unrecognized tunable '" << tunable << "'";
9793 err = -EINVAL;
9794 goto reply;
9795 }
9796
9797 if (!validate_crush_against_features(&newcrush, ss)) {
9798 err = -EINVAL;
9799 goto reply;
9800 }
9801
9802 pending_inc.crush.clear();
9803 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9804 ss << "adjusted tunable " << tunable << " to " << value;
9805 getline(ss, rs);
9806 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9807 get_last_committed() + 1));
9808 return true;
9809
9810 } else if (prefix == "osd crush rule create-simple") {
9811 string name, root, type, mode;
9812 cmd_getval(cct, cmdmap, "name", name);
9813 cmd_getval(cct, cmdmap, "root", root);
9814 cmd_getval(cct, cmdmap, "type", type);
9815 cmd_getval(cct, cmdmap, "mode", mode);
9816 if (mode == "")
9817 mode = "firstn";
9818
9819 if (osdmap.crush->rule_exists(name)) {
9820 // The name is uniquely associated to a ruleid and the rule it contains
9821 // From the user point of view, the rule is more meaningfull.
9822 ss << "rule " << name << " already exists";
9823 err = 0;
9824 goto reply;
9825 }
9826
9827 CrushWrapper newcrush;
9828 _get_pending_crush(newcrush);
9829
9830 if (newcrush.rule_exists(name)) {
9831 // The name is uniquely associated to a ruleid and the rule it contains
9832 // From the user point of view, the rule is more meaningfull.
9833 ss << "rule " << name << " already exists";
9834 err = 0;
9835 } else {
9836 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
9837 pg_pool_t::TYPE_REPLICATED, &ss);
9838 if (ruleno < 0) {
9839 err = ruleno;
9840 goto reply;
9841 }
9842
9843 pending_inc.crush.clear();
9844 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9845 }
9846 getline(ss, rs);
9847 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9848 get_last_committed() + 1));
9849 return true;
9850
9851 } else if (prefix == "osd crush rule create-replicated") {
9852 string name, root, type, device_class;
9853 cmd_getval(cct, cmdmap, "name", name);
9854 cmd_getval(cct, cmdmap, "root", root);
9855 cmd_getval(cct, cmdmap, "type", type);
9856 cmd_getval(cct, cmdmap, "class", device_class);
9857
9858 if (osdmap.crush->rule_exists(name)) {
9859 // The name is uniquely associated to a ruleid and the rule it contains
9860 // From the user point of view, the rule is more meaningfull.
9861 ss << "rule " << name << " already exists";
9862 err = 0;
9863 goto reply;
9864 }
9865
9866 CrushWrapper newcrush;
9867 _get_pending_crush(newcrush);
9868
9869 if (newcrush.rule_exists(name)) {
9870 // The name is uniquely associated to a ruleid and the rule it contains
9871 // From the user point of view, the rule is more meaningfull.
9872 ss << "rule " << name << " already exists";
9873 err = 0;
9874 } else {
9875 int ruleno = newcrush.add_simple_rule(
9876 name, root, type, device_class,
9877 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
9878 if (ruleno < 0) {
9879 err = ruleno;
9880 goto reply;
9881 }
9882
9883 pending_inc.crush.clear();
9884 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9885 }
9886 getline(ss, rs);
9887 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9888 get_last_committed() + 1));
9889 return true;
9890
9891 } else if (prefix == "osd erasure-code-profile rm") {
9892 string name;
9893 cmd_getval(cct, cmdmap, "name", name);
9894
9895 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
9896 goto wait;
9897
9898 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
9899 err = -EBUSY;
9900 goto reply;
9901 }
9902
9903 if (osdmap.has_erasure_code_profile(name) ||
9904 pending_inc.new_erasure_code_profiles.count(name)) {
9905 if (osdmap.has_erasure_code_profile(name)) {
9906 pending_inc.old_erasure_code_profiles.push_back(name);
9907 } else {
9908 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
9909 pending_inc.new_erasure_code_profiles.erase(name);
9910 }
9911
9912 getline(ss, rs);
9913 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9914 get_last_committed() + 1));
9915 return true;
9916 } else {
9917 ss << "erasure-code-profile " << name << " does not exist";
9918 err = 0;
9919 goto reply;
9920 }
9921
9922 } else if (prefix == "osd erasure-code-profile set") {
9923 string name;
9924 cmd_getval(cct, cmdmap, "name", name);
9925 vector<string> profile;
9926 cmd_getval(cct, cmdmap, "profile", profile);
9927
9928 bool force = false;
9929 cmd_getval(cct, cmdmap, "force", force);
9930
9931 map<string,string> profile_map;
9932 err = parse_erasure_code_profile(profile, &profile_map, &ss);
9933 if (err)
9934 goto reply;
9935 if (profile_map.find("plugin") == profile_map.end()) {
9936 ss << "erasure-code-profile " << profile_map
9937 << " must contain a plugin entry" << std::endl;
9938 err = -EINVAL;
9939 goto reply;
9940 }
9941 string plugin = profile_map["plugin"];
9942
9943 if (pending_inc.has_erasure_code_profile(name)) {
9944 dout(20) << "erasure code profile " << name << " try again" << dendl;
9945 goto wait;
9946 } else {
9947 err = normalize_profile(name, profile_map, force, &ss);
9948 if (err)
9949 goto reply;
9950
9951 if (osdmap.has_erasure_code_profile(name)) {
9952 ErasureCodeProfile existing_profile_map =
9953 osdmap.get_erasure_code_profile(name);
9954 err = normalize_profile(name, existing_profile_map, force, &ss);
9955 if (err)
9956 goto reply;
9957
9958 if (existing_profile_map == profile_map) {
9959 err = 0;
9960 goto reply;
9961 }
9962 if (!force) {
9963 err = -EPERM;
9964 ss << "will not override erasure code profile " << name
9965 << " because the existing profile "
9966 << existing_profile_map
9967 << " is different from the proposed profile "
9968 << profile_map;
9969 goto reply;
9970 }
9971 }
9972
9973 dout(20) << "erasure code profile set " << name << "="
9974 << profile_map << dendl;
9975 pending_inc.set_erasure_code_profile(name, profile_map);
9976 }
9977
9978 getline(ss, rs);
9979 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9980 get_last_committed() + 1));
9981 return true;
9982
9983 } else if (prefix == "osd crush rule create-erasure") {
9984 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
9985 if (err == -EAGAIN)
9986 goto wait;
9987 if (err)
9988 goto reply;
9989 string name, poolstr;
9990 cmd_getval(cct, cmdmap, "name", name);
9991 string profile;
9992 cmd_getval(cct, cmdmap, "profile", profile);
9993 if (profile == "")
9994 profile = "default";
9995 if (profile == "default") {
9996 if (!osdmap.has_erasure_code_profile(profile)) {
9997 if (pending_inc.has_erasure_code_profile(profile)) {
9998 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
9999 goto wait;
10000 }
10001
10002 map<string,string> profile_map;
10003 err = osdmap.get_erasure_code_profile_default(cct,
10004 profile_map,
10005 &ss);
10006 if (err)
10007 goto reply;
10008 err = normalize_profile(name, profile_map, true, &ss);
10009 if (err)
10010 goto reply;
10011 dout(20) << "erasure code profile set " << profile << "="
10012 << profile_map << dendl;
10013 pending_inc.set_erasure_code_profile(profile, profile_map);
10014 goto wait;
10015 }
10016 }
10017
10018 int rule;
10019 err = crush_rule_create_erasure(name, profile, &rule, &ss);
10020 if (err < 0) {
10021 switch(err) {
10022 case -EEXIST: // return immediately
10023 ss << "rule " << name << " already exists";
10024 err = 0;
10025 goto reply;
10026 break;
10027 case -EALREADY: // wait for pending to be proposed
10028 ss << "rule " << name << " already exists";
10029 err = 0;
10030 break;
10031 default: // non recoverable error
10032 goto reply;
10033 break;
10034 }
10035 } else {
10036 ss << "created rule " << name << " at " << rule;
10037 }
10038
10039 getline(ss, rs);
10040 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10041 get_last_committed() + 1));
10042 return true;
10043
10044 } else if (prefix == "osd crush rule rm") {
10045 string name;
10046 cmd_getval(cct, cmdmap, "name", name);
10047
10048 if (!osdmap.crush->rule_exists(name)) {
10049 ss << "rule " << name << " does not exist";
10050 err = 0;
10051 goto reply;
10052 }
10053
10054 CrushWrapper newcrush;
10055 _get_pending_crush(newcrush);
10056
10057 if (!newcrush.rule_exists(name)) {
10058 ss << "rule " << name << " does not exist";
10059 err = 0;
10060 } else {
10061 int ruleno = newcrush.get_rule_id(name);
10062 ceph_assert(ruleno >= 0);
10063
10064 // make sure it is not in use.
10065 // FIXME: this is ok in some situations, but let's not bother with that
10066 // complexity now.
10067 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
10068 if (osdmap.crush_rule_in_use(ruleset)) {
10069 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10070 err = -EBUSY;
10071 goto reply;
10072 }
10073
10074 err = newcrush.remove_rule(ruleno);
10075 if (err < 0) {
10076 goto reply;
10077 }
10078
10079 pending_inc.crush.clear();
10080 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10081 }
10082 getline(ss, rs);
10083 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10084 get_last_committed() + 1));
10085 return true;
10086
10087 } else if (prefix == "osd crush rule rename") {
10088 string srcname;
10089 string dstname;
10090 cmd_getval(cct, cmdmap, "srcname", srcname);
10091 cmd_getval(cct, cmdmap, "dstname", dstname);
10092 if (srcname.empty() || dstname.empty()) {
10093 ss << "must specify both source rule name and destination rule name";
10094 err = -EINVAL;
10095 goto reply;
10096 }
10097 if (srcname == dstname) {
10098 ss << "destination rule name is equal to source rule name";
10099 err = 0;
10100 goto reply;
10101 }
10102
10103 CrushWrapper newcrush;
10104 _get_pending_crush(newcrush);
10105 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10106 // srcname does not exist and dstname already exists
10107 // suppose this is a replay and return success
10108 // (so this command is idempotent)
10109 ss << "already renamed to '" << dstname << "'";
10110 err = 0;
10111 goto reply;
10112 }
10113
10114 err = newcrush.rename_rule(srcname, dstname, &ss);
10115 if (err < 0) {
10116 // ss has reason for failure
10117 goto reply;
10118 }
10119 pending_inc.crush.clear();
10120 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10121 getline(ss, rs);
10122 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10123 get_last_committed() + 1));
10124 return true;
10125
10126 } else if (prefix == "osd setmaxosd") {
10127 int64_t newmax;
10128 if (!cmd_getval(cct, cmdmap, "newmax", newmax)) {
10129 ss << "unable to parse 'newmax' value '"
10130 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
10131 err = -EINVAL;
10132 goto reply;
10133 }
10134
10135 if (newmax > g_conf()->mon_max_osd) {
10136 err = -ERANGE;
10137 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
10138 << g_conf()->mon_max_osd << ")";
10139 goto reply;
10140 }
10141
10142 // Don't allow shrinking OSD number as this will cause data loss
10143 // and may cause kernel crashes.
10144 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10145 if (newmax < osdmap.get_max_osd()) {
10146 // Check if the OSDs exist between current max and new value.
10147 // If there are any OSDs exist, then don't allow shrinking number
10148 // of OSDs.
10149 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
10150 if (osdmap.exists(i)) {
10151 err = -EBUSY;
10152 ss << "cannot shrink max_osd to " << newmax
10153 << " because osd." << i << " (and possibly others) still in use";
10154 goto reply;
10155 }
10156 }
10157 }
10158
10159 pending_inc.new_max_osd = newmax;
10160 ss << "set new max_osd = " << pending_inc.new_max_osd;
10161 getline(ss, rs);
10162 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10163 get_last_committed() + 1));
10164 return true;
10165
10166 } else if (prefix == "osd set-full-ratio" ||
10167 prefix == "osd set-backfillfull-ratio" ||
10168 prefix == "osd set-nearfull-ratio") {
10169 double n;
10170 if (!cmd_getval(cct, cmdmap, "ratio", n)) {
10171 ss << "unable to parse 'ratio' value '"
10172 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
10173 err = -EINVAL;
10174 goto reply;
10175 }
10176 if (prefix == "osd set-full-ratio")
10177 pending_inc.new_full_ratio = n;
10178 else if (prefix == "osd set-backfillfull-ratio")
10179 pending_inc.new_backfillfull_ratio = n;
10180 else if (prefix == "osd set-nearfull-ratio")
10181 pending_inc.new_nearfull_ratio = n;
10182 ss << prefix << " " << n;
10183 getline(ss, rs);
10184 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10185 get_last_committed() + 1));
10186 return true;
10187 } else if (prefix == "osd set-require-min-compat-client") {
10188 string v;
10189 cmd_getval(cct, cmdmap, "version", v);
10190 int vno = ceph_release_from_name(v.c_str());
10191 if (vno <= 0) {
10192 ss << "version " << v << " is not recognized";
10193 err = -EINVAL;
10194 goto reply;
10195 }
10196 OSDMap newmap;
10197 newmap.deepish_copy_from(osdmap);
10198 newmap.apply_incremental(pending_inc);
10199 newmap.require_min_compat_client = vno;
10200 auto mvno = newmap.get_min_compat_client();
10201 if (vno < mvno) {
10202 ss << "osdmap current utilizes features that require "
10203 << ceph_release_name(mvno)
10204 << "; cannot set require_min_compat_client below that to "
10205 << ceph_release_name(vno);
10206 err = -EPERM;
10207 goto reply;
10208 }
10209 bool sure = false;
10210 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10211 if (!sure) {
10212 FeatureMap m;
10213 mon->get_combined_feature_map(&m);
10214 uint64_t features = ceph_release_features(vno);
10215 bool first = true;
10216 bool ok = true;
10217 for (int type : {
10218 CEPH_ENTITY_TYPE_CLIENT,
10219 CEPH_ENTITY_TYPE_MDS,
10220 CEPH_ENTITY_TYPE_MGR }) {
10221 auto p = m.m.find(type);
10222 if (p == m.m.end()) {
10223 continue;
10224 }
10225 for (auto& q : p->second) {
10226 uint64_t missing = ~q.first & features;
10227 if (missing) {
10228 if (first) {
10229 ss << "cannot set require_min_compat_client to " << v << ": ";
10230 } else {
10231 ss << "; ";
10232 }
10233 first = false;
10234 ss << q.second << " connected " << ceph_entity_type_name(type)
10235 << "(s) look like " << ceph_release_name(
10236 ceph_release_from_features(q.first))
10237 << " (missing 0x" << std::hex << missing << std::dec << ")";
10238 ok = false;
10239 }
10240 }
10241 }
10242 if (!ok) {
10243 ss << "; add --yes-i-really-mean-it to do it anyway";
10244 err = -EPERM;
10245 goto reply;
10246 }
10247 }
10248 ss << "set require_min_compat_client to " << ceph_release_name(vno);
10249 pending_inc.new_require_min_compat_client = vno;
10250 getline(ss, rs);
10251 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10252 get_last_committed() + 1));
10253 return true;
10254 } else if (prefix == "osd pause") {
10255 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10256
10257 } else if (prefix == "osd unpause") {
10258 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10259
10260 } else if (prefix == "osd set") {
10261 bool sure = false;
10262 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
10263
10264 string key;
10265 cmd_getval(cct, cmdmap, "key", key);
10266 if (key == "full")
10267 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
10268 else if (key == "pause")
10269 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10270 else if (key == "noup")
10271 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
10272 else if (key == "nodown")
10273 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
10274 else if (key == "noout")
10275 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
10276 else if (key == "noin")
10277 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
10278 else if (key == "nobackfill")
10279 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
10280 else if (key == "norebalance")
10281 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
10282 else if (key == "norecover")
10283 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
10284 else if (key == "noscrub")
10285 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
10286 else if (key == "nodeep-scrub")
10287 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10288 else if (key == "notieragent")
10289 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10290 else if (key == "nosnaptrim")
10291 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10292 else if (key == "pglog_hardlimit") {
10293 if (!osdmap.get_num_up_osds() && !sure) {
10294 ss << "Not advisable to continue since no OSDs are up. Pass "
10295 << "--yes-i-really-mean-it if you really wish to continue.";
10296 err = -EPERM;
10297 goto reply;
10298 }
10299 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10300 // we are reusing a jewel feature bit that was retired in luminous.
10301 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
10302 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
10303 || sure)) {
10304 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
10305 } else {
10306 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10307 err = -EPERM;
10308 goto reply;
10309 }
10310 } else {
10311 ss << "unrecognized flag '" << key << "'";
10312 err = -EINVAL;
10313 }
10314
10315 } else if (prefix == "osd unset") {
10316 string key;
10317 cmd_getval(cct, cmdmap, "key", key);
10318 if (key == "full")
10319 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
10320 else if (key == "pause")
10321 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10322 else if (key == "noup")
10323 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
10324 else if (key == "nodown")
10325 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
10326 else if (key == "noout")
10327 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
10328 else if (key == "noin")
10329 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
10330 else if (key == "nobackfill")
10331 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
10332 else if (key == "norebalance")
10333 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
10334 else if (key == "norecover")
10335 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
10336 else if (key == "noscrub")
10337 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
10338 else if (key == "nodeep-scrub")
10339 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10340 else if (key == "notieragent")
10341 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10342 else if (key == "nosnaptrim")
10343 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10344 else {
10345 ss << "unrecognized flag '" << key << "'";
10346 err = -EINVAL;
10347 }
10348
10349 } else if (prefix == "osd require-osd-release") {
10350 string release;
10351 cmd_getval(cct, cmdmap, "release", release);
10352 bool sure = false;
10353 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10354 int rel = ceph_release_from_name(release.c_str());
10355 if (rel <= 0) {
10356 ss << "unrecognized release " << release;
10357 err = -EINVAL;
10358 goto reply;
10359 }
10360 if (rel == osdmap.require_osd_release) {
10361 // idempotent
10362 err = 0;
10363 goto reply;
10364 }
10365 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
10366 if (!osdmap.get_num_up_osds() && !sure) {
10367 ss << "Not advisable to continue since no OSDs are up. Pass "
10368 << "--yes-i-really-mean-it if you really wish to continue.";
10369 err = -EPERM;
10370 goto reply;
10371 }
10372 if (rel == CEPH_RELEASE_MIMIC) {
10373 if (!mon->monmap->get_required_features().contains_all(
10374 ceph::features::mon::FEATURE_MIMIC)) {
10375 ss << "not all mons are mimic";
10376 err = -EPERM;
10377 goto reply;
10378 }
10379 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
10380 && !sure) {
10381 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10382 err = -EPERM;
10383 goto reply;
10384 }
10385 } else if (rel == CEPH_RELEASE_NAUTILUS) {
10386 if (!mon->monmap->get_required_features().contains_all(
10387 ceph::features::mon::FEATURE_NAUTILUS)) {
10388 ss << "not all mons are nautilus";
10389 err = -EPERM;
10390 goto reply;
10391 }
10392 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
10393 && !sure) {
10394 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
10395 err = -EPERM;
10396 goto reply;
10397 }
10398 } else {
10399 ss << "not supported for this release yet";
10400 err = -EPERM;
10401 goto reply;
10402 }
10403 if (rel < osdmap.require_osd_release) {
10404 ss << "require_osd_release cannot be lowered once it has been set";
10405 err = -EPERM;
10406 goto reply;
10407 }
10408 pending_inc.new_require_osd_release = rel;
10409 goto update;
10410 } else if (prefix == "osd down" ||
10411 prefix == "osd out" ||
10412 prefix == "osd in" ||
10413 prefix == "osd rm") {
10414
10415 bool any = false;
10416 bool stop = false;
10417 bool verbose = true;
10418
10419 vector<string> idvec;
10420 cmd_getval(cct, cmdmap, "ids", idvec);
10421 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10422 set<int> osds;
10423
10424 // wildcard?
10425 if (j == 0 &&
10426 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10427 if (prefix == "osd in") {
10428 // touch out osds only
10429 osdmap.get_out_existing_osds(osds);
10430 } else {
10431 osdmap.get_all_osds(osds);
10432 }
10433 stop = true;
10434 verbose = false; // so the output is less noisy.
10435 } else {
10436 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10437 if (osd < 0) {
10438 ss << "invalid osd id" << osd;
10439 err = -EINVAL;
10440 continue;
10441 } else if (!osdmap.exists(osd)) {
10442 ss << "osd." << osd << " does not exist. ";
10443 continue;
10444 }
10445
10446 osds.insert(osd);
10447 }
10448
10449 for (auto &osd : osds) {
10450 if (prefix == "osd down") {
10451 if (osdmap.is_down(osd)) {
10452 if (verbose)
10453 ss << "osd." << osd << " is already down. ";
10454 } else {
10455 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
10456 ss << "marked down osd." << osd << ". ";
10457 any = true;
10458 }
10459 } else if (prefix == "osd out") {
10460 if (osdmap.is_out(osd)) {
10461 if (verbose)
10462 ss << "osd." << osd << " is already out. ";
10463 } else {
10464 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
10465 if (osdmap.osd_weight[osd]) {
10466 if (pending_inc.new_xinfo.count(osd) == 0) {
10467 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10468 }
10469 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
10470 }
10471 ss << "marked out osd." << osd << ". ";
10472 std::ostringstream msg;
10473 msg << "Client " << op->get_session()->entity_name
10474 << " marked osd." << osd << " out";
10475 if (osdmap.is_up(osd)) {
10476 msg << ", while it was still marked up";
10477 } else {
10478 auto period = ceph_clock_now() - down_pending_out[osd];
10479 msg << ", after it was down for " << int(period.sec())
10480 << " seconds";
10481 }
10482
10483 mon->clog->info() << msg.str();
10484 any = true;
10485 }
10486 } else if (prefix == "osd in") {
10487 if (osdmap.is_in(osd)) {
10488 if (verbose)
10489 ss << "osd." << osd << " is already in. ";
10490 } else {
10491 if (osdmap.osd_xinfo[osd].old_weight > 0) {
10492 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
10493 if (pending_inc.new_xinfo.count(osd) == 0) {
10494 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10495 }
10496 pending_inc.new_xinfo[osd].old_weight = 0;
10497 } else {
10498 pending_inc.new_weight[osd] = CEPH_OSD_IN;
10499 }
10500 ss << "marked in osd." << osd << ". ";
10501 any = true;
10502 }
10503 } else if (prefix == "osd rm") {
10504 err = prepare_command_osd_remove(osd);
10505
10506 if (err == -EBUSY) {
10507 if (any)
10508 ss << ", ";
10509 ss << "osd." << osd << " is still up; must be down before removal. ";
10510 } else {
10511 ceph_assert(err == 0);
10512 if (any) {
10513 ss << ", osd." << osd;
10514 } else {
10515 ss << "removed osd." << osd;
10516 }
10517 any = true;
10518 }
10519 }
10520 }
10521 }
10522 if (any) {
10523 getline(ss, rs);
10524 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10525 get_last_committed() + 1));
10526 return true;
10527 }
10528 } else if (prefix == "osd set-group" ||
10529 prefix == "osd unset-group" ||
10530 prefix == "osd add-noup" ||
10531 prefix == "osd add-nodown" ||
10532 prefix == "osd add-noin" ||
10533 prefix == "osd add-noout" ||
10534 prefix == "osd rm-noup" ||
10535 prefix == "osd rm-nodown" ||
10536 prefix == "osd rm-noin" ||
10537 prefix == "osd rm-noout") {
10538 bool do_set = prefix == "osd set-group" ||
10539 prefix.find("add") != string::npos;
10540 string flag_str;
10541 unsigned flags = 0;
10542 vector<string> who;
10543 if (prefix == "osd set-group" || prefix == "osd unset-group") {
10544 cmd_getval(cct, cmdmap, "flags", flag_str);
10545 cmd_getval(cct, cmdmap, "who", who);
10546 vector<string> raw_flags;
10547 boost::split(raw_flags, flag_str, boost::is_any_of(","));
10548 for (auto& f : raw_flags) {
10549 if (f == "noup")
10550 flags |= CEPH_OSD_NOUP;
10551 else if (f == "nodown")
10552 flags |= CEPH_OSD_NODOWN;
10553 else if (f == "noin")
10554 flags |= CEPH_OSD_NOIN;
10555 else if (f == "noout")
10556 flags |= CEPH_OSD_NOOUT;
10557 else {
10558 ss << "unrecognized flag '" << f << "', must be one of "
10559 << "{noup,nodown,noin,noout}";
10560 err = -EINVAL;
10561 goto reply;
10562 }
10563 }
10564 } else {
10565 cmd_getval(cct, cmdmap, "ids", who);
10566 if (prefix.find("noup") != string::npos)
10567 flags = CEPH_OSD_NOUP;
10568 else if (prefix.find("nodown") != string::npos)
10569 flags = CEPH_OSD_NODOWN;
10570 else if (prefix.find("noin") != string::npos)
10571 flags = CEPH_OSD_NOIN;
10572 else if (prefix.find("noout") != string::npos)
10573 flags = CEPH_OSD_NOOUT;
10574 else
10575 ceph_assert(0 == "Unreachable!");
10576 }
10577 if (flags == 0) {
10578 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10579 err = -EINVAL;
10580 goto reply;
10581 }
10582 if (who.empty()) {
10583 ss << "must specify at least one or more targets to set/unset";
10584 err = -EINVAL;
10585 goto reply;
10586 }
10587 set<int> osds;
10588 set<int> crush_nodes;
10589 set<int> device_classes;
10590 for (auto& w : who) {
10591 if (w == "any" || w == "all" || w == "*") {
10592 osdmap.get_all_osds(osds);
10593 break;
10594 }
10595 std::stringstream ts;
10596 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
10597 osds.insert(osd);
10598 } else if (osdmap.crush->name_exists(w)) {
10599 crush_nodes.insert(osdmap.crush->get_item_id(w));
10600 } else if (osdmap.crush->class_exists(w)) {
10601 device_classes.insert(osdmap.crush->get_class_id(w));
10602 } else {
10603 ss << "unable to parse osd id or crush node or device class: "
10604 << "\"" << w << "\". ";
10605 }
10606 }
10607 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
10608 // ss has reason for failure
10609 err = -EINVAL;
10610 goto reply;
10611 }
10612 bool any = false;
10613 for (auto osd : osds) {
10614 if (!osdmap.exists(osd)) {
10615 ss << "osd." << osd << " does not exist. ";
10616 continue;
10617 }
10618 if (do_set) {
10619 if (flags & CEPH_OSD_NOUP) {
10620 any |= osdmap.is_noup_by_osd(osd) ?
10621 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
10622 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
10623 }
10624 if (flags & CEPH_OSD_NODOWN) {
10625 any |= osdmap.is_nodown_by_osd(osd) ?
10626 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
10627 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
10628 }
10629 if (flags & CEPH_OSD_NOIN) {
10630 any |= osdmap.is_noin_by_osd(osd) ?
10631 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
10632 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
10633 }
10634 if (flags & CEPH_OSD_NOOUT) {
10635 any |= osdmap.is_noout_by_osd(osd) ?
10636 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
10637 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
10638 }
10639 } else {
10640 if (flags & CEPH_OSD_NOUP) {
10641 any |= osdmap.is_noup_by_osd(osd) ?
10642 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
10643 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
10644 }
10645 if (flags & CEPH_OSD_NODOWN) {
10646 any |= osdmap.is_nodown_by_osd(osd) ?
10647 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
10648 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
10649 }
10650 if (flags & CEPH_OSD_NOIN) {
10651 any |= osdmap.is_noin_by_osd(osd) ?
10652 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
10653 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
10654 }
10655 if (flags & CEPH_OSD_NOOUT) {
10656 any |= osdmap.is_noout_by_osd(osd) ?
10657 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
10658 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
10659 }
10660 }
10661 }
10662 for (auto& id : crush_nodes) {
10663 auto old_flags = osdmap.get_crush_node_flags(id);
10664 auto& pending_flags = pending_inc.new_crush_node_flags[id];
10665 pending_flags |= old_flags; // adopt existing flags first!
10666 if (do_set) {
10667 pending_flags |= flags;
10668 } else {
10669 pending_flags &= ~flags;
10670 }
10671 any = true;
10672 }
10673 for (auto& id : device_classes) {
10674 auto old_flags = osdmap.get_device_class_flags(id);
10675 auto& pending_flags = pending_inc.new_device_class_flags[id];
10676 pending_flags |= old_flags;
10677 if (do_set) {
10678 pending_flags |= flags;
10679 } else {
10680 pending_flags &= ~flags;
10681 }
10682 any = true;
10683 }
10684 if (any) {
10685 getline(ss, rs);
10686 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10687 get_last_committed() + 1));
10688 return true;
10689 }
10690 } else if (prefix == "osd pg-temp") {
10691 string pgidstr;
10692 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10693 ss << "unable to parse 'pgid' value '"
10694 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10695 err = -EINVAL;
10696 goto reply;
10697 }
10698 pg_t pgid;
10699 if (!pgid.parse(pgidstr.c_str())) {
10700 ss << "invalid pgid '" << pgidstr << "'";
10701 err = -EINVAL;
10702 goto reply;
10703 }
10704 if (!osdmap.pg_exists(pgid)) {
10705 ss << "pg " << pgid << " does not exist";
10706 err = -ENOENT;
10707 goto reply;
10708 }
10709 if (pending_inc.new_pg_temp.count(pgid)) {
10710 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
10711 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10712 return true;
10713 }
10714
10715 vector<int64_t> id_vec;
10716 vector<int32_t> new_pg_temp;
10717 cmd_getval(cct, cmdmap, "id", id_vec);
10718 if (id_vec.empty()) {
10719 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
10720 ss << "done cleaning up pg_temp of " << pgid;
10721 goto update;
10722 }
10723 for (auto osd : id_vec) {
10724 if (!osdmap.exists(osd)) {
10725 ss << "osd." << osd << " does not exist";
10726 err = -ENOENT;
10727 goto reply;
10728 }
10729 new_pg_temp.push_back(osd);
10730 }
10731
10732 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10733 if ((int)new_pg_temp.size() < pool_min_size) {
10734 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
10735 << pool_min_size << ")";
10736 err = -EINVAL;
10737 goto reply;
10738 }
10739
10740 int pool_size = osdmap.get_pg_pool_size(pgid);
10741 if ((int)new_pg_temp.size() > pool_size) {
10742 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
10743 << pool_size << ")";
10744 err = -EINVAL;
10745 goto reply;
10746 }
10747
10748 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
10749 new_pg_temp.begin(), new_pg_temp.end());
10750 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
10751 goto update;
  } else if (prefix == "osd primary-temp") {
    // Set an explicit temporary primary for a PG via pending_inc.
    // Validates the pgid, the osd id, and the cluster's client-compat level
    // before recording the mapping.
    string pgidstr;
    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cct, cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // An id of -1 is accepted without an existence check (presumably it
    // clears the primary_temp mapping — confirm against OSDMap semantics).
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary_temp requires clients new enough to understand it (firefly+).
    if (osdmap.require_min_compat_client > 0 &&
        osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
      ss << "require_min_compat_client "
         << ceph_release_name(osdmap.require_min_compat_client)
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by perturbing its pg_temp mapping; the OSDs will
    // then converge back to the mapping they actually want.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      // No primary to perturb; ask the caller to retry later.
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change. Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        // Skip the current primary and any osd that is down or nonexistent.
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
10842 } else if (prefix == "osd pg-upmap" ||
10843 prefix == "osd rm-pg-upmap" ||
10844 prefix == "osd pg-upmap-items" ||
10845 prefix == "osd rm-pg-upmap-items") {
10846 if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
10847 ss << "min_compat_client "
10848 << ceph_release_name(osdmap.require_min_compat_client)
10849 << " < luminous, which is required for pg-upmap. "
10850 << "Try 'ceph osd set-require-min-compat-client luminous' "
10851 << "before using the new interface";
10852 err = -EPERM;
10853 goto reply;
10854 }
10855 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
10856 if (err == -EAGAIN)
10857 goto wait;
10858 if (err < 0)
10859 goto reply;
10860 string pgidstr;
10861 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10862 ss << "unable to parse 'pgid' value '"
10863 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10864 err = -EINVAL;
10865 goto reply;
10866 }
10867 pg_t pgid;
10868 if (!pgid.parse(pgidstr.c_str())) {
10869 ss << "invalid pgid '" << pgidstr << "'";
10870 err = -EINVAL;
10871 goto reply;
10872 }
10873 if (!osdmap.pg_exists(pgid)) {
10874 ss << "pg " << pgid << " does not exist";
10875 err = -ENOENT;
10876 goto reply;
10877 }
10878 if (pending_inc.old_pools.count(pgid.pool())) {
10879 ss << "pool of " << pgid << " is pending removal";
10880 err = -ENOENT;
10881 getline(ss, rs);
10882 wait_for_finished_proposal(op,
10883 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
10884 return true;
10885 }
10886
10887 enum {
10888 OP_PG_UPMAP,
10889 OP_RM_PG_UPMAP,
10890 OP_PG_UPMAP_ITEMS,
10891 OP_RM_PG_UPMAP_ITEMS,
10892 } option;
10893
10894 if (prefix == "osd pg-upmap") {
10895 option = OP_PG_UPMAP;
10896 } else if (prefix == "osd rm-pg-upmap") {
10897 option = OP_RM_PG_UPMAP;
10898 } else if (prefix == "osd pg-upmap-items") {
10899 option = OP_PG_UPMAP_ITEMS;
10900 } else {
10901 option = OP_RM_PG_UPMAP_ITEMS;
10902 }
10903
10904 // check pending upmap changes
10905 switch (option) {
10906 case OP_PG_UPMAP: // fall through
10907 case OP_RM_PG_UPMAP:
10908 if (pending_inc.new_pg_upmap.count(pgid) ||
10909 pending_inc.old_pg_upmap.count(pgid)) {
10910 dout(10) << __func__ << " waiting for pending update on "
10911 << pgid << dendl;
10912 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10913 return true;
10914 }
10915 break;
10916
10917 case OP_PG_UPMAP_ITEMS: // fall through
10918 case OP_RM_PG_UPMAP_ITEMS:
10919 if (pending_inc.new_pg_upmap_items.count(pgid) ||
10920 pending_inc.old_pg_upmap_items.count(pgid)) {
10921 dout(10) << __func__ << " waiting for pending update on "
10922 << pgid << dendl;
10923 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10924 return true;
10925 }
10926 break;
10927
10928 default:
10929 ceph_abort_msg("invalid option");
10930 }
10931
10932 switch (option) {
10933 case OP_PG_UPMAP:
10934 {
10935 vector<int64_t> id_vec;
10936 if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
10937 ss << "unable to parse 'id' value(s) '"
10938 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
10939 err = -EINVAL;
10940 goto reply;
10941 }
10942
10943 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10944 if ((int)id_vec.size() < pool_min_size) {
10945 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
10946 << pool_min_size << ")";
10947 err = -EINVAL;
10948 goto reply;
10949 }
10950
10951 int pool_size = osdmap.get_pg_pool_size(pgid);
10952 if ((int)id_vec.size() > pool_size) {
10953 ss << "num of osds (" << id_vec.size() <<") > pool size ("
10954 << pool_size << ")";
10955 err = -EINVAL;
10956 goto reply;
10957 }
10958
10959 vector<int32_t> new_pg_upmap;
10960 for (auto osd : id_vec) {
10961 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
10962 ss << "osd." << osd << " does not exist";
10963 err = -ENOENT;
10964 goto reply;
10965 }
10966 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
10967 if (it != new_pg_upmap.end()) {
10968 ss << "osd." << osd << " already exists, ";
10969 continue;
10970 }
10971 new_pg_upmap.push_back(osd);
10972 }
10973
10974 if (new_pg_upmap.empty()) {
10975 ss << "no valid upmap items(pairs) is specified";
10976 err = -EINVAL;
10977 goto reply;
10978 }
10979
10980 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
10981 new_pg_upmap.begin(), new_pg_upmap.end());
10982 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
10983 }
10984 break;
10985
10986 case OP_RM_PG_UPMAP:
10987 {
10988 pending_inc.old_pg_upmap.insert(pgid);
10989 ss << "clear " << pgid << " pg_upmap mapping";
10990 }
10991 break;
10992
10993 case OP_PG_UPMAP_ITEMS:
10994 {
10995 vector<int64_t> id_vec;
10996 if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
10997 ss << "unable to parse 'id' value(s) '"
10998 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
10999 err = -EINVAL;
11000 goto reply;
11001 }
11002
11003 if (id_vec.size() % 2) {
11004 ss << "you must specify pairs of osd ids to be remapped";
11005 err = -EINVAL;
11006 goto reply;
11007 }
11008
11009 int pool_size = osdmap.get_pg_pool_size(pgid);
11010 if ((int)(id_vec.size() / 2) > pool_size) {
11011 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
11012 << pool_size << ")";
11013 err = -EINVAL;
11014 goto reply;
11015 }
11016
11017 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
11018 ostringstream items;
11019 items << "[";
11020 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11021 int from = *p++;
11022 int to = *p;
11023 if (from == to) {
11024 ss << "from osd." << from << " == to osd." << to << ", ";
11025 continue;
11026 }
11027 if (!osdmap.exists(from)) {
11028 ss << "osd." << from << " does not exist";
11029 err = -ENOENT;
11030 goto reply;
11031 }
11032 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11033 ss << "osd." << to << " does not exist";
11034 err = -ENOENT;
11035 goto reply;
11036 }
11037 pair<int32_t,int32_t> entry = make_pair(from, to);
11038 auto it = std::find(new_pg_upmap_items.begin(),
11039 new_pg_upmap_items.end(), entry);
11040 if (it != new_pg_upmap_items.end()) {
11041 ss << "osd." << from << " -> osd." << to << " already exists, ";
11042 continue;
11043 }
11044 new_pg_upmap_items.push_back(entry);
11045 items << from << "->" << to << ",";
11046 }
11047 string out(items.str());
11048 out.resize(out.size() - 1); // drop last ','
11049 out += "]";
11050
11051 if (new_pg_upmap_items.empty()) {
11052 ss << "no valid upmap items(pairs) is specified";
11053 err = -EINVAL;
11054 goto reply;
11055 }
11056
11057 pending_inc.new_pg_upmap_items[pgid] =
11058 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11059 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11060 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11061 }
11062 break;
11063
11064 case OP_RM_PG_UPMAP_ITEMS:
11065 {
11066 pending_inc.old_pg_upmap_items.insert(pgid);
11067 ss << "clear " << pgid << " pg_upmap_items mapping";
11068 }
11069 break;
11070
11071 default:
11072 ceph_abort_msg("invalid option");
11073 }
11074
11075 goto update;
11076 } else if (prefix == "osd primary-affinity") {
11077 int64_t id;
11078 if (!cmd_getval(cct, cmdmap, "id", id)) {
11079 ss << "invalid osd id value '"
11080 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11081 err = -EINVAL;
11082 goto reply;
11083 }
11084 double w;
11085 if (!cmd_getval(cct, cmdmap, "weight", w)) {
11086 ss << "unable to parse 'weight' value '"
11087 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11088 err = -EINVAL;
11089 goto reply;
11090 }
11091 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11092 if (ww < 0L) {
11093 ss << "weight must be >= 0";
11094 err = -EINVAL;
11095 goto reply;
11096 }
11097 if (osdmap.require_min_compat_client > 0 &&
11098 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
11099 ss << "require_min_compat_client "
11100 << ceph_release_name(osdmap.require_min_compat_client)
11101 << " < firefly, which is required for primary-affinity";
11102 err = -EPERM;
11103 goto reply;
11104 }
11105 if (osdmap.exists(id)) {
11106 pending_inc.new_primary_affinity[id] = ww;
11107 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11108 getline(ss, rs);
11109 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11110 get_last_committed() + 1));
11111 return true;
11112 } else {
11113 ss << "osd." << id << " does not exist";
11114 err = -ENOENT;
11115 goto reply;
11116 }
  } else if (prefix == "osd reweight") {
    // Set an OSD's override reweight. The user-supplied floating-point
    // weight is stored as an integer scaled by CEPH_OSD_IN.
    int64_t id;
    if (!cmd_getval(cct, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cct, cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      // Report both the user's value and the raw stored value (in hex).
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    }
11150 } else if (prefix == "osd reweightn") {
11151 map<int32_t, uint32_t> weights;
11152 err = parse_reweights(cct, cmdmap, osdmap, &weights);
11153 if (err) {
11154 ss << "unable to parse 'weights' value '"
11155 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
11156 goto reply;
11157 }
11158 pending_inc.new_weight.insert(weights.begin(), weights.end());
11159 wait_for_finished_proposal(
11160 op,
11161 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11162 return true;
  } else if (prefix == "osd lost") {
    // Mark a down OSD as lost, recorded at the epoch it went down. Requires
    // --yes-i-really-mean-it since this can mean permanent data loss.
    int64_t id;
    if (!cmd_getval(cct, cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    bool sure = false;
    cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "are you SURE? this might mean real, permanent data loss. pass "
            "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    } else if (!osdmap.is_down(id)) {
      // Only a down OSD can be declared lost.
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply;
    } else {
      // Record the loss at the epoch the OSD was marked down.
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    }
11195
11196 } else if (prefix == "osd destroy-actual" ||
11197 prefix == "osd purge-actual" ||
11198 prefix == "osd purge-new") {
11199 /* Destroying an OSD means that we don't expect to further make use of
11200 * the OSDs data (which may even become unreadable after this operation),
11201 * and that we are okay with scrubbing all its cephx keys and config-key
11202 * data (which may include lockbox keys, thus rendering the osd's data
11203 * unreadable).
11204 *
11205 * The OSD will not be removed. Instead, we will mark it as destroyed,
11206 * such that a subsequent call to `create` will not reuse the osd id.
11207 * This will play into being able to recreate the OSD, at the same
11208 * crush location, with minimal data movement.
11209 */
11210
11211 // make sure authmon is writeable.
11212 if (!mon->authmon()->is_writeable()) {
11213 dout(10) << __func__ << " waiting for auth mon to be writeable for "
11214 << "osd destroy" << dendl;
11215 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
11216 return false;
11217 }
11218
11219 int64_t id;
11220 if (!cmd_getval(cct, cmdmap, "id", id)) {
11221 auto p = cmdmap.find("id");
11222 if (p == cmdmap.end()) {
11223 ss << "no osd id specified";
11224 } else {
11225 ss << "unable to parse osd id value '"
11226 << cmd_vartype_stringify(cmdmap.at("id")) << "";
11227 }
11228 err = -EINVAL;
11229 goto reply;
11230 }
11231
11232 bool is_destroy = (prefix == "osd destroy-actual");
11233 if (!is_destroy) {
11234 ceph_assert("osd purge-actual" == prefix ||
11235 "osd purge-new" == prefix);
11236 }
11237
11238 bool sure = false;
11239 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
11240 if (!sure) {
11241 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
11242 << "This will mean real, permanent data loss, as well "
11243 << "as deletion of cephx and lockbox keys. "
11244 << "Pass --yes-i-really-mean-it if you really do.";
11245 err = -EPERM;
11246 goto reply;
11247 } else if (!osdmap.exists(id)) {
11248 ss << "osd." << id << " does not exist";
11249 err = 0; // idempotent
11250 goto reply;
11251 } else if (osdmap.is_up(id)) {
11252 ss << "osd." << id << " is not `down`.";
11253 err = -EBUSY;
11254 goto reply;
11255 } else if (is_destroy && osdmap.is_destroyed(id)) {
11256 ss << "destroyed osd." << id;
11257 err = 0;
11258 goto reply;
11259 }
11260
11261 if (prefix == "osd purge-new" &&
11262 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
11263 ss << "osd." << id << " is not new";
11264 err = -EPERM;
11265 goto reply;
11266 }
11267
11268 bool goto_reply = false;
11269
11270 paxos->plug();
11271 if (is_destroy) {
11272 err = prepare_command_osd_destroy(id, ss);
11273 // we checked above that it should exist.
11274 ceph_assert(err != -ENOENT);
11275 } else {
11276 err = prepare_command_osd_purge(id, ss);
11277 if (err == -ENOENT) {
11278 err = 0;
11279 ss << "osd." << id << " does not exist.";
11280 goto_reply = true;
11281 }
11282 }
11283 paxos->unplug();
11284
11285 if (err < 0 || goto_reply) {
11286 goto reply;
11287 }
11288
11289 if (is_destroy) {
11290 ss << "destroyed osd." << id;
11291 } else {
11292 ss << "purged osd." << id;
11293 }
11294
11295 getline(ss, rs);
11296 wait_for_finished_proposal(op,
11297 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
11298 force_immediate_propose();
11299 return true;
11300
11301 } else if (prefix == "osd new") {
11302
11303 // make sure authmon is writeable.
11304 if (!mon->authmon()->is_writeable()) {
11305 dout(10) << __func__ << " waiting for auth mon to be writeable for "
11306 << "osd new" << dendl;
11307 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
11308 return false;
11309 }
11310
11311 map<string,string> param_map;
11312
11313 bufferlist bl = m->get_data();
11314 string param_json = bl.to_str();
11315 dout(20) << __func__ << " osd new json = " << param_json << dendl;
11316
11317 err = get_json_str_map(param_json, ss, &param_map);
11318 if (err < 0)
11319 goto reply;
11320
11321 dout(20) << __func__ << " osd new params " << param_map << dendl;
11322
11323 paxos->plug();
11324 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
11325 paxos->unplug();
11326
11327 if (err < 0) {
11328 goto reply;
11329 }
11330
11331 if (f) {
11332 f->flush(rdata);
11333 } else {
11334 rdata.append(ss);
11335 }
11336
11337 if (err == EEXIST) {
11338 // idempotent operation
11339 err = 0;
11340 goto reply;
11341 }
11342
11343 wait_for_finished_proposal(op,
11344 new Monitor::C_Command(mon, op, 0, rs, rdata,
11345 get_last_committed() + 1));
11346 force_immediate_propose();
11347 return true;
11348
11349 } else if (prefix == "osd create") {
11350
11351 // optional id provided?
11352 int64_t id = -1, cmd_id = -1;
11353 if (cmd_getval(cct, cmdmap, "id", cmd_id)) {
11354 if (cmd_id < 0) {
11355 ss << "invalid osd id value '" << cmd_id << "'";
11356 err = -EINVAL;
11357 goto reply;
11358 }
11359 dout(10) << " osd create got id " << cmd_id << dendl;
11360 }
11361
11362 uuid_d uuid;
11363 string uuidstr;
11364 if (cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
11365 if (!uuid.parse(uuidstr.c_str())) {
11366 ss << "invalid uuid value '" << uuidstr << "'";
11367 err = -EINVAL;
11368 goto reply;
11369 }
11370 // we only care about the id if we also have the uuid, to
11371 // ensure the operation's idempotency.
11372 id = cmd_id;
11373 }
11374
11375 int32_t new_id = -1;
11376 err = prepare_command_osd_create(id, uuid, &new_id, ss);
11377 if (err < 0) {
11378 if (err == -EAGAIN) {
11379 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11380 return true;
11381 }
11382 // a check has failed; reply to the user.
11383 goto reply;
11384
11385 } else if (err == EEXIST) {
11386 // this is an idempotent operation; we can go ahead and reply.
11387 if (f) {
11388 f->open_object_section("created_osd");
11389 f->dump_int("osdid", new_id);
11390 f->close_section();
11391 f->flush(rdata);
11392 } else {
11393 ss << new_id;
11394 rdata.append(ss);
11395 }
11396 err = 0;
11397 goto reply;
11398 }
11399
11400 string empty_device_class;
11401 do_osd_create(id, uuid, empty_device_class, &new_id);
11402
11403 if (f) {
11404 f->open_object_section("created_osd");
11405 f->dump_int("osdid", new_id);
11406 f->close_section();
11407 f->flush(rdata);
11408 } else {
11409 ss << new_id;
11410 rdata.append(ss);
11411 }
11412 wait_for_finished_proposal(op,
11413 new Monitor::C_Command(mon, op, 0, rs, rdata,
11414 get_last_committed() + 1));
11415 return true;
11416
  } else if (prefix == "osd blacklist clear") {
    // Remove every blacklist entry: drop any pending additions, then queue
    // removal of each entry currently in the committed osdmap.
    pending_inc.new_blacklist.clear();
    std::list<std::pair<entity_addr_t,utime_t > > blacklist;
    osdmap.get_blacklist(&blacklist);
    for (const auto &entry : blacklist) {
      pending_inc.old_blacklist.push_back(entry.first);
    }
    ss << " removed all blacklist entries";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
11429 } else if (prefix == "osd blacklist") {
11430 string addrstr;
11431 cmd_getval(cct, cmdmap, "addr", addrstr);
11432 entity_addr_t addr;
11433 if (!addr.parse(addrstr.c_str(), 0)) {
11434 ss << "unable to parse address " << addrstr;
11435 err = -EINVAL;
11436 goto reply;
11437 }
11438 else {
11439 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
11440 // always blacklist type ANY
11441 addr.set_type(entity_addr_t::TYPE_ANY);
11442 } else {
11443 addr.set_type(entity_addr_t::TYPE_LEGACY);
11444 }
11445
11446 string blacklistop;
11447 cmd_getval(cct, cmdmap, "blacklistop", blacklistop);
11448 if (blacklistop == "add") {
11449 utime_t expires = ceph_clock_now();
11450 double d;
11451 // default one hour
11452 cmd_getval(cct, cmdmap, "expire", d,
11453 g_conf()->mon_osd_blacklist_default_expire);
11454 expires += d;
11455
11456 pending_inc.new_blacklist[addr] = expires;
11457
11458 {
11459 // cancel any pending un-blacklisting request too
11460 auto it = std::find(pending_inc.old_blacklist.begin(),
11461 pending_inc.old_blacklist.end(), addr);
11462 if (it != pending_inc.old_blacklist.end()) {
11463 pending_inc.old_blacklist.erase(it);
11464 }
11465 }
11466
11467 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
11468 getline(ss, rs);
11469 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11470 get_last_committed() + 1));
11471 return true;
11472 } else if (blacklistop == "rm") {
11473 if (osdmap.is_blacklisted(addr) ||
11474 pending_inc.new_blacklist.count(addr)) {
11475 if (osdmap.is_blacklisted(addr))
11476 pending_inc.old_blacklist.push_back(addr);
11477 else
11478 pending_inc.new_blacklist.erase(addr);
11479 ss << "un-blacklisting " << addr;
11480 getline(ss, rs);
11481 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11482 get_last_committed() + 1));
11483 return true;
11484 }
11485 ss << addr << " isn't blacklisted";
11486 err = 0;
11487 goto reply;
11488 }
11489 }
  } else if (prefix == "osd pool mksnap") {
    // Create a pool snapshot. Rejected for pools in unmanaged-snaps mode and
    // for cache-tier pools; creating an already-existing snap is idempotent.
    string poolstr;
    cmd_getval(cct, cmdmap, "pool", poolstr);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      err = -ENOENT;
      goto reply;
    }
    string snapname;
    cmd_getval(cct, cmdmap, "snap", snapname);
    const pg_pool_t *p = osdmap.get_pg_pool(pool);
    if (p->is_unmanaged_snaps_mode()) {
      ss << "pool " << poolstr << " is in unmanaged snaps mode";
      err = -EINVAL;
      goto reply;
    } else if (p->snap_exists(snapname.c_str())) {
      // Already present in the committed map: succeed without changes.
      ss << "pool " << poolstr << " snap " << snapname << " already exists";
      err = 0;
      goto reply;
    } else if (p->is_tier()) {
      ss << "pool " << poolstr << " is a cache tier";
      err = -EINVAL;
      goto reply;
    }
    // Operate on the pending copy of the pool if this proposal already has
    // one; otherwise seed a pending copy from the committed pool. (The
    // count() check avoids treating an operator[]-created default as a
    // pre-existing pending entry.)
    pg_pool_t *pp = 0;
    if (pending_inc.new_pools.count(pool))
      pp = &pending_inc.new_pools[pool];
    if (!pp) {
      pp = &pending_inc.new_pools[pool];
      *pp = *p;
    }
    // Re-check against the pending copy, which may already carry the snap
    // from an earlier command in this same proposal.
    if (pp->snap_exists(snapname.c_str())) {
      ss << "pool " << poolstr << " snap " << snapname << " already exists";
    } else {
      pp->add_snap(snapname.c_str(), ceph_clock_now());
      pp->set_snap_epoch(pending_inc.epoch);
      ss << "created pool " << poolstr << " snap " << snapname;
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
11533 } else if (prefix == "osd pool rmsnap") {
11534 string poolstr;
11535 cmd_getval(cct, cmdmap, "pool", poolstr);
11536 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
11537 if (pool < 0) {
11538 ss << "unrecognized pool '" << poolstr << "'";
11539 err = -ENOENT;
11540 goto reply;
11541 }
11542 string snapname;
11543 cmd_getval(cct, cmdmap, "snap", snapname);
11544 const pg_pool_t *p = osdmap.get_pg_pool(pool);
11545 if (p->is_unmanaged_snaps_mode()) {
11546 ss << "pool " << poolstr << " is in unmanaged snaps mode";
11547 err = -EINVAL;
11548 goto reply;
11549 } else if (!p->snap_exists(snapname.c_str())) {
11550 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
11551 err = 0;
11552 goto reply;
11553 }
11554 pg_pool_t *pp = 0;
11555 if (pending_inc.new_pools.count(pool))
11556 pp = &pending_inc.new_pools[pool];
11557 if (!pp) {
11558 pp = &pending_inc.new_pools[pool];
11559 *pp = *p;
11560 }
11561 snapid_t sn = pp->snap_exists(snapname.c_str());
11562 if (sn) {
11563 pp->remove_snap(sn);
11564 pp->set_snap_epoch(pending_inc.epoch);
11565 ss << "removed pool " << poolstr << " snap " << snapname;
11566 } else {
11567 ss << "already removed pool " << poolstr << " snap " << snapname;
11568 }
11569 getline(ss, rs);
11570 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11571 get_last_committed() + 1));
11572 return true;
11573 } else if (prefix == "osd pool create") {
11574 int64_t pg_num, pg_num_min;
11575 int64_t pgp_num;
11576 cmd_getval(cct, cmdmap, "pg_num", pg_num, int64_t(0));
11577 cmd_getval(cct, cmdmap, "pgp_num", pgp_num, pg_num);
11578 cmd_getval(cct, cmdmap, "pg_num_min", pg_num_min, int64_t(0));
11579
11580 string pool_type_str;
11581 cmd_getval(cct, cmdmap, "pool_type", pool_type_str);
11582 if (pool_type_str.empty())
11583 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
11584
11585 string poolstr;
11586 cmd_getval(cct, cmdmap, "pool", poolstr);
11587 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11588 if (pool_id >= 0) {
11589 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11590 if (pool_type_str != p->get_type_name()) {
11591 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
11592 err = -EINVAL;
11593 } else {
11594 ss << "pool '" << poolstr << "' already exists";
11595 err = 0;
11596 }
11597 goto reply;
11598 }
11599
11600 int pool_type;
11601 if (pool_type_str == "replicated") {
11602 pool_type = pg_pool_t::TYPE_REPLICATED;
11603 } else if (pool_type_str == "erasure") {
11604 pool_type = pg_pool_t::TYPE_ERASURE;
11605 } else {
11606 ss << "unknown pool type '" << pool_type_str << "'";
11607 err = -EINVAL;
11608 goto reply;
11609 }
11610
11611 bool implicit_rule_creation = false;
11612 int64_t expected_num_objects = 0;
11613 string rule_name;
11614 cmd_getval(cct, cmdmap, "rule", rule_name);
11615 string erasure_code_profile;
11616 cmd_getval(cct, cmdmap, "erasure_code_profile", erasure_code_profile);
11617
11618 if (pool_type == pg_pool_t::TYPE_ERASURE) {
11619 if (erasure_code_profile == "")
11620 erasure_code_profile = "default";
11621 //handle the erasure code profile
11622 if (erasure_code_profile == "default") {
11623 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
11624 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
11625 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
11626 goto wait;
11627 }
11628
11629 map<string,string> profile_map;
11630 err = osdmap.get_erasure_code_profile_default(cct,
11631 profile_map,
11632 &ss);
11633 if (err)
11634 goto reply;
11635 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
11636 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
11637 goto wait;
11638 }
11639 }
11640 if (rule_name == "") {
11641 implicit_rule_creation = true;
11642 if (erasure_code_profile == "default") {
11643 rule_name = "erasure-code";
11644 } else {
11645 dout(1) << "implicitly use rule named after the pool: "
11646 << poolstr << dendl;
11647 rule_name = poolstr;
11648 }
11649 }
11650 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
11651 expected_num_objects, int64_t(0));
11652 } else {
11653 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
11654 // and put expected_num_objects to rule field
11655 if (erasure_code_profile != "") { // cmd is from CLI
11656 if (rule_name != "") {
11657 string interr;
11658 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
11659 if (interr.length()) {
11660 ss << "error parsing integer value '" << rule_name << "': " << interr;
11661 err = -EINVAL;
11662 goto reply;
11663 }
11664 }
11665 rule_name = erasure_code_profile;
11666 } else { // cmd is well-formed
11667 cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
11668 expected_num_objects, int64_t(0));
11669 }
11670 }
11671
11672 if (!implicit_rule_creation && rule_name != "") {
11673 int rule;
11674 err = get_crush_rule(rule_name, &rule, &ss);
11675 if (err == -EAGAIN) {
11676 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11677 return true;
11678 }
11679 if (err)
11680 goto reply;
11681 }
11682
11683 if (expected_num_objects < 0) {
11684 ss << "'expected_num_objects' must be non-negative";
11685 err = -EINVAL;
11686 goto reply;
11687 }
11688
11689 if (expected_num_objects > 0 &&
11690 cct->_conf->osd_objectstore == "filestore" &&
11691 cct->_conf->filestore_merge_threshold > 0) {
11692 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
11693 err = -EINVAL;
11694 goto reply;
11695 }
11696
11697 if (expected_num_objects == 0 &&
11698 cct->_conf->osd_objectstore == "filestore" &&
11699 cct->_conf->filestore_merge_threshold < 0) {
11700 int osds = osdmap.get_num_osds();
11701 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
11702 ss << "For better initial performance on pools expected to store a "
11703 << "large number of objects, consider supplying the "
11704 << "expected_num_objects parameter when creating the pool.\n";
11705 }
11706 }
11707
11708 int64_t fast_read_param;
11709 cmd_getval(cct, cmdmap, "fast_read", fast_read_param, int64_t(-1));
11710 FastReadType fast_read = FAST_READ_DEFAULT;
11711 if (fast_read_param == 0)
11712 fast_read = FAST_READ_OFF;
11713 else if (fast_read_param > 0)
11714 fast_read = FAST_READ_ON;
11715
11716 int64_t repl_size = 0;
11717 cmd_getval(cct, cmdmap, "size", repl_size);
11718 int64_t target_size_bytes = 0;
11719 double target_size_ratio = 0.0;
11720 cmd_getval(cct, cmdmap, "target_size_bytes", target_size_bytes);
11721 cmd_getval(cct, cmdmap, "target_size_ratio", target_size_ratio);
11722
11723 err = prepare_new_pool(poolstr,
11724 -1, // default crush rule
11725 rule_name,
11726 pg_num, pgp_num, pg_num_min,
11727 repl_size, target_size_bytes, target_size_ratio,
11728 erasure_code_profile, pool_type,
11729 (uint64_t)expected_num_objects,
11730 fast_read,
11731 &ss);
11732 if (err < 0) {
11733 switch(err) {
11734 case -EEXIST:
11735 ss << "pool '" << poolstr << "' already exists";
11736 break;
11737 case -EAGAIN:
11738 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11739 return true;
11740 case -ERANGE:
11741 goto reply;
11742 default:
11743 goto reply;
11744 break;
11745 }
11746 } else {
11747 ss << "pool '" << poolstr << "' created";
11748 }
11749 getline(ss, rs);
11750 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11751 get_last_committed() + 1));
11752 return true;
11753
11754 } else if (prefix == "osd pool delete" ||
11755 prefix == "osd pool rm") {
11756 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
11757 string poolstr, poolstr2, sure;
11758 cmd_getval(cct, cmdmap, "pool", poolstr);
11759 cmd_getval(cct, cmdmap, "pool2", poolstr2);
11760 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
11761 if (pool < 0) {
11762 ss << "pool '" << poolstr << "' does not exist";
11763 err = 0;
11764 goto reply;
11765 }
11766
11767 bool force_no_fake = false;
11768 cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it", force_no_fake);
11769 bool force = false;
11770 cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it_not_faking", force);
11771 if (poolstr2 != poolstr ||
11772 (!force && !force_no_fake)) {
11773 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
11774 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
11775 << "followed by --yes-i-really-really-mean-it.";
11776 err = -EPERM;
11777 goto reply;
11778 }
11779 err = _prepare_remove_pool(pool, &ss, force_no_fake);
11780 if (err == -EAGAIN) {
11781 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11782 return true;
11783 }
11784 if (err < 0)
11785 goto reply;
11786 goto update;
11787 } else if (prefix == "osd pool rename") {
11788 string srcpoolstr, destpoolstr;
11789 cmd_getval(cct, cmdmap, "srcpool", srcpoolstr);
11790 cmd_getval(cct, cmdmap, "destpool", destpoolstr);
11791 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
11792 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
11793
11794 if (pool_src < 0) {
11795 if (pool_dst >= 0) {
11796 // src pool doesn't exist, dst pool does exist: to ensure idempotency
11797 // of operations, assume this rename succeeded, as it is not changing
11798 // the current state. Make sure we output something understandable
11799 // for whoever is issuing the command, if they are paying attention,
11800 // in case it was not intentional; or to avoid a "wtf?" and a bug
11801 // report in case it was intentional, while expecting a failure.
11802 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
11803 << destpoolstr << "' does -- assuming successful rename";
11804 err = 0;
11805 } else {
11806 ss << "unrecognized pool '" << srcpoolstr << "'";
11807 err = -ENOENT;
11808 }
11809 goto reply;
11810 } else if (pool_dst >= 0) {
11811 // source pool exists and so does the destination pool
11812 ss << "pool '" << destpoolstr << "' already exists";
11813 err = -EEXIST;
11814 goto reply;
11815 }
11816
11817 int ret = _prepare_rename_pool(pool_src, destpoolstr);
11818 if (ret == 0) {
11819 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
11820 } else {
11821 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
11822 << cpp_strerror(ret);
11823 }
11824 getline(ss, rs);
11825 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
11826 get_last_committed() + 1));
11827 return true;
11828
11829 } else if (prefix == "osd pool set") {
11830 err = prepare_command_pool_set(cmdmap, ss);
11831 if (err == -EAGAIN)
11832 goto wait;
11833 if (err < 0)
11834 goto reply;
11835
11836 getline(ss, rs);
11837 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11838 get_last_committed() + 1));
11839 return true;
11840 } else if (prefix == "osd tier add") {
11841 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11842 if (err == -EAGAIN)
11843 goto wait;
11844 if (err)
11845 goto reply;
11846 string poolstr;
11847 cmd_getval(cct, cmdmap, "pool", poolstr);
11848 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11849 if (pool_id < 0) {
11850 ss << "unrecognized pool '" << poolstr << "'";
11851 err = -ENOENT;
11852 goto reply;
11853 }
11854 string tierpoolstr;
11855 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11856 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11857 if (tierpool_id < 0) {
11858 ss << "unrecognized pool '" << tierpoolstr << "'";
11859 err = -ENOENT;
11860 goto reply;
11861 }
11862 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11863 ceph_assert(p);
11864 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11865 ceph_assert(tp);
11866
11867 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11868 goto reply;
11869 }
11870
11871 // make sure new tier is empty
11872 string force_nonempty;
11873 cmd_getval(cct, cmdmap, "force_nonempty", force_nonempty);
11874 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
11875 if (pstats && pstats->stats.sum.num_objects != 0 &&
11876 force_nonempty != "--force-nonempty") {
11877 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
11878 err = -ENOTEMPTY;
11879 goto reply;
11880 }
11881 if (tp->is_erasure()) {
11882 ss << "tier pool '" << tierpoolstr
11883 << "' is an ec pool, which cannot be a tier";
11884 err = -ENOTSUP;
11885 goto reply;
11886 }
11887 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
11888 ((force_nonempty != "--force-nonempty") ||
11889 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
11890 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
11891 err = -ENOTEMPTY;
11892 goto reply;
11893 }
11894 // go
11895 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11896 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11897 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11898 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11899 return true;
11900 }
11901 np->tiers.insert(tierpool_id);
11902 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11903 ntp->tier_of = pool_id;
11904 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
11905 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11906 get_last_committed() + 1));
11907 return true;
11908 } else if (prefix == "osd tier remove" ||
11909 prefix == "osd tier rm") {
11910 string poolstr;
11911 cmd_getval(cct, cmdmap, "pool", poolstr);
11912 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11913 if (pool_id < 0) {
11914 ss << "unrecognized pool '" << poolstr << "'";
11915 err = -ENOENT;
11916 goto reply;
11917 }
11918 string tierpoolstr;
11919 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11920 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11921 if (tierpool_id < 0) {
11922 ss << "unrecognized pool '" << tierpoolstr << "'";
11923 err = -ENOENT;
11924 goto reply;
11925 }
11926 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11927 ceph_assert(p);
11928 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11929 ceph_assert(tp);
11930
11931 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
11932 goto reply;
11933 }
11934
11935 if (p->tiers.count(tierpool_id) == 0) {
11936 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11937 err = 0;
11938 goto reply;
11939 }
11940 if (tp->tier_of != pool_id) {
11941 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
11942 << osdmap.get_pool_name(tp->tier_of) << "': "
11943 // be scary about it; this is an inconsistency and bells must go off
11944 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11945 err = -EINVAL;
11946 goto reply;
11947 }
11948 if (p->read_tier == tierpool_id) {
11949 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
11950 err = -EBUSY;
11951 goto reply;
11952 }
11953 // go
11954 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11955 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11956 if (np->tiers.count(tierpool_id) == 0 ||
11957 ntp->tier_of != pool_id ||
11958 np->read_tier == tierpool_id) {
11959 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11960 return true;
11961 }
11962 np->tiers.erase(tierpool_id);
11963 ntp->clear_tier();
11964 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11965 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11966 get_last_committed() + 1));
11967 return true;
11968 } else if (prefix == "osd tier set-overlay") {
11969 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11970 if (err == -EAGAIN)
11971 goto wait;
11972 if (err)
11973 goto reply;
11974 string poolstr;
11975 cmd_getval(cct, cmdmap, "pool", poolstr);
11976 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11977 if (pool_id < 0) {
11978 ss << "unrecognized pool '" << poolstr << "'";
11979 err = -ENOENT;
11980 goto reply;
11981 }
11982 string overlaypoolstr;
11983 cmd_getval(cct, cmdmap, "overlaypool", overlaypoolstr);
11984 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
11985 if (overlaypool_id < 0) {
11986 ss << "unrecognized pool '" << overlaypoolstr << "'";
11987 err = -ENOENT;
11988 goto reply;
11989 }
11990 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11991 ceph_assert(p);
11992 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11993 ceph_assert(overlay_p);
11994 if (p->tiers.count(overlaypool_id) == 0) {
11995 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
11996 err = -EINVAL;
11997 goto reply;
11998 }
11999 if (p->read_tier == overlaypool_id) {
12000 err = 0;
12001 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12002 goto reply;
12003 }
12004 if (p->has_read_tier()) {
12005 ss << "pool '" << poolstr << "' has overlay '"
12006 << osdmap.get_pool_name(p->read_tier)
12007 << "'; please remove-overlay first";
12008 err = -EINVAL;
12009 goto reply;
12010 }
12011
12012 // go
12013 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12014 np->read_tier = overlaypool_id;
12015 np->write_tier = overlaypool_id;
12016 np->set_last_force_op_resend(pending_inc.epoch);
12017 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12018 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12019 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12020 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12021 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12023 get_last_committed() + 1));
12024 return true;
12025 } else if (prefix == "osd tier remove-overlay" ||
12026 prefix == "osd tier rm-overlay") {
12027 string poolstr;
12028 cmd_getval(cct, cmdmap, "pool", poolstr);
12029 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12030 if (pool_id < 0) {
12031 ss << "unrecognized pool '" << poolstr << "'";
12032 err = -ENOENT;
12033 goto reply;
12034 }
12035 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12036 ceph_assert(p);
12037 if (!p->has_read_tier()) {
12038 err = 0;
12039 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12040 goto reply;
12041 }
12042
12043 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12044 goto reply;
12045 }
12046
12047 // go
12048 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12049 if (np->has_read_tier()) {
12050 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12051 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12052 nop->set_last_force_op_resend(pending_inc.epoch);
12053 }
12054 if (np->has_write_tier()) {
12055 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12056 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12057 nop->set_last_force_op_resend(pending_inc.epoch);
12058 }
12059 np->clear_read_tier();
12060 np->clear_write_tier();
12061 np->set_last_force_op_resend(pending_inc.epoch);
12062 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12063 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12064 get_last_committed() + 1));
12065 return true;
12066 } else if (prefix == "osd tier cache-mode") {
12067 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12068 if (err == -EAGAIN)
12069 goto wait;
12070 if (err)
12071 goto reply;
12072 string poolstr;
12073 cmd_getval(cct, cmdmap, "pool", poolstr);
12074 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12075 if (pool_id < 0) {
12076 ss << "unrecognized pool '" << poolstr << "'";
12077 err = -ENOENT;
12078 goto reply;
12079 }
12080 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12081 ceph_assert(p);
12082 if (!p->is_tier()) {
12083 ss << "pool '" << poolstr << "' is not a tier";
12084 err = -EINVAL;
12085 goto reply;
12086 }
12087 string modestr;
12088 cmd_getval(cct, cmdmap, "mode", modestr);
12089 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12090 if (mode < 0) {
12091 ss << "'" << modestr << "' is not a valid cache mode";
12092 err = -EINVAL;
12093 goto reply;
12094 }
12095
12096 bool sure = false;
12097 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12098
12099 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12100 mode != pg_pool_t::CACHEMODE_NONE &&
12101 mode != pg_pool_t::CACHEMODE_PROXY &&
12102 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12103 !sure) {
12104 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12105 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12106 err = -EPERM;
12107 goto reply;
12108 }
12109
12110 // pool already has this cache-mode set and there are no pending changes
12111 if (p->cache_mode == mode &&
12112 (pending_inc.new_pools.count(pool_id) == 0 ||
12113 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12114 ss << "set cache-mode for pool '" << poolstr << "'"
12115 << " to " << pg_pool_t::get_cache_mode_name(mode);
12116 err = 0;
12117 goto reply;
12118 }
12119
12120 /* Mode description:
12121 *
12122 * none: No cache-mode defined
12123 * forward: Forward all reads and writes to base pool
12124 * writeback: Cache writes, promote reads from base pool
12125 * readonly: Forward writes to base pool
12126 * readforward: Writes are in writeback mode, Reads are in forward mode
12127 * proxy: Proxy all reads and writes to base pool
12128 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12129 *
12130 * Hence, these are the allowed transitions:
12131 *
12132 * none -> any
12133 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12134 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12135 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12136 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12137 * writeback -> readforward || readproxy || forward || proxy
12138 * readonly -> any
12139 */
12140
12141 // We check if the transition is valid against the current pool mode, as
12142 // it is the only committed state thus far. We will blantly squash
12143 // whatever mode is on the pending state.
12144
12145 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12146 (mode != pg_pool_t::CACHEMODE_FORWARD &&
12147 mode != pg_pool_t::CACHEMODE_PROXY &&
12148 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12149 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12150 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12151 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12152 << "' pool; only '"
12153 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
12154 << "','"
12155 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12156 << "','"
12157 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
12158 << "','"
12159 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12160 << "' allowed.";
12161 err = -EINVAL;
12162 goto reply;
12163 }
12164 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12165 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12166 mode != pg_pool_t::CACHEMODE_FORWARD &&
12167 mode != pg_pool_t::CACHEMODE_PROXY &&
12168 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12169
12170 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12171 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12172 mode != pg_pool_t::CACHEMODE_FORWARD &&
12173 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12174 mode != pg_pool_t::CACHEMODE_PROXY)) ||
12175
12176 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
12177 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12178 mode != pg_pool_t::CACHEMODE_FORWARD &&
12179 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12180 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12181
12182 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
12183 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12184 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12185 mode != pg_pool_t::CACHEMODE_PROXY &&
12186 mode != pg_pool_t::CACHEMODE_READPROXY))) {
12187
12188 const pool_stat_t* pstats =
12189 mon->mgrstatmon()->get_pool_stat(pool_id);
12190
12191 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
12192 ss << "unable to set cache-mode '"
12193 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
12194 << "': dirty objects found";
12195 err = -EBUSY;
12196 goto reply;
12197 }
12198 }
12199 // go
12200 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12201 np->cache_mode = mode;
12202 // set this both when moving to and from cache_mode NONE. this is to
12203 // capture legacy pools that were set up before this flag existed.
12204 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
12205 ss << "set cache-mode for pool '" << poolstr
12206 << "' to " << pg_pool_t::get_cache_mode_name(mode);
12207 if (mode == pg_pool_t::CACHEMODE_NONE) {
12208 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
12209 ceph_assert(base_pool);
12210 if (base_pool->read_tier == pool_id ||
12211 base_pool->write_tier == pool_id)
12212 ss <<" (WARNING: pool is still configured as read or write tier)";
12213 }
12214 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12215 get_last_committed() + 1));
12216 return true;
12217 } else if (prefix == "osd tier add-cache") {
12218 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12219 if (err == -EAGAIN)
12220 goto wait;
12221 if (err)
12222 goto reply;
12223 string poolstr;
12224 cmd_getval(cct, cmdmap, "pool", poolstr);
12225 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12226 if (pool_id < 0) {
12227 ss << "unrecognized pool '" << poolstr << "'";
12228 err = -ENOENT;
12229 goto reply;
12230 }
12231 string tierpoolstr;
12232 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
12233 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12234 if (tierpool_id < 0) {
12235 ss << "unrecognized pool '" << tierpoolstr << "'";
12236 err = -ENOENT;
12237 goto reply;
12238 }
12239 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12240 ceph_assert(p);
12241 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12242 ceph_assert(tp);
12243
12244 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12245 goto reply;
12246 }
12247
12248 int64_t size = 0;
12249 if (!cmd_getval(cct, cmdmap, "size", size)) {
12250 ss << "unable to parse 'size' value '"
12251 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
12252 err = -EINVAL;
12253 goto reply;
12254 }
12255 // make sure new tier is empty
12256 const pool_stat_t *pstats =
12257 mon->mgrstatmon()->get_pool_stat(tierpool_id);
12258 if (pstats && pstats->stats.sum.num_objects != 0) {
12259 ss << "tier pool '" << tierpoolstr << "' is not empty";
12260 err = -ENOTEMPTY;
12261 goto reply;
12262 }
12263 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
12264 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12265 if (mode < 0) {
12266 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
12267 err = -EINVAL;
12268 goto reply;
12269 }
12270 HitSet::Params hsp;
12271 auto& cache_hit_set_type =
12272 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
12273 if (cache_hit_set_type == "bloom") {
12274 BloomHitSet::Params *bsp = new BloomHitSet::Params;
12275 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
12276 hsp = HitSet::Params(bsp);
12277 } else if (cache_hit_set_type == "explicit_hash") {
12278 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
12279 } else if (cache_hit_set_type == "explicit_object") {
12280 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
12281 } else {
12282 ss << "osd tier cache default hit set type '"
12283 << cache_hit_set_type << "' is not a known type";
12284 err = -EINVAL;
12285 goto reply;
12286 }
12287 // go
12288 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12289 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12290 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12291 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12292 return true;
12293 }
12294 np->tiers.insert(tierpool_id);
12295 np->read_tier = np->write_tier = tierpool_id;
12296 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12297 np->set_last_force_op_resend(pending_inc.epoch);
12298 ntp->set_last_force_op_resend(pending_inc.epoch);
12299 ntp->tier_of = pool_id;
12300 ntp->cache_mode = mode;
12301 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
12302 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
12303 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12304 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12305 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12306 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
12307 ntp->hit_set_params = hsp;
12308 ntp->target_max_bytes = size;
12309 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
12310 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12311 get_last_committed() + 1));
12312 return true;
12313 } else if (prefix == "osd pool set-quota") {
12314 string poolstr;
12315 cmd_getval(cct, cmdmap, "pool", poolstr);
12316 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12317 if (pool_id < 0) {
12318 ss << "unrecognized pool '" << poolstr << "'";
12319 err = -ENOENT;
12320 goto reply;
12321 }
12322
12323 string field;
12324 cmd_getval(cct, cmdmap, "field", field);
12325 if (field != "max_objects" && field != "max_bytes") {
12326 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
12327 err = -EINVAL;
12328 goto reply;
12329 }
12330
12331 // val could contain unit designations, so we treat as a string
12332 string val;
12333 cmd_getval(cct, cmdmap, "val", val);
12334 string tss;
12335 int64_t value;
12336 if (field == "max_objects") {
12337 value = strict_sistrtoll(val.c_str(), &tss);
12338 } else if (field == "max_bytes") {
12339 value = strict_iecstrtoll(val.c_str(), &tss);
12340 } else {
12341 ceph_abort_msg("unrecognized option");
12342 }
12343 if (!tss.empty()) {
12344 ss << "error parsing value '" << val << "': " << tss;
12345 err = -EINVAL;
12346 goto reply;
12347 }
12348
12349 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
12350 if (field == "max_objects") {
12351 pi->quota_max_objects = value;
12352 } else if (field == "max_bytes") {
12353 pi->quota_max_bytes = value;
12354 } else {
12355 ceph_abort_msg("unrecognized option");
12356 }
12357 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
12358 rs = ss.str();
12359 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12360 get_last_committed() + 1));
12361 return true;
12362 } else if (prefix == "osd pool application enable" ||
12363 prefix == "osd pool application disable" ||
12364 prefix == "osd pool application set" ||
12365 prefix == "osd pool application rm") {
12366 err = prepare_command_pool_application(prefix, cmdmap, ss);
12367 if (err == -EAGAIN) {
12368 goto wait;
12369 } else if (err < 0) {
12370 goto reply;
12371 } else {
12372 goto update;
12373 }
12374 } else if (prefix == "osd force-create-pg") {
12375 pg_t pgid;
12376 string pgidstr;
12377 cmd_getval(cct, cmdmap, "pgid", pgidstr);
12378 if (!pgid.parse(pgidstr.c_str())) {
12379 ss << "invalid pgid '" << pgidstr << "'";
12380 err = -EINVAL;
12381 goto reply;
12382 }
12383 if (!osdmap.pg_exists(pgid)) {
12384 ss << "pg " << pgid << " should not exist";
12385 err = -ENOENT;
12386 goto reply;
12387 }
12388 bool sure = false;
12389 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12390 if (!sure) {
12391 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
12392 << "that the cluster will give up ever trying to recover the lost data. Do this "
12393 << "only if you are certain that all copies of the PG are in fact lost and you are "
12394 << "willing to accept that the data is permanently destroyed. Pass "
12395 << "--yes-i-really-mean-it to proceed.";
12396 err = -EPERM;
12397 goto reply;
12398 }
12399 bool creating_now;
12400 {
12401 std::lock_guard<std::mutex> l(creating_pgs_lock);
12402 auto emplaced = creating_pgs.pgs.emplace(pgid,
12403 make_pair(osdmap.get_epoch(),
12404 ceph_clock_now()));
12405 creating_now = emplaced.second;
12406 }
12407 if (creating_now) {
12408 ss << "pg " << pgidstr << " now creating, ok";
12409 // set the pool's CREATING flag so that (1) the osd won't ignore our
12410 // create message and (2) we won't propose any future pg_num changes
12411 // until after the PG has been instantiated.
12412 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
12413 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
12414 }
12415 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
12416 err = 0;
12417 goto update;
12418 } else {
12419 ss << "pg " << pgid << " already creating";
12420 err = 0;
12421 goto reply;
12422 }
12423 } else {
12424 err = -EINVAL;
12425 }
12426
12427 reply:
12428 getline(ss, rs);
12429 if (err < 0 && rs.length() == 0)
12430 rs = cpp_strerror(err);
12431 mon->reply_command(op, err, rs, rdata, get_last_committed());
12432 return ret;
12433
12434 update:
12435 getline(ss, rs);
12436 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12437 get_last_committed() + 1));
12438 return true;
12439
12440 wait:
12441 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12442 return true;
12443 }
12444
12445 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
12446 {
12447 op->mark_osdmon_event(__func__);
12448
12449 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12450 MonSession *session = op->get_session();
12451 if (!session) {
12452 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12453 return true;
12454 }
12455
12456 switch (m->op) {
12457 case POOL_OP_CREATE_UNMANAGED_SNAP:
12458 case POOL_OP_DELETE_UNMANAGED_SNAP:
12459 {
12460 const std::string* pool_name = nullptr;
12461 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
12462 if (pg_pool != nullptr) {
12463 pool_name = &osdmap.get_pool_name(m->pool);
12464 }
12465
12466 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
12467 session->entity_name, session->caps,
12468 session->get_peer_socket_addr(),
12469 pool_name)) {
12470 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
12471 << "privileges. message: " << *m << std::endl
12472 << "caps: " << session->caps << dendl;
12473 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12474 return true;
12475 }
12476 }
12477 break;
12478 default:
12479 if (!session->is_capable("osd", MON_CAP_W)) {
12480 dout(0) << "got pool op from entity with insufficient privileges. "
12481 << "message: " << *m << std::endl
12482 << "caps: " << session->caps << dendl;
12483 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12484 return true;
12485 }
12486 break;
12487 }
12488
12489 return false;
12490 }
12491
12492 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
12493 {
12494 op->mark_osdmon_event(__func__);
12495 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12496
12497 if (enforce_pool_op_caps(op)) {
12498 return true;
12499 }
12500
12501 if (m->fsid != mon->monmap->fsid) {
12502 dout(0) << __func__ << " drop message on fsid " << m->fsid
12503 << " != " << mon->monmap->fsid << " for " << *m << dendl;
12504 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12505 return true;
12506 }
12507
12508 if (m->op == POOL_OP_CREATE)
12509 return preprocess_pool_op_create(op);
12510
12511 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
12512 if (p == nullptr) {
12513 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
12514 if (m->op == POOL_OP_DELETE) {
12515 _pool_op_reply(op, 0, osdmap.get_epoch());
12516 } else {
12517 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
12518 }
12519 return true;
12520 }
12521
12522 // check if the snap and snapname exist
12523 bool snap_exists = false;
12524 if (p->snap_exists(m->name.c_str()))
12525 snap_exists = true;
12526
12527 switch (m->op) {
12528 case POOL_OP_CREATE_SNAP:
12529 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
12530 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12531 return true;
12532 }
12533 if (snap_exists) {
12534 _pool_op_reply(op, 0, osdmap.get_epoch());
12535 return true;
12536 }
12537 return false;
12538 case POOL_OP_CREATE_UNMANAGED_SNAP:
12539 if (p->is_pool_snaps_mode()) {
12540 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12541 return true;
12542 }
12543 return false;
12544 case POOL_OP_DELETE_SNAP:
12545 if (p->is_unmanaged_snaps_mode()) {
12546 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12547 return true;
12548 }
12549 if (!snap_exists) {
12550 _pool_op_reply(op, 0, osdmap.get_epoch());
12551 return true;
12552 }
12553 return false;
12554 case POOL_OP_DELETE_UNMANAGED_SNAP:
12555 if (p->is_pool_snaps_mode()) {
12556 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12557 return true;
12558 }
12559 if (p->is_removed_snap(m->snapid)) {
12560 _pool_op_reply(op, 0, osdmap.get_epoch());
12561 return true;
12562 }
12563 return false;
12564 case POOL_OP_DELETE:
12565 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
12566 _pool_op_reply(op, 0, osdmap.get_epoch());
12567 return true;
12568 }
12569 return false;
12570 case POOL_OP_AUID_CHANGE:
12571 return false;
12572 default:
12573 ceph_abort();
12574 break;
12575 }
12576
12577 return false;
12578 }
12579
12580 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
12581 {
12582 op->mark_osdmon_event(__func__);
12583 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12584 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
12585 if (pool >= 0) {
12586 _pool_op_reply(op, 0, osdmap.get_epoch());
12587 return true;
12588 }
12589
12590 return false;
12591 }
12592
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  // Handle a pool op that mutates the map (snapshot create/delete, etc.).
  // Returns true if a proposal was queued (the reply is sent once it
  // commits), false if we replied immediately without changing anything.
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First validate against the *committed* map so idempotent or invalid
  // requests can be answered right away, without waiting for a proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // Pool snapshots are not allowed on cache tiers.
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      // Idempotency: creating an already-existing snap, or deleting a
      // non-existent one, is a successful no-op.
      bool snap_exists = pool->snap_exists(m->name.c_str());
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
	|| (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;  // a real map change is needed; fall out of the switch
      }
    } else {
      // Pool snaps are mutually exclusive with unmanaged snaps mode.
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: the committed pool overlaid with any changes
  // already queued in this pending increment.
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked against the projected state, which may differ from the
  // committed map validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Apply the op to the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;
      pp.add_unmanaged_snap(snapid);  // fills in the newly issued snapid
      encode(snapid, reply_data);     // returned to the client in the reply
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
	// snapid was never issued for this pool; nothing to delete
	_pool_op_reply(op, -ENOENT, osdmap.get_epoch());
	return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support has been removed; reject
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
12740
12741 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
12742 {
12743 op->mark_osdmon_event(__func__);
12744 int err = prepare_new_pool(op);
12745 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
12746 return true;
12747 }
12748
12749 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
12750 ostream *ss)
12751 {
12752 const string& poolstr = osdmap.get_pool_name(pool_id);
12753
12754 // If the Pool is in use by CephFS, refuse to delete it
12755 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12756 if (pending_fsmap.pool_in_use(pool_id)) {
12757 *ss << "pool '" << poolstr << "' is in use by CephFS";
12758 return -EBUSY;
12759 }
12760
12761 if (pool.tier_of >= 0) {
12762 *ss << "pool '" << poolstr << "' is a tier of '"
12763 << osdmap.get_pool_name(pool.tier_of) << "'";
12764 return -EBUSY;
12765 }
12766 if (!pool.tiers.empty()) {
12767 *ss << "pool '" << poolstr << "' has tiers";
12768 for(auto tier : pool.tiers) {
12769 *ss << " " << osdmap.get_pool_name(tier);
12770 }
12771 return -EBUSY;
12772 }
12773
12774 if (!g_conf()->mon_allow_pool_delete) {
12775 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12776 return -EPERM;
12777 }
12778
12779 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
12780 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
12781 return -EPERM;
12782 }
12783
12784 *ss << "pool '" << poolstr << "' removed";
12785 return 0;
12786 }
12787
12788 /**
12789 * Check if it is safe to add a tier to a base pool
12790 *
12791 * @return
12792 * True if the operation should proceed, false if we should abort here
12793 * (abort doesn't necessarily mean error, could be idempotency)
12794 */
12795 bool OSDMonitor::_check_become_tier(
12796 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
12797 const int64_t base_pool_id, const pg_pool_t *base_pool,
12798 int *err,
12799 ostream *ss) const
12800 {
12801 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
12802 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12803
12804 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12805 if (pending_fsmap.pool_in_use(tier_pool_id)) {
12806 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
12807 *err = -EBUSY;
12808 return false;
12809 }
12810
12811 if (base_pool->tiers.count(tier_pool_id)) {
12812 ceph_assert(tier_pool->tier_of == base_pool_id);
12813 *err = 0;
12814 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
12815 << base_pool_name << "'";
12816 return false;
12817 }
12818
12819 if (base_pool->is_tier()) {
12820 *ss << "pool '" << base_pool_name << "' is already a tier of '"
12821 << osdmap.get_pool_name(base_pool->tier_of) << "', "
12822 << "multiple tiers are not yet supported.";
12823 *err = -EINVAL;
12824 return false;
12825 }
12826
12827 if (tier_pool->has_tiers()) {
12828 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
12829 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
12830 it != tier_pool->tiers.end(); ++it)
12831 *ss << "'" << osdmap.get_pool_name(*it) << "',";
12832 *ss << " multiple tiers are not yet supported.";
12833 *err = -EINVAL;
12834 return false;
12835 }
12836
12837 if (tier_pool->is_tier()) {
12838 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
12839 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
12840 *err = -EINVAL;
12841 return false;
12842 }
12843
12844 *err = 0;
12845 return true;
12846 }
12847
12848
12849 /**
12850 * Check if it is safe to remove a tier from this base pool
12851 *
12852 * @return
12853 * True if the operation should proceed, false if we should abort here
12854 * (abort doesn't necessarily mean error, could be idempotency)
12855 */
12856 bool OSDMonitor::_check_remove_tier(
12857 const int64_t base_pool_id, const pg_pool_t *base_pool,
12858 const pg_pool_t *tier_pool,
12859 int *err, ostream *ss) const
12860 {
12861 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12862
12863 // Apply CephFS-specific checks
12864 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12865 if (pending_fsmap.pool_in_use(base_pool_id)) {
12866 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
12867 // If the underlying pool is erasure coded and does not allow EC
12868 // overwrites, we can't permit the removal of the replicated tier that
12869 // CephFS relies on to access it
12870 *ss << "pool '" << base_pool_name <<
12871 "' does not allow EC overwrites and is in use by CephFS"
12872 " via its tier";
12873 *err = -EBUSY;
12874 return false;
12875 }
12876
12877 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
12878 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
12879 "tier is still in use as a writeback cache. Change the cache "
12880 "mode and flush the cache before removing it";
12881 *err = -EBUSY;
12882 return false;
12883 }
12884 }
12885
12886 *err = 0;
12887 return true;
12888 }
12889
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  // Queue deletion of a pool in the pending incremental map, after
  // verifying it is safe to delete, and clean up all auxiliary per-pg
  // state that references it (pg_temp, primary_temp, upmaps, crush
  // choose_args).  Returns 0 on success (including the already-pending
  // and fake-delete cases) or a negative errno.
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  // Also re-check against the projected (pending) version of the pool,
  // if one exists.
  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // Idempotency: removal is already queued in this increment.
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // "Fake" deletion: rename the pool out of the way instead of destroying
  // any data, so an accidental delete remains recoverable.
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool (committed map)
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap "
	       << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool (not yet committed)
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap.erase(it);
      } else {
	it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool (committed map)
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap_items " << p.first
	       << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap_items "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
	it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
13005
13006 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13007 {
13008 dout(10) << "_prepare_rename_pool " << pool << dendl;
13009 if (pending_inc.old_pools.count(pool)) {
13010 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13011 return -ENOENT;
13012 }
13013 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13014 p != pending_inc.new_pool_names.end();
13015 ++p) {
13016 if (p->second == newname && p->first != pool) {
13017 return -EEXIST;
13018 }
13019 }
13020
13021 pending_inc.new_pool_names[pool] = newname;
13022 return 0;
13023 }
13024
13025 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13026 {
13027 op->mark_osdmon_event(__func__);
13028 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13029 ostringstream ss;
13030 int ret = _prepare_remove_pool(m->pool, &ss, false);
13031 if (ret == -EAGAIN) {
13032 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13033 return true;
13034 }
13035 if (ret < 0)
13036 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13037 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13038 pending_inc.epoch));
13039 return true;
13040 }
13041
13042 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13043 int ret, epoch_t epoch, bufferlist *blp)
13044 {
13045 op->mark_osdmon_event(__func__);
13046 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13047 dout(20) << "_pool_op_reply " << ret << dendl;
13048 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13049 ret, epoch, get_last_committed(), blp);
13050 mon->send_reply(op, reply);
13051 }
13052
void OSDMonitor::convert_pool_priorities(void)
{
  // Rescale per-pool "recovery_priority" values so they all fit inside
  // [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX], preserving each
  // pool's priority relative to the most extreme value in use.
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // First pass: find the extreme priorities currently set on any pool.
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // Second pass: scale each out-of-range priority proportionally.
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    int64_t prio = 0;
    pool.opts.get(key, &prio);  // prio stays 0 if the option is unset
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // In-range (or unset) priority needs no adjustment.
      continue;
    }
    if (n == 0) {
      // A priority that scales to zero is equivalent to the default;
      // drop the option entirely.
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}