// Source: ceph/src/mon/OSDMonitor.cc (ceph nautilus 14.2.2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDFull.h"
43 #include "messages/MOSDMap.h"
44 #include "messages/MMonGetOSDMap.h"
45 #include "messages/MOSDBoot.h"
46 #include "messages/MOSDAlive.h"
47 #include "messages/MPoolOp.h"
48 #include "messages/MPoolOpReply.h"
49 #include "messages/MOSDPGCreate.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MOSDScrub.h"
57 #include "messages/MRoute.h"
58
59 #include "common/TextTable.h"
60 #include "common/Timer.h"
61 #include "common/ceph_argparse.h"
62 #include "common/perf_counters.h"
63 #include "common/strtol.h"
64 #include "common/numa.h"
65
66 #include "common/config.h"
67 #include "common/errno.h"
68
69 #include "erasure-code/ErasureCodePlugin.h"
70 #include "compressor/Compressor.h"
71 #include "common/Checksummer.h"
72
73 #include "include/compat.h"
74 #include "include/ceph_assert.h"
75 #include "include/stringify.h"
76 #include "include/util.h"
77 #include "common/cmdparse.h"
78 #include "include/str_list.h"
79 #include "include/str_map.h"
80 #include "include/scope_guard.h"
81
82 #include "auth/cephx/CephxKeyServer.h"
83 #include "osd/OSDCap.h"
84
85 #include "json_spirit/json_spirit_reader.h"
86
87 #include <boost/algorithm/string/predicate.hpp>
88
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes owned by this service (advertised via
// get_store_prefixes()): in-progress pg-create state, per-OSD metadata,
// and removed-snapshot records.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
93
94 namespace {
95
96 const uint32_t MAX_POOL_APPLICATIONS = 4;
97 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
98 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
99
100 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
101 // Note: this doesn't include support for the application tag match
102 if ((grant.spec.allow & OSD_CAP_W) != 0) {
103 auto& match = grant.match;
104 if (match.is_match_all()) {
105 return true;
106 } else if (pool_name != nullptr &&
107 !match.pool_namespace.pool_name.empty() &&
108 match.pool_namespace.pool_name == *pool_name) {
109 return true;
110 }
111 }
112 return false;
113 }
114
// Decide whether 'entity_name' may perform unmanaged-snapshot pool ops.
// Two independent paths grant permission:
//  1. a mon cap covering the pseudo-command "osd pool op unmanaged-snap"
//     (restricted to the pool when 'pool_name' is non-null; a null
//     pool_name means the pool does not exist, so an unrestricted cap is
//     required), or
//  2. OSD caps from the auth db granting write access to the pool (or to
//     all pools).
// Returns false on any failure to locate, decode, or parse the caps.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, CEPH_ENTITY_TYPE_MON,
        entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
           CommandArgs{} /* pool DNE, require unrestricted cap */ :
           CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // mon caps did not allow it; fall back to the entity's OSD caps
  // stored in the auth database.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  // check each grant, expanding profile grants into their constituent
  // grants before testing writability
  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
183
184 } // anonymous namespace
185
// Record that pg 'ps' of this pool was last clean at 'last_epoch_clean'.
// Maintains three pieces of state:
//  - epoch_by_pg: per-pg last-clean epoch (0 == never reported)
//  - floor: minimum last-clean epoch across the tracked pgs
//  - next_missing: lowest ps that has not yet reported clean
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow on demand; new slots start at 0 (= never reported)
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // NOTE(review): min_element scans never-reported (0) slots too, so
      // floor drops to 0 while any pg is missing — presumably fine since
      // get_lower_bound() also returns 0 in that case; confirm.
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this report filled the gap at next_missing; advance it past every
  // contiguously-reported pg
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
216
217 void LastEpochClean::remove_pool(uint64_t pool)
218 {
219 report_by_pool.erase(pool);
220 }
221
222 void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
223 {
224 auto& lec = report_by_pool[pg.pool()];
225 return lec.report(pg.ps(), last_epoch_clean);
226 }
227
228 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
229 {
230 auto floor = latest.get_epoch();
231 for (auto& pool : latest.get_pools()) {
232 auto reported = report_by_pool.find(pool.first);
233 if (reported == report_by_pool.end()) {
234 return 0;
235 }
236 if (reported->second.next_missing < pool.second.get_pg_num()) {
237 return 0;
238 }
239 if (reported->second.floor < floor) {
240 floor = reported->second.floor;
241 }
242 }
243 return floor;
244 }
245
246
// Completion callback for a background osdmap->pg mapping job: on success
// it refreshes the creating-pgs state and notifies pg-create subscribers.
// 'start' is read externally (see start_mapping()) for timing logs.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // when the job was kicked off, for duration logging
  epoch_t epoch;   // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted/canceled; do nothing in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
264
265 #undef dout_prefix
266 #define dout_prefix _prefix(_dout, mon, osdmap)
267 static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
268 return *_dout << "mon." << mon->name << "@" << mon->rank
269 << "(" << mon->get_state_name()
270 << ").osd e" << osdmap.get_epoch() << " ";
271 }
272
// OSDMonitor is the PaxosService that manages the cluster's OSDMap.
// inc_osd_cache / full_osd_cache hold recently-encoded incremental and
// full maps (sized by mon_osd_cache_size); 'mapper' runs pg-mapping jobs
// on the monitor's shared CPU thread pool.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{}
285
286 bool OSDMonitor::_have_pending_crush()
287 {
288 return pending_inc.crush.length() > 0;
289 }
290
291 CrushWrapper &OSDMonitor::_get_stable_crush()
292 {
293 return *osdmap.crush;
294 }
295
296 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
297 {
298 bufferlist bl;
299 if (pending_inc.crush.length())
300 bl = pending_inc.crush;
301 else
302 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
303
304 auto p = bl.cbegin();
305 newcrush.decode(p);
306 }
307
// Build osdmap epoch 1 for a brand-new cluster and stash its encoding in
// the pending incremental as a full map.  Adopts an mkfs-provided osdmap
// if one was stored, otherwise builds a minimal default map; then sets the
// default flags, full/backfill/nearfull ratios, and release requirements.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  // an osdmap may have been provided at mkfs time
  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (> 1.0); normalize to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
    // debug knobs to start a new cluster at an older require_osd_release
    if (g_conf()->mon_debug_no_require_mimic) {
      derr << __func__ << " mon_debug_no_require_mimic=true and nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_LUMINOUS;
    } else {
      derr << __func__ << " mon_debug_no_require_nautilus=true" << dendl;
      newmap.require_osd_release = CEPH_RELEASE_MIMIC;
    }
  } else {
    newmap.require_osd_release = CEPH_RELEASE_NAUTILUS;
    int r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client.c_str());
    if (r <= 0) {
      // misconfigured release name is fatal at cluster creation
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
366
367 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
368 {
369 s.insert(service_name);
370 s.insert(OSD_PG_CREATING_PREFIX);
371 s.insert(OSD_METADATA_PREFIX);
372 s.insert(OSD_SNAP_PREFIX);
373 }
374
// Bring the in-memory osdmap up to date with the paxos store: load the
// newest stashed full map, then apply every committed incremental past it,
// writing the derived full map for each epoch back to disk as we go, and
// finally refresh all state that depends on the map (subs, logger,
// messenger features, down->out tracking, background mapping job).
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // a stale mapping job is useless once the map moves; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first full map actually on disk
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // fast-forward the in-memory map to the newest stashed full map
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // reload the persisted creating-pgs state (0 return = found)
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NB: each dout(20)..dendl pair below opens/closes its own scope
        // (dout macro), so the two local JSONFormatters do not collide.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // the mkfs-provided osdmap is consumed by epoch 1; drop it
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // keep individual transactions bounded in size
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out pending map with current osd states
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();

  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
602
// (Re)start the background job that computes pg->osd mappings for the
// current osdmap.  Any previous job is aborted first; the completion
// callback (C_UpdateCreatingPGs) refreshes creating-pgs state and wakes
// pg-create subscribers.  With no pools there is nothing to map.
void OSDMonitor::start_mapping()
{
  // initiate mapping job
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }
  if (!osdmap.get_pools().empty()) {
    auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
    mapping_job = mapping.start_update(osdmap, mapper,
                                       g_conf()->mon_osd_mapping_pgs_per_chunk);
    dout(10) << __func__ << " started mapping job " << mapping_job.get()
             << " at " << fin->start << dendl;
    mapping_job->set_finish_event(fin);
  } else {
    dout(10) << __func__ << " no pools, no mapping job" << dendl;
    mapping_job = nullptr;
  }
}
623
624 void OSDMonitor::update_msgr_features()
625 {
626 set<int> types;
627 types.insert((int)entity_name_t::TYPE_OSD);
628 types.insert((int)entity_name_t::TYPE_CLIENT);
629 types.insert((int)entity_name_t::TYPE_MDS);
630 types.insert((int)entity_name_t::TYPE_MON);
631 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
632 uint64_t mask;
633 uint64_t features = osdmap.get_features(*q, &mask);
634 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
635 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
636 ceph::net::Policy p = mon->messenger->get_policy(*q);
637 p.features_required = (p.features_required & ~mask) | features;
638 mon->messenger->set_policy(*q, p);
639 }
640 }
641 }
642
// Called when this monitor's paxos service becomes active.  The leader
// logs the map and performs the one-time pool priority conversion; a peon
// re-dispatches any failure reports it queued while inactive.  Both kick
// off a fresh pg-mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // drain queued failure ops back through dispatch()
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
666
667 void OSDMonitor::on_restart()
668 {
669 last_osd_report.clear();
670 }
671
// Shutdown hook: cancel any in-flight mapping job and discard queued
// failure reports (their waiters are dropped along with them).
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
686
687 void OSDMonitor::update_logger()
688 {
689 dout(10) << "update_logger" << dendl;
690
691 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
692 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
693 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
694 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
695 }
696
// Start a new pending incremental (epoch+1) on top of the committed map.
// Also applies two safety nets: re-derives any unset full/backfill/
// nearfull ratios from config, and rewrites legacy CRUSH "ruleset" ids
// into proper rule ids (renumbering the rules and repointing every pool).
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // ratios from config may be percentages (> 1.0); normalize to fractions
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
        pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
              << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
        pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
              << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
        pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
              << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
                                           pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
              << osdmap.get_pool_name(pool_id) << " crush ruleset "
              << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
        pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
765
// Produce the creating-pgs set that should accompany incremental 'inc'
// (whose post-application map is 'nextmap'): scan for pools added/removed
// since the last scan, drop pgs that were reported created or that no
// longer exist in nextmap, then admit queued pgs up to the
// mon_osd_max_creating_pgs in-flight limit.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // copy under the lock; the rest of the work is on our private copy
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // admit as many pgs from this pool as the in-flight budget allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(pgid, make_pair(inc.epoch,
                                                    p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
867
// Decide which pg_temp mappings to prime into the pending incremental so
// acting sets stay usable across the map change.  Primes *all* pgs when
// the change is broad (new CRUSH map, newly-up osds, weight increases, or
// when the per-osd estimate gets too large); otherwise primes only pgs
// touching the "interesting" osds, bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds flapping from up to some other state are interesting
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate targeted-priming cost from the first osd's pg count;
    // fall back to full priming if it looks too expensive
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // materialize the post-incremental map we are priming against
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // full prime via a parallel mapper job, with a wall-clock budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk);
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // targeted prime: walk only pgs on the interesting osds, checking the
    // clock every 'chunk' pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
970
// Prime a pg_temp entry for one pg: if its acting set would change in the
// 'next' map, pin the *current* acting set as pg_temp so I/O can continue
// until the new acting set is ready (an empty vector clears pg_temp when
// next up == next acting).  Skips pgs that are still being created, don't
// exist, or where priming cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the background mapper) ...
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // ... versus the mapping in the next map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    // (emplace never overwrites an existing pg_temp entry)
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1018
1019 /**
1020 * @note receiving a transaction in this function gives a fair amount of
1021 * freedom to the service implementation if it does need it. It shouldn't.
1022 */
void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  dout(10) << "encode_pending e " << pending_inc.epoch
	   << dendl;

  // opportunistically prune old full osdmaps from the store first; this
  // may add its own changes to the transaction
  if (do_prune(t)) {
    dout(1) << __func__ << " osdmap full prune encoded e"
	    << pending_inc.epoch << dendl;
  }

  // finalize up pending_inc
  pending_inc.modified = ceph_clock_now();

  int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
  ceph_assert(r == 0);

  // if a background mapping job finished in time, use it to prime
  // pg_temp mappings for the next epoch; otherwise skip (and abort a
  // still-running job so it doesn't race with us)
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left" << dendl;
      mapping_job->abort();
    } else if (mapping.get_epoch() < osdmap.get_epoch()) {
      dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
	      << mapping_job.get() << " is prior epoch "
	      << mapping.get_epoch() << dendl;
    } else {
      if (g_conf()->mon_osd_prime_pg_temp) {
	maybe_prime_pg_temp();
      }
    }
  } else if (g_conf()->mon_osd_prime_pg_temp) {
    dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
	    << dendl;
  }
  mapping_job.reset();

  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
  auto p = pending_inc.new_state.begin();
  while (p != pending_inc.new_state.end()) {
    if (p->second == 0) {
      dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
      p = pending_inc.new_state.erase(p);
    } else {
      // an UP state bit flip means an osd went down; record the time
      if (p->second & CEPH_OSD_UP) {
	pending_inc.new_last_up_change = pending_inc.modified;
      }
      ++p;
    }
  }
  if (!pending_inc.new_up_client.empty()) {
    pending_inc.new_last_up_change = pending_inc.modified;
  }
  // track the last time any osd's in/out weight changed
  for (auto& i : pending_inc.new_weight) {
    if (i.first > osdmap.max_osd) {
      if (i.second) {
	// new osd is already marked in
	pending_inc.new_last_in_change = pending_inc.modified;
      }
    } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
      // existing osd marked in or out
      pending_inc.new_last_in_change = pending_inc.modified;
    }
  }

  // apply pending_inc to a scratch map so we can derive further changes
  // (pg_temp cleanup, pool full flags, release upgrades) from the state
  // the cluster will be in after this epoch commits
  {
    OSDMap tmp;
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // clean pg_temp mappings
    OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);

    // clean inappropriate pg_upmap/pg_upmap_items (if any)
    osdmap.maybe_remove_pg_upmaps(cct, osdmap, tmp, &pending_inc);

    // update creating pgs first so that we can remove the created pgid and
    // process the pool flag removal below in the same osdmap epoch.
    auto pending_creatings = update_pending_pgs(pending_inc, tmp);
    bufferlist creatings_bl;
    encode(pending_creatings, creatings_bl);
    t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);

    // remove any old (or incompat) POOL_CREATING flags
    for (auto& i : tmp.get_pools()) {
      if (tmp.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	// pre-nautilus OSDMaps shouldn't get this flag.
	if (pending_inc.new_pools.count(i.first)) {
	  pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
	}
      }
      if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
	  !pending_creatings.still_creating_pool(i.first)) {
	dout(10) << __func__ << " done creating pool " << i.first
		 << ", clearing CREATING flag" << dendl;
	if (pending_inc.new_pools.count(i.first) == 0) {
	  // load the original pool info before modifying flags
	  pending_inc.new_pools[i.first] = i.second;
	}
	pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
      }
    }

    // remove any legacy osdmap nearfull/full flags
    {
      if (tmp.test_flag(CEPH_OSDMAP_FULL | CEPH_OSDMAP_NEARFULL)) {
	dout(10) << __func__ << " clearing legacy osdmap nearfull/full flag"
		 << dendl;
	remove_flag(CEPH_OSDMAP_NEARFULL);
	remove_flag(CEPH_OSDMAP_FULL);
      }
    }
    // collect which pools are currently affected by
    // the near/backfill/full osd(s),
    // and set per-pool near/backfill/full flag instead
    set<int64_t> full_pool_ids;
    set<int64_t> backfillfull_pool_ids;
    set<int64_t> nearfull_pool_ids;
    tmp.get_full_pools(cct,
		       &full_pool_ids,
		       &backfillfull_pool_ids,
		       &nearfull_pool_ids);
    if (full_pool_ids.empty() ||
	backfillfull_pool_ids.empty() ||
	nearfull_pool_ids.empty()) {
      // normal case - no nearfull, backfillfull or full osds
      // try cancel any improper nearfull/backfillfull/full pool
      // flags first
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
	    nearfull_pool_ids.empty()) {
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s nearfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    // load original pool info first!
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
	    backfillfull_pool_ids.empty()) {
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s backfillfull flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
	    full_pool_ids.empty()) {
	  if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	    // set by EQUOTA, skipping
	    continue;
	  }
	  dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		   << "'s full flag" << dendl;
	  if (pending_inc.new_pools.count(p) == 0) {
	    pending_inc.new_pools[p] = pool.second;
	  }
	  pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
	}
      }
    }
    if (!full_pool_ids.empty()) {
      dout(10) << __func__ << " marking pool(s) " << full_pool_ids
	       << " as full" << dendl;
      for (auto &p: full_pool_ids) {
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
	  continue;
	}
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = tmp.pools[p];
	}
	// FULL supersedes the weaker flags
	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_FULL for pools which are no longer full too
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (full_pool_ids.count(p)) {
	  // skip pools we have just marked as full above
	  continue;
	}
	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
	    tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	  // don't touch if currently is not full
	  // or is running out of quota (and hence considered as full)
	  continue;
	}
	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		 << "'s full flag" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = pool.second;
	}
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
      }
    }
    if (!backfillfull_pool_ids.empty()) {
      for (auto &p: backfillfull_pool_ids) {
	if (full_pool_ids.count(p)) {
	  // skip pools we have already considered as full above
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	  // make sure FLAG_FULL is truly set, so we are safe not
	  // to set a extra (redundant) FLAG_BACKFILLFULL flag
	  ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	  // don't bother if pool is already marked as backfillfull
	  continue;
	}
	dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		 << "'s as backfillfull" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = tmp.pools[p];
	}
	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_BACKFILLFULL for pools
      // which are no longer backfillfull too
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	  // skip pools we have just marked as backfillfull/full above
	  continue;
	}
	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
	  // and don't touch if currently is not backfillfull
	  continue;
	}
	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		 << "'s backfillfull flag" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = pool.second;
	}
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
      }
    }
    if (!nearfull_pool_ids.empty()) {
      for (auto &p: nearfull_pool_ids) {
	if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	  // make sure FLAG_FULL is truly set, so we are safe not
	  // to set a extra (redundant) FLAG_NEARFULL flag
	  ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
	  continue;
	}
	if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	  // don't bother if pool is already marked as nearfull
	  continue;
	}
	dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
		 << "'s as nearfull" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = tmp.pools[p];
	}
	pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
      }
      // cancel FLAG_NEARFULL for pools
      // which are no longer nearfull too
      for (auto &pool: tmp.get_pools()) {
	auto p = pool.first;
	if (full_pool_ids.count(p) ||
	    backfillfull_pool_ids.count(p) ||
	    nearfull_pool_ids.count(p)) {
	  // skip pools we have just marked as
	  // nearfull/backfillfull/full above
	  continue;
	}
	if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
	  // and don't touch if currently is not nearfull
	  continue;
	}
	dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
		 << "'s nearfull flag" << dendl;
	if (pending_inc.new_pools.count(p) == 0) {
	  pending_inc.new_pools[p] = pool.second;
	}
	pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
      }
    }

    // min_compat_client?
    if (tmp.require_min_compat_client == 0) {
      auto mv = tmp.get_min_compat_client();
      dout(1) << __func__ << " setting require_min_compat_client to currently "
	      << "required " << ceph_release_name(mv) << dendl;
      mon->clog->info() << "setting require_min_compat_client to currently "
			<< "required " << ceph_release_name(mv);
      pending_inc.new_require_min_compat_client = mv;
    }

    // upgrade to mimic?
    if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC &&
	tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
      dout(10) << __func__ << " first mimic+ epoch" << dendl;
      // record this epoch as the deletion for all legacy removed_snaps
      for (auto& p : tmp.get_pools()) {
	// update every pool
	if (pending_inc.new_pools.count(p.first) == 0) {
	  pending_inc.new_pools[p.first] = p.second;
	}
	auto& pi = pending_inc.new_pools[p.first];
	if (pi.snap_seq == 0) {
	  // no snaps on this pool
	  continue;
	}
	// pools predating the split of snap modes get classified here:
	// non-empty removed_snaps implies selfmanaged snaps were used
	if ((pi.flags & (pg_pool_t::FLAG_SELFMANAGED_SNAPS |
			 pg_pool_t::FLAG_POOL_SNAPS)) == 0) {
	  if (!pi.removed_snaps.empty()) {
	    pi.flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	  } else {
	    pi.flags |= pg_pool_t::FLAG_POOL_SNAPS;
	  }
	}

	// Make all previously removed snaps appear to be removed in this
	// epoch.  this populates removed_snaps_queue.  The OSD will subtract
	// off its purged_snaps, as before, and this set will shrink over the
	// following epochs as the purged snaps are reported back through the
	// mgr.
	OSDMap::snap_interval_set_t removed;
	if (!p.second.removed_snaps.empty()) {
	  // different flavor of interval_set :(
	  for (auto q = p.second.removed_snaps.begin();
	       q != p.second.removed_snaps.end();
	       ++q) {
	    removed.insert(q.get_start(), q.get_len());
	  }
	} else {
	  // pool snaps: everything up to snap_seq that is not a live
	  // snapshot must have been removed
	  for (snapid_t s = 1; s <= pi.get_snap_seq(); s = s + 1) {
	    if (pi.snaps.count(s) == 0) {
	      removed.insert(s);
	    }
	  }
	}
	pending_inc.new_removed_snaps[p.first].union_of(removed);

	dout(10) << __func__ << " converting pool " << p.first
		 << " with " << p.second.removed_snaps.size()
		 << " legacy removed_snaps" << dendl;
	string k = make_snap_epoch_key(p.first, pending_inc.epoch);
	bufferlist v;
	encode(p.second.removed_snaps, v);
	t->put(OSD_SNAP_PREFIX, k, v);
	for (auto q = p.second.removed_snaps.begin();
	     q != p.second.removed_snaps.end();
	     ++q) {
	  bufferlist v;
	  string k = make_snap_key_value(p.first, q.get_start(),
					 q.get_len(), pending_inc.epoch, &v);
	  t->put(OSD_SNAP_PREFIX, k, v);
	}
      }
    }
    if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS &&
	tmp.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
      dout(10) << __func__ << " first nautilus+ epoch" << dendl;
      // add creating flags?
      for (auto& i : tmp.get_pools()) {
	if (pending_creatings.still_creating_pool(i.first)) {
	  dout(10) << __func__ << " adding CREATING flag to pool " << i.first
		   << dendl;
	  if (pending_inc.new_pools.count(i.first) == 0) {
	    pending_inc.new_pools[i.first] = i.second;
	  }
	  pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
	}
      }
      // adjust blacklist items to all be TYPE_ANY
      for (auto& i : tmp.blacklist) {
	auto a = i.first;
	a.set_type(entity_addr_t::TYPE_ANY);
	pending_inc.new_blacklist[a] = i.second;
	pending_inc.old_blacklist.push_back(i.first);
      }
    }
  }

  // tell me about it
  for (auto i = pending_inc.new_state.begin();
       i != pending_inc.new_state.end();
       ++i) {
    int s = i->second ? i->second : CEPH_OSD_UP;
    if (s & CEPH_OSD_UP)
      dout(2) << " osd." << i->first << " DOWN" << dendl;
    if (s & CEPH_OSD_EXISTS)
      dout(2) << " osd." << i->first << " DNE" << dendl;
  }
  for (auto i = pending_inc.new_up_client.begin();
       i != pending_inc.new_up_client.end();
       ++i) {
    //FIXME: insert cluster addresses too
    dout(2) << " osd." << i->first << " UP " << i->second << dendl;
  }
  for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
       i != pending_inc.new_weight.end();
       ++i) {
    if (i->second == CEPH_OSD_OUT) {
      dout(2) << " osd." << i->first << " OUT" << dendl;
    } else if (i->second == CEPH_OSD_IN) {
      dout(2) << " osd." << i->first << " IN" << dendl;
    } else {
      dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
    }
  }

  // features for osdmap and its incremental
  uint64_t features;

  // encode full map and determine its crc
  OSDMap tmp;
  {
    tmp.deepish_copy_from(osdmap);
    tmp.apply_incremental(pending_inc);

    // determine appropriate features
    features = tmp.get_encoding_features();
    dout(10) << __func__ << " encoding full map with "
	     << ceph_release_name(tmp.require_osd_release)
	     << " features " << features << dendl;

    // the features should be a subset of the mon quorum's features!
    ceph_assert((features & ~mon->get_quorum_con_features()) == 0);

    bufferlist fullbl;
    encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
    pending_inc.full_crc = tmp.get_crc();

    // include full map in the txn.  note that old monitors will
    // overwrite this.  new ones will now skip the local full map
    // encode and reload from this.
    put_version_full(t, pending_inc.epoch, fullbl);
  }

  // encode
  ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
  bufferlist bl;
  encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);

  dout(20) << " full_crc " << tmp.get_crc()
	   << " inc_crc " << pending_inc.inc_crc << dendl;

  /* put everything in the transaction */
  put_version(t, pending_inc.epoch, bl);
  put_last_committed(t, pending_inc.epoch);

  // metadata, too!
  for (map<int,bufferlist>::iterator p = pending_metadata.begin();
       p != pending_metadata.end();
       ++p)
    t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
  for (set<int>::iterator p = pending_metadata_rm.begin();
       p != pending_metadata_rm.end();
       ++p)
    t->erase(OSD_METADATA_PREFIX, stringify(*p));
  pending_metadata.clear();
  pending_metadata_rm.clear();

  // removed_snaps
  if (tmp.require_osd_release >= CEPH_RELEASE_MIMIC) {
    for (auto& i : pending_inc.new_removed_snaps) {
      {
	// all snaps removed this epoch
	string k = make_snap_epoch_key(i.first, pending_inc.epoch);
	bufferlist v;
	encode(i.second, v);
	t->put(OSD_SNAP_PREFIX, k, v);
      }
      // plus one key per removed interval
      for (auto q = i.second.begin();
	   q != i.second.end();
	   ++q) {
	bufferlist v;
	string k = make_snap_key_value(i.first, q.get_start(),
				       q.get_len(), pending_inc.epoch, &v);
	t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
    for (auto& i : pending_inc.new_purged_snaps) {
      for (auto q = i.second.begin();
	   q != i.second.end();
	   ++q) {
	bufferlist v;
	string k = make_snap_purged_key_value(i.first, q.get_start(),
					      q.get_len(), pending_inc.epoch,
					      &v);
	t->put(OSD_SNAP_PREFIX, k, v);
      }
    }
  }

  // health
  health_check_map_t next;
  tmp.check_health(&next);
  encode_health(next, t);
}
1526
1527 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
1528 {
1529 bufferlist bl;
1530 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
1531 if (r < 0)
1532 return r;
1533 try {
1534 auto p = bl.cbegin();
1535 decode(m, p);
1536 }
1537 catch (buffer::error& e) {
1538 if (err)
1539 *err << "osd." << osd << " metadata is corrupt";
1540 return -EIO;
1541 }
1542 return 0;
1543 }
1544
1545 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
1546 {
1547 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
1548 if (osdmap.is_up(osd)) {
1549 map<string,string> meta;
1550 load_metadata(osd, meta, nullptr);
1551 auto p = meta.find(field);
1552 if (p == meta.end()) {
1553 (*out)["unknown"]++;
1554 } else {
1555 (*out)[p->second]++;
1556 }
1557 }
1558 }
1559 }
1560
1561 void OSDMonitor::count_metadata(const string& field, Formatter *f)
1562 {
1563 map<string,int> by_val;
1564 count_metadata(field, &by_val);
1565 f->open_object_section(field.c_str());
1566 for (auto& p : by_val) {
1567 f->dump_int(p.first.c_str(), p.second);
1568 }
1569 f->close_section();
1570 }
1571
1572 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
1573 {
1574 map<string, string> metadata;
1575 int r = load_metadata(osd, metadata, nullptr);
1576 if (r < 0)
1577 return r;
1578
1579 auto it = metadata.find("osd_objectstore");
1580 if (it == metadata.end())
1581 return -ENOENT;
1582 *type = it->second;
1583 return 0;
1584 }
1585
1586 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
1587 const pg_pool_t &pool,
1588 ostream *err)
1589 {
1590 // just check a few pgs for efficiency - this can't give a guarantee anyway,
1591 // since filestore osds could always join the pool later
1592 set<int> checked_osds;
1593 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
1594 vector<int> up, acting;
1595 pg_t pgid(ps, pool_id);
1596 osdmap.pg_to_up_acting_osds(pgid, up, acting);
1597 for (int osd : up) {
1598 if (checked_osds.find(osd) != checked_osds.end())
1599 continue;
1600 string objectstore_type;
1601 int r = get_osd_objectstore_type(osd, &objectstore_type);
1602 // allow with missing metadata, e.g. due to an osd never booting yet
1603 if (r < 0 || objectstore_type == "bluestore") {
1604 checked_osds.insert(osd);
1605 continue;
1606 }
1607 *err << "osd." << osd << " uses " << objectstore_type;
1608 return false;
1609 }
1610 }
1611 return true;
1612 }
1613
1614 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
1615 {
1616 map<string,string> m;
1617 if (int r = load_metadata(osd, m, err))
1618 return r;
1619 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
1620 f->dump_string(p->first.c_str(), p->second);
1621 return 0;
1622 }
1623
1624 void OSDMonitor::print_nodes(Formatter *f)
1625 {
1626 // group OSDs by their hosts
1627 map<string, list<int> > osds; // hostname => osd
1628 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
1629 map<string, string> m;
1630 if (load_metadata(osd, m, NULL)) {
1631 continue;
1632 }
1633 map<string, string>::iterator hostname = m.find("hostname");
1634 if (hostname == m.end()) {
1635 // not likely though
1636 continue;
1637 }
1638 osds[hostname->second].push_back(osd);
1639 }
1640
1641 dump_services(f, osds, "osd");
1642 }
1643
1644 void OSDMonitor::share_map_with_random_osd()
1645 {
1646 if (osdmap.get_num_up_osds() == 0) {
1647 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
1648 return;
1649 }
1650
1651 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
1652 if (!s) {
1653 dout(10) << __func__ << " no up osd on our session map" << dendl;
1654 return;
1655 }
1656
1657 dout(10) << "committed, telling random " << s->name
1658 << " all about it" << dendl;
1659
1660 // get feature of the peer
1661 // use quorum_con_features, if it's an anonymous connection.
1662 uint64_t features = s->con_features ? s->con_features :
1663 mon->get_quorum_con_features();
1664 // whatev, they'll request more if they need it
1665 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
1666 s->con->send_message(m);
1667 // NOTE: do *not* record osd has up to this epoch (as we do
1668 // elsewhere) as they may still need to request older values.
1669 }
1670
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version we may trim up to, or 0 if
  // trimming is not currently possible.
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    // don't trim while pgs are still being created
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  // debug knob to hold off trimming entirely
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // start from the oldest epoch any osd might still need
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an operator may force a higher trim point via config
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs epochs
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only worth trimming if the floor is above what is already trimmed
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
1713
1714 epoch_t OSDMonitor::get_min_last_epoch_clean() const
1715 {
1716 auto floor = last_epoch_clean.get_lower_bound(osdmap);
1717 // also scan osd epochs
1718 // don't trim past the oldest reported osd epoch
1719 for (auto& osd_epoch : osd_epochs) {
1720 if (osd_epoch.second < floor) {
1721 floor = osd_epoch.second;
1722 }
1723 }
1724 return floor;
1725 }
1726
1727 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
1728 version_t first)
1729 {
1730 dout(10) << __func__ << " including full map for e " << first << dendl;
1731 bufferlist bl;
1732 get_version_full(first, bl);
1733 put_version_full(tx, first, bl);
1734
1735 if (has_osdmap_manifest &&
1736 first > osdmap_manifest.get_first_pinned()) {
1737 _prune_update_trimmed(tx, first);
1738 }
1739 }
1740
1741
1742 /* full osdmap prune
1743 *
1744 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
1745 */
1746
1747 void OSDMonitor::load_osdmap_manifest()
1748 {
1749 bool store_has_manifest =
1750 mon->store->exists(get_service_name(), "osdmap_manifest");
1751
1752 if (!store_has_manifest) {
1753 if (!has_osdmap_manifest) {
1754 return;
1755 }
1756
1757 dout(20) << __func__
1758 << " dropping osdmap manifest from memory." << dendl;
1759 osdmap_manifest = osdmap_manifest_t();
1760 has_osdmap_manifest = false;
1761 return;
1762 }
1763
1764 dout(20) << __func__
1765 << " osdmap manifest detected in store; reload." << dendl;
1766
1767 bufferlist manifest_bl;
1768 int r = get_value("osdmap_manifest", manifest_bl);
1769 if (r < 0) {
1770 derr << __func__ << " unable to read osdmap version manifest" << dendl;
1771 ceph_abort_msg("error reading manifest");
1772 }
1773 osdmap_manifest.decode(manifest_bl);
1774 has_osdmap_manifest = true;
1775
1776 dout(10) << __func__ << " store osdmap manifest pinned ("
1777 << osdmap_manifest.get_first_pinned()
1778 << " .. "
1779 << osdmap_manifest.get_last_pinned()
1780 << ")"
1781 << dendl;
1782 }
1783
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass is warranted, based on the
  // committed epoch range, the prune config options, and the current
  // pinning state of the on-disk manifest.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // NOTE: unsigned arithmetic; the first check below returns before
  // last_to_pin is used whenever last < min_osdmap_epochs.
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
		" is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous prune already covered everything prunable so far
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits in the prunable range
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
1843
1844 void OSDMonitor::_prune_update_trimmed(
1845 MonitorDBStore::TransactionRef tx,
1846 version_t first)
1847 {
1848 dout(10) << __func__
1849 << " first " << first
1850 << " last_pinned " << osdmap_manifest.get_last_pinned()
1851 << " last_pinned " << osdmap_manifest.get_last_pinned()
1852 << dendl;
1853
1854 osdmap_manifest_t manifest = osdmap_manifest;
1855
1856 if (!manifest.is_pinned(first)) {
1857 manifest.pin(first);
1858 }
1859
1860 set<version_t>::iterator p_end = manifest.pinned.find(first);
1861 set<version_t>::iterator p = manifest.pinned.begin();
1862 manifest.pinned.erase(p, p_end);
1863 ceph_assert(manifest.get_first_pinned() == first);
1864
1865 if (manifest.get_last_pinned() == first+1 ||
1866 manifest.pinned.size() == 1) {
1867 // we reached the end of the line, as pinned maps go; clean up our
1868 // manifest, and let `should_prune()` decide whether we should prune
1869 // again.
1870 tx->erase(get_service_name(), "osdmap_manifest");
1871 return;
1872 }
1873
1874 bufferlist bl;
1875 manifest.encode(bl);
1876 tx->put(get_service_name(), "osdmap_manifest", bl);
1877 }
1878
1879 void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
1880 {
1881 dout(1) << __func__ << dendl;
1882
1883 version_t pin_first;
1884
1885 // verify constrainsts on stable in-memory state
1886 if (!has_osdmap_manifest) {
1887 // we must have never pruned, OR if we pruned the state must no longer
1888 // be relevant (i.e., the state must have been removed alongside with
1889 // the trim that *must* have removed past the last pinned map in a
1890 // previous prune).
1891 ceph_assert(osdmap_manifest.pinned.empty());
1892 ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
1893 pin_first = get_first_committed();
1894
1895 } else {
1896 // we must have pruned in the past AND its state is still relevant
1897 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
1898 // and thus we still hold a manifest in the store).
1899 ceph_assert(!osdmap_manifest.pinned.empty());
1900 ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
1901 ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());
1902
1903 dout(10) << __func__
1904 << " first_pinned " << osdmap_manifest.get_first_pinned()
1905 << " last_pinned " << osdmap_manifest.get_last_pinned()
1906 << dendl;
1907
1908 pin_first = osdmap_manifest.get_last_pinned();
1909 }
1910
1911 manifest.pin(pin_first);
1912 }
1913
1914 bool OSDMonitor::_prune_sanitize_options() const
1915 {
1916 uint64_t prune_interval =
1917 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
1918 uint64_t prune_min =
1919 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
1920 uint64_t txsize =
1921 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
1922
1923 bool r = true;
1924
1925 if (prune_interval == 0) {
1926 derr << __func__
1927 << " prune is enabled BUT prune interval is zero; abort."
1928 << dendl;
1929 r = false;
1930 } else if (prune_interval == 1) {
1931 derr << __func__
1932 << " prune interval is equal to one, which essentially means"
1933 " no pruning; abort."
1934 << dendl;
1935 r = false;
1936 }
1937 if (prune_min == 0) {
1938 derr << __func__
1939 << " prune is enabled BUT prune min is zero; abort."
1940 << dendl;
1941 r = false;
1942 }
1943 if (prune_interval > prune_min) {
1944 derr << __func__
1945 << " impossible to ascertain proper prune interval because"
1946 << " it is greater than the minimum prune epochs"
1947 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
1948 << dendl;
1949 r = false;
1950 }
1951
1952 if (txsize < prune_interval - 1) {
1953 derr << __func__
1954 << "'mon_osdmap_full_prune_txsize' (" << txsize
1955 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
1956 << "); abort." << dendl;
1957 r = false;
1958 }
1959 return r;
1960 }
1961
1962 bool OSDMonitor::is_prune_enabled() const {
1963 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
1964 }
1965
1966 bool OSDMonitor::is_prune_supported() const {
1967 return mon->get_required_mon_features().contains_any(
1968 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
1969 }
1970
1971 /** do_prune
1972 *
1973 * @returns true if has side-effects; false otherwise.
1974 */
1975 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
1976 {
1977 bool enabled = is_prune_enabled();
1978
1979 dout(1) << __func__ << " osdmap full prune "
1980 << ( enabled ? "enabled" : "disabled")
1981 << dendl;
1982
1983 if (!enabled || !_prune_sanitize_options() || !should_prune()) {
1984 return false;
1985 }
1986
1987 // we are beyond the minimum prune versions, we need to remove maps because
1988 // otherwise the store will grow unbounded and we may end up having issues
1989 // with available disk space or store hangs.
1990
1991 // we will not pin all versions. We will leave a buffer number of versions.
1992 // this allows us the monitor to trim maps without caring too much about
1993 // pinned maps, and then allow us to use another ceph-mon without these
1994 // capabilities, without having to repair the store.
1995
1996 osdmap_manifest_t manifest = osdmap_manifest;
1997
1998 version_t first = get_first_committed();
1999 version_t last = get_last_committed();
2000
2001 version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
2002 version_t last_pinned = manifest.get_last_pinned();
2003 uint64_t prune_interval =
2004 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2005 uint64_t txsize =
2006 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2007
2008 prune_init(manifest);
2009
2010 // we need to get rid of some osdmaps
2011
2012 dout(5) << __func__
2013 << " lc (" << first << " .. " << last << ")"
2014 << " last_pinned " << last_pinned
2015 << " interval " << prune_interval
2016 << " last_to_pin " << last_to_pin
2017 << dendl;
2018
2019 // We will be erasing maps as we go.
2020 //
2021 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2022 //
2023 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2024 // we stop pruning. We could prune the maps between `next_to_pin` and
2025 // `last_to_pin`, but by not doing it we end up with neater pruned
2026 // intervals, aligned with `prune_interval`. Besides, this should not be a
2027 // problem as long as `prune_interval` is set to a sane value, instead of
2028 // hundreds or thousands of maps.
2029
2030 auto map_exists = [this](version_t v) {
2031 string k = mon->store->combine_strings("full", v);
2032 return mon->store->exists(get_service_name(), k);
2033 };
2034
2035 // 'interval' represents the number of maps from the last pinned
2036 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2037 // version 11 next; all intermediate versions will be removed.
2038 //
2039 // 'txsize' represents the maximum number of versions we'll be removing in
2040 // this iteration. If 'txsize' is large enough to perform multiple passes
2041 // pinning and removing maps, we will do so; if not, we'll do at least one
2042 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2043 // ensure that we never go *over* the maximum.
2044
2045 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2046 uint64_t removal_interval = prune_interval - 1;
2047
2048 if (txsize < removal_interval) {
2049 dout(5) << __func__
2050 << " setting txsize to removal interval size ("
2051 << removal_interval << " versions"
2052 << dendl;
2053 txsize = removal_interval;
2054 }
2055 ceph_assert(removal_interval > 0);
2056
2057 uint64_t num_pruned = 0;
2058 while (num_pruned + removal_interval <= txsize) {
2059 last_pinned = manifest.get_last_pinned();
2060
2061 if (last_pinned + prune_interval > last_to_pin) {
2062 break;
2063 }
2064 ceph_assert(last_pinned < last_to_pin);
2065
2066 version_t next_pinned = last_pinned + prune_interval;
2067 ceph_assert(next_pinned <= last_to_pin);
2068 manifest.pin(next_pinned);
2069
2070 dout(20) << __func__
2071 << " last_pinned " << last_pinned
2072 << " next_pinned " << next_pinned
2073 << " num_pruned " << num_pruned
2074 << " removal interval (" << (last_pinned+1)
2075 << ".." << (next_pinned-1) << ")"
2076 << " txsize " << txsize << dendl;
2077
2078 ceph_assert(map_exists(last_pinned));
2079 ceph_assert(map_exists(next_pinned));
2080
2081 for (version_t v = last_pinned+1; v < next_pinned; ++v) {
2082 ceph_assert(!manifest.is_pinned(v));
2083
2084 dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
2085 string full_key = mon->store->combine_strings("full", v);
2086 tx->erase(get_service_name(), full_key);
2087 ++num_pruned;
2088 }
2089 }
2090
2091 ceph_assert(num_pruned > 0);
2092
2093 bufferlist bl;
2094 manifest.encode(bl);
2095 tx->put(get_service_name(), "osdmap_manifest", bl);
2096
2097 return true;
2098 }
2099
2100
2101 // -------------
2102
2103 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2104 {
2105 op->mark_osdmon_event(__func__);
2106 Message *m = op->get_req();
2107 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2108
2109 switch (m->get_type()) {
2110 // READs
2111 case MSG_MON_COMMAND:
2112 try {
2113 return preprocess_command(op);
2114 } catch (const bad_cmd_get& e) {
2115 bufferlist bl;
2116 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2117 return true;
2118 }
2119 case CEPH_MSG_MON_GET_OSDMAP:
2120 return preprocess_get_osdmap(op);
2121
2122 // damp updates
2123 case MSG_OSD_MARK_ME_DOWN:
2124 return preprocess_mark_me_down(op);
2125 case MSG_OSD_FULL:
2126 return preprocess_full(op);
2127 case MSG_OSD_FAILURE:
2128 return preprocess_failure(op);
2129 case MSG_OSD_BOOT:
2130 return preprocess_boot(op);
2131 case MSG_OSD_ALIVE:
2132 return preprocess_alive(op);
2133 case MSG_OSD_PG_CREATED:
2134 return preprocess_pg_created(op);
2135 case MSG_OSD_PG_READY_TO_MERGE:
2136 return preprocess_pg_ready_to_merge(op);
2137 case MSG_OSD_PGTEMP:
2138 return preprocess_pgtemp(op);
2139 case MSG_OSD_BEACON:
2140 return preprocess_beacon(op);
2141
2142 case CEPH_MSG_POOLOP:
2143 return preprocess_pool_op(op);
2144
2145 case MSG_REMOVE_SNAPS:
2146 return preprocess_remove_snaps(op);
2147
2148 default:
2149 ceph_abort();
2150 return true;
2151 }
2152 }
2153
2154 bool OSDMonitor::prepare_update(MonOpRequestRef op)
2155 {
2156 op->mark_osdmon_event(__func__);
2157 Message *m = op->get_req();
2158 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2159
2160 switch (m->get_type()) {
2161 // damp updates
2162 case MSG_OSD_MARK_ME_DOWN:
2163 return prepare_mark_me_down(op);
2164 case MSG_OSD_FULL:
2165 return prepare_full(op);
2166 case MSG_OSD_FAILURE:
2167 return prepare_failure(op);
2168 case MSG_OSD_BOOT:
2169 return prepare_boot(op);
2170 case MSG_OSD_ALIVE:
2171 return prepare_alive(op);
2172 case MSG_OSD_PG_CREATED:
2173 return prepare_pg_created(op);
2174 case MSG_OSD_PGTEMP:
2175 return prepare_pgtemp(op);
2176 case MSG_OSD_PG_READY_TO_MERGE:
2177 return prepare_pg_ready_to_merge(op);
2178 case MSG_OSD_BEACON:
2179 return prepare_beacon(op);
2180
2181 case MSG_MON_COMMAND:
2182 try {
2183 return prepare_command(op);
2184 } catch (const bad_cmd_get& e) {
2185 bufferlist bl;
2186 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2187 return true;
2188 }
2189
2190 case CEPH_MSG_POOLOP:
2191 return prepare_pool_op(op);
2192
2193 case MSG_REMOVE_SNAPS:
2194 return prepare_remove_snaps(op);
2195
2196
2197 default:
2198 ceph_abort();
2199 }
2200
2201 return false;
2202 }
2203
2204 bool OSDMonitor::should_propose(double& delay)
2205 {
2206 dout(10) << "should_propose" << dendl;
2207
2208 // if full map, propose immediately! any subsequent changes will be clobbered.
2209 if (pending_inc.fullmap.length())
2210 return true;
2211
2212 // adjust osd weights?
2213 if (!osd_weight.empty() &&
2214 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2215 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2216 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2217 delay = 0.0;
2218 osd_weight.clear();
2219 return true;
2220 }
2221
2222 return PaxosService::should_propose(delay);
2223 }
2224
2225
2226
2227 // ---------------------------
2228 // READs
2229
2230 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
2231 {
2232 op->mark_osdmon_event(__func__);
2233 MMonGetOSDMap *m = static_cast<MMonGetOSDMap*>(op->get_req());
2234
2235 uint64_t features = mon->get_quorum_con_features();
2236 if (op->get_session() && op->get_session()->con_features)
2237 features = op->get_session()->con_features;
2238
2239 dout(10) << __func__ << " " << *m << dendl;
2240 MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
2241 epoch_t first = get_first_committed();
2242 epoch_t last = osdmap.get_epoch();
2243 int max = g_conf()->osd_map_message_max;
2244 ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
2245 for (epoch_t e = std::max(first, m->get_full_first());
2246 e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
2247 ++e, --max) {
2248 bufferlist& bl = reply->maps[e];
2249 int r = get_version_full(e, features, bl);
2250 ceph_assert(r >= 0);
2251 max_bytes -= bl.length();
2252 }
2253 for (epoch_t e = std::max(first, m->get_inc_first());
2254 e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
2255 ++e, --max) {
2256 bufferlist& bl = reply->incremental_maps[e];
2257 int r = get_version(e, features, bl);
2258 ceph_assert(r >= 0);
2259 max_bytes -= bl.length();
2260 }
2261 reply->oldest_map = first;
2262 reply->newest_map = last;
2263 mon->send_reply(op, reply);
2264 return true;
2265 }
2266
2267
2268 // ---------------------------
2269 // UPDATEs
2270
2271 // failure --
2272
2273 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2274 // check permissions
2275 MonSession *session = op->get_session();
2276 if (!session)
2277 return true;
2278 if (!session->is_capable("osd", MON_CAP_X)) {
2279 dout(0) << "got MOSDFailure from entity with insufficient caps "
2280 << session->caps << dendl;
2281 return true;
2282 }
2283 if (fsid != mon->monmap->fsid) {
2284 dout(0) << "check_source: on fsid " << fsid
2285 << " != " << mon->monmap->fsid << dendl;
2286 return true;
2287 }
2288 return false;
2289 }
2290
2291
2292 bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
2293 {
2294 op->mark_osdmon_event(__func__);
2295 MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
2296 // who is target_osd
2297 int badboy = m->get_target_osd();
2298
2299 // check permissions
2300 if (check_source(op, m->fsid))
2301 goto didit;
2302
2303 // first, verify the reporting host is valid
2304 if (m->get_orig_source().is_osd()) {
2305 int from = m->get_orig_source().num();
2306 if (!osdmap.exists(from) ||
2307 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
2308 (osdmap.is_down(from) && m->if_osd_failed())) {
2309 dout(5) << "preprocess_failure from dead osd." << from
2310 << ", ignoring" << dendl;
2311 send_incremental(op, m->get_epoch()+1);
2312 goto didit;
2313 }
2314 }
2315
2316
2317 // weird?
2318 if (osdmap.is_down(badboy)) {
2319 dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
2320 << " " << m->get_target_addrs()
2321 << ", from " << m->get_orig_source() << dendl;
2322 if (m->get_epoch() < osdmap.get_epoch())
2323 send_incremental(op, m->get_epoch()+1);
2324 goto didit;
2325 }
2326 if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
2327 dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
2328 << " " << m->get_target_addrs()
2329 << " != map's " << osdmap.get_addrs(badboy)
2330 << ", from " << m->get_orig_source() << dendl;
2331 if (m->get_epoch() < osdmap.get_epoch())
2332 send_incremental(op, m->get_epoch()+1);
2333 goto didit;
2334 }
2335
2336 // already reported?
2337 if (osdmap.is_down(badboy) ||
2338 osdmap.get_up_from(badboy) > m->get_epoch()) {
2339 dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
2340 << " " << m->get_target_addrs()
2341 << ", from " << m->get_orig_source() << dendl;
2342 if (m->get_epoch() < osdmap.get_epoch())
2343 send_incremental(op, m->get_epoch()+1);
2344 goto didit;
2345 }
2346
2347 if (!can_mark_down(badboy)) {
2348 dout(5) << "preprocess_failure ignoring report of osd."
2349 << m->get_target_osd() << " " << m->get_target_addrs()
2350 << " from " << m->get_orig_source() << dendl;
2351 goto didit;
2352 }
2353
2354 dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
2355 << " " << m->get_target_addrs()
2356 << ", from " << m->get_orig_source() << dendl;
2357 return false;
2358
2359 didit:
2360 mon->no_reply(op);
2361 return true;
2362 }
2363
2364 class C_AckMarkedDown : public C_MonOp {
2365 OSDMonitor *osdmon;
2366 public:
2367 C_AckMarkedDown(
2368 OSDMonitor *osdmon,
2369 MonOpRequestRef op)
2370 : C_MonOp(op), osdmon(osdmon) {}
2371
2372 void _finish(int) override {
2373 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2374 osdmon->mon->send_reply(
2375 op,
2376 new MOSDMarkMeDown(
2377 m->fsid,
2378 m->target_osd,
2379 m->target_addrs,
2380 m->get_epoch(),
2381 false)); // ACK itself does not request an ack
2382 }
2383 ~C_AckMarkedDown() override {
2384 }
2385 };
2386
// Filter an osd's own mark-me-down request.  Returns false to pass a valid
// request on to prepare_mark_me_down(); returns true when the request is
// handled (ignored) here, in which case an ack is still sent if one was
// requested, so the osd does not block waiting for it.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the target must exist, still be up, and match the addrs in our map;
  // otherwise the request is stale -- send the osd newer maps and drop it.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  // even when ignoring, honour the ack request so the osd can proceed
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2425
2426 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2427 {
2428 op->mark_osdmon_event(__func__);
2429 MOSDMarkMeDown *m = static_cast<MOSDMarkMeDown*>(op->get_req());
2430 int target_osd = m->target_osd;
2431
2432 ceph_assert(osdmap.is_up(target_osd));
2433 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2434
2435 mon->clog->info() << "osd." << target_osd << " marked itself down";
2436 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2437 if (m->request_ack)
2438 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2439 return true;
2440 }
2441
2442 bool OSDMonitor::can_mark_down(int i)
2443 {
2444 if (osdmap.is_nodown(i)) {
2445 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
2446 << "will not mark it down" << dendl;
2447 return false;
2448 }
2449
2450 int num_osds = osdmap.get_num_osds();
2451 if (num_osds == 0) {
2452 dout(5) << __func__ << " no osds" << dendl;
2453 return false;
2454 }
2455 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
2456 float up_ratio = (float)up / (float)num_osds;
2457 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
2458 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
2459 << g_conf()->mon_osd_min_up_ratio
2460 << ", will not mark osd." << i << " down" << dendl;
2461 return false;
2462 }
2463 return true;
2464 }
2465
2466 bool OSDMonitor::can_mark_up(int i)
2467 {
2468 if (osdmap.is_noup(i)) {
2469 dout(5) << __func__ << " osd." << i << " is marked as noup, "
2470 << "will not mark it up" << dendl;
2471 return false;
2472 }
2473
2474 return true;
2475 }
2476
2477 /**
2478 * @note the parameter @p i apparently only exists here so we can output the
2479 * osd's id on messages.
2480 */
2481 bool OSDMonitor::can_mark_out(int i)
2482 {
2483 if (osdmap.is_noout(i)) {
2484 dout(5) << __func__ << " osd." << i << " is marked as noout, "
2485 << "will not mark it out" << dendl;
2486 return false;
2487 }
2488
2489 int num_osds = osdmap.get_num_osds();
2490 if (num_osds == 0) {
2491 dout(5) << __func__ << " no osds" << dendl;
2492 return false;
2493 }
2494 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
2495 float in_ratio = (float)in / (float)num_osds;
2496 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
2497 if (i >= 0)
2498 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2499 << g_conf()->mon_osd_min_in_ratio
2500 << ", will not mark osd." << i << " out" << dendl;
2501 else
2502 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
2503 << g_conf()->mon_osd_min_in_ratio
2504 << ", will not mark osds out" << dendl;
2505 return false;
2506 }
2507
2508 return true;
2509 }
2510
2511 bool OSDMonitor::can_mark_in(int i)
2512 {
2513 if (osdmap.is_noin(i)) {
2514 dout(5) << __func__ << " osd." << i << " is marked as noin, "
2515 << "will not mark it in" << dendl;
2516 return false;
2517 }
2518
2519 return true;
2520 }
2521
2522 bool OSDMonitor::check_failures(utime_t now)
2523 {
2524 bool found_failure = false;
2525 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2526 p != failure_info.end();
2527 ++p) {
2528 if (can_mark_down(p->first)) {
2529 found_failure |= check_failure(now, p->first, p->second);
2530 }
2531 }
2532 return found_failure;
2533 }
2534
// Decide whether the accumulated failure reports in 'fi' justify marking
// 'target_osd' down in the pending incremental.  The base heartbeat grace
// is optionally stretched by the laggy history of both the target and its
// reporters.  Returns true when the osd is (or was already) queued to be
// marked down.
bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return true;
  }

  set<string> reporters_by_subtree;
  auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  utime_t max_failed_since = fi.get_failed_since();
  utime_t failed_for = now - max_failed_since;

  utime_t grace = orig_grace;
  double my_grace = 0, peer_grace = 0;
  double decay_k = 0;
  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    double halflife = (double)g_conf()->mon_osd_laggy_halflife;
    // negative decay constant: exp(failed_for * decay_k) halves every
    // 'halflife' seconds, so older laggy history counts for less.
    decay_k = ::log(.5) / halflife;

    // scale grace period based on historical probability of 'lagginess'
    // (false positive failures due to slowness).
    const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
    double decay = exp((double)failed_for * decay_k);
    dout(20) << " halflife " << halflife << " decay_k " << decay_k
             << " failed_for " << failed_for << " decay " << decay << dendl;
    my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
    grace += my_grace;
  }

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  ceph_assert(fi.reporters.size());
  for (map<int,failure_reporter_t>::iterator p = fi.reporters.begin();
       p != fi.reporters.end();
       ++p) {
    // get the parent bucket whose type matches with "reporter_subtree_level".
    // fall back to OSD if the level doesn't exist.
    map<string, string> reporter_loc = osdmap.crush->get_full_location(p->first);
    map<string, string>::iterator iter = reporter_loc.find(reporter_subtree_level);
    if (iter == reporter_loc.end()) {
      reporters_by_subtree.insert("osd." + to_string(p->first));
    } else {
      reporters_by_subtree.insert(iter->second);
    }
    if (g_conf()->mon_osd_adjust_heartbeat_grace) {
      // accumulate the reporters' own laggy contributions; averaged below
      const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }

  if (g_conf()->mon_osd_adjust_heartbeat_grace) {
    peer_grace /= (double)fi.reporters.size();
    grace += peer_grace;
  }

  dout(10) << " osd." << target_osd << " has "
           << fi.reporters.size() << " reporters, "
           << grace << " grace (" << orig_grace << " + " << my_grace
           << " + " << peer_grace << "), max_failed_since " << max_failed_since
           << dendl;

  // mark down only when the osd has been failed long enough AND enough
  // distinct failure-domain subtrees have reported it.
  if (failed_for >= grace &&
      reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
    dout(1) << " we have enough reporters to mark osd." << target_osd
            << " down" << dendl;
    // new_state is an xor mask; CEPH_OSD_UP on an up osd flips it down
    pending_inc.new_state[target_osd] = CEPH_OSD_UP;

    mon->clog->info() << "osd." << target_osd << " failed ("
                      << osdmap.crush->get_full_location_ordered_string(
                        target_osd)
                      << ") ("
                      << (int)reporters_by_subtree.size()
                      << " reporters from different "
                      << reporter_subtree_level << " after "
                      << failed_for << " >= grace " << grace << ")";
    return true;
  }
  return false;
}
2622
2623 void OSDMonitor::force_failure(int target_osd, int by)
2624 {
2625 // already pending failure?
2626 if (pending_inc.new_state.count(target_osd) &&
2627 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
2628 dout(10) << " already pending failure" << dendl;
2629 return;
2630 }
2631
2632 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
2633 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2634
2635 mon->clog->info() << "osd." << target_osd << " failed ("
2636 << osdmap.crush->get_full_location_ordered_string(target_osd)
2637 << ") (connection refused reported by osd." << by << ")";
2638 return;
2639 }
2640
// Record or cancel a failure report against an osd.  Failure reports are
// aggregated per target in 'failure_info'; once enough distinct reporters
// accumulate (see check_failure()) the target is queued to be marked down.
// Returns true when a proposal should be triggered.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDFailure *m = static_cast<MOSDFailure*>(op->get_req());
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already vetted the target
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // immediate failures (e.g. connection refused) skip the reporter
      // threshold and force the mark-down right away
      mon->clog->debug() << "osd." << m->get_target_osd()
                         << " reported immediately failed by "
                         << m->get_orig_source();
      force_failure(target_osd, reporter);
      mon->no_reply(op);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // add_report() may return a superseded op from the same reporter,
    // which we must release so it does not wait forever for a reply
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
                       << " failure report canceled by "
                       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      // release the op associated with the canceled report, if any
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
    mon->no_reply(op);
  }

  return false;
}
2707
// Sweep 'failure_info' after a map change: for every tracked osd that is now
// down, drop its accumulated failure reports and answer the reporters with
// the latest map so they learn the outcome.  Entries for osds still up are
// kept untouched.
void OSDMonitor::process_failures()
{
  map<int,failure_info_t>::iterator p = failure_info.begin();
  while (p != failure_info.end()) {
    if (osdmap.is_up(p->first)) {
      ++p;
    } else {
      dout(10) << "process_failures osd." << p->first << dendl;
      list<MonOpRequestRef> ls;
      p->second.take_report_messages(ls);
      // post-increment inside erase() keeps the iterator valid across the
      // removal of the current element
      failure_info.erase(p++);

      while (!ls.empty()) {
        MonOpRequestRef o = ls.front();
        if (o) {
          o->mark_event(__func__);
          MOSDFailure *m = o->get_req<MOSDFailure>();
          // reply with maps from the reporter's epoch onward, then release
          // the op (no direct reply message is owed)
          send_latest(o, m->get_epoch());
          mon->no_reply(o);
        }
        ls.pop_front();
      }
    }
  }
}
2733
2734 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
2735 {
2736 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
2737
2738 for (map<int,failure_info_t>::iterator p = failure_info.begin();
2739 p != failure_info.end();
2740 ++p) {
2741 p->second.take_report_messages(ls);
2742 }
2743 failure_info.clear();
2744 }
2745
2746
2747 // boot --
2748
// Validate an osd boot message before it reaches prepare_boot().  Returns
// true when the boot was handled (ignored, duplicate-acked, or answered
// with maps) here; false passes it on to the prepare path.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // the osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      // report every missing feature in one semicolon-joined message
      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
                        << m->get_orig_source_inst()
                        << " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure upgrades stop at nautilus
  if (HAVE_FEATURE(m->osd_features, SERVER_O) &&
      osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
    mon->clog->info() << "disallowing boot of post-nautilus OSD "
                      << m->get_orig_source_inst()
                      << " because require_osd_release < nautilus";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
                      << m->get_orig_source_inst()
                      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?  (same osd id, up, and matching client/cluster addrs)
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a non-zero uuid mismatch means a different osd now claims this id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message from before the osd's most recent up interval:
  // just send it newer maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
2871
// Handle an OSD boot request that preprocess_boot() decided needs a map
// change.  Depending on the current state this either (a) marks a still-up
// previous instance down first and retries the boot, (b) waits on an
// already-pending mark-up, or (c) marks the osd up and records its
// addresses, weight, uuid, metadata, clean interval and laggy statistics
// in pending_inc.  Returns false only when the osd id is out of range.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state XOR any pending state flips
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down.  new_state is an XOR mask, so setting
      // the UP bit here flips the (currently up) osd to down.
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot once the mark-down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from); // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: decay them on a clean (re)boot
    // (boot_epoch == 0), otherwise fold the observed down interval in
    osd_xinfo_t xi = osdmap.get_xinfo(from);
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (osdmap.osd_xinfo[from].old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = osdmap.osd_xinfo[from].old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    pending_inc.new_xinfo[from] = xi;

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3026
3027 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3028 {
3029 op->mark_osdmon_event(__func__);
3030 MOSDBoot *m = static_cast<MOSDBoot*>(op->get_req());
3031 dout(7) << "_booted " << m->get_orig_source_inst()
3032 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3033
3034 if (logit) {
3035 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3036 << " boot";
3037 }
3038
3039 send_latest(op, m->sb.current_epoch+1);
3040 }
3041
3042
3043 // -------------
3044 // full
3045
3046 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3047 {
3048 op->mark_osdmon_event(__func__);
3049 MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
3050 int from = m->get_orig_source().num();
3051 set<string> state;
3052 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3053
3054 // check permissions, ignore if failed
3055 MonSession *session = op->get_session();
3056 if (!session)
3057 goto ignore;
3058 if (!session->is_capable("osd", MON_CAP_X)) {
3059 dout(0) << "MOSDFull from entity with insufficient privileges:"
3060 << session->caps << dendl;
3061 goto ignore;
3062 }
3063
3064 // ignore a full message from the osd instance that already went down
3065 if (!osdmap.exists(from)) {
3066 dout(7) << __func__ << " ignoring full message from nonexistent "
3067 << m->get_orig_source_inst() << dendl;
3068 goto ignore;
3069 }
3070 if ((!osdmap.is_up(from) &&
3071 osdmap.get_most_recent_addrs(from).legacy_equals(
3072 m->get_orig_source_addrs())) ||
3073 (osdmap.is_up(from) &&
3074 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3075 dout(7) << __func__ << " ignoring full message from down "
3076 << m->get_orig_source_inst() << dendl;
3077 goto ignore;
3078 }
3079
3080 OSDMap::calc_state_set(osdmap.get_state(from), state);
3081
3082 if ((osdmap.get_state(from) & mask) == m->state) {
3083 dout(7) << __func__ << " state already " << state << " for osd." << from
3084 << " " << m->get_orig_source_inst() << dendl;
3085 _reply_map(op, m->version);
3086 goto ignore;
3087 }
3088
3089 dout(10) << __func__ << " want state " << state << " for osd." << from
3090 << " " << m->get_orig_source_inst() << dendl;
3091 return false;
3092
3093 ignore:
3094 return true;
3095 }
3096
// Commit a full/nearfull/backfillfull state change reported by an OSD.
// pending_inc.new_state holds an XOR mask that is applied to the
// committed state, so we compute the delta between the effective
// (committed + pending) state and what the OSD wants.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const MOSDFull *m = static_cast<MOSDFull*>(op->get_req());
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask; // safety first

  // effective current state = committed state XOR any pending flips
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending fullness flips; we recompute them just below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed and wanted bits = the bits that must flip
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3134
3135 // -------------
3136 // alive
3137
3138 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3139 {
3140 op->mark_osdmon_event(__func__);
3141 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3142 int from = m->get_orig_source().num();
3143
3144 // check permissions, ignore if failed
3145 MonSession *session = op->get_session();
3146 if (!session)
3147 goto ignore;
3148 if (!session->is_capable("osd", MON_CAP_X)) {
3149 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3150 << session->caps << dendl;
3151 goto ignore;
3152 }
3153
3154 if (!osdmap.is_up(from) ||
3155 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3156 dout(7) << "preprocess_alive ignoring alive message from down "
3157 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3158 << dendl;
3159 goto ignore;
3160 }
3161
3162 if (osdmap.get_up_thru(from) >= m->want) {
3163 // yup.
3164 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3165 _reply_map(op, m->version);
3166 return true;
3167 }
3168
3169 dout(10) << "preprocess_alive want up_thru " << m->want
3170 << " from " << m->get_orig_source_inst() << dendl;
3171 return false;
3172
3173 ignore:
3174 return true;
3175 }
3176
3177 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3178 {
3179 op->mark_osdmon_event(__func__);
3180 MOSDAlive *m = static_cast<MOSDAlive*>(op->get_req());
3181 int from = m->get_orig_source().num();
3182
3183 if (0) { // we probably don't care much about these
3184 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3185 }
3186
3187 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3188 << " from " << m->get_orig_source_inst() << dendl;
3189
3190 update_up_thru(from, m->version); // set to the latest map the OSD has
3191 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3192 return true;
3193 }
3194
3195 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3196 {
3197 op->mark_osdmon_event(__func__);
3198 dout(7) << "_reply_map " << e
3199 << " from " << op->get_req()->get_orig_source_inst()
3200 << dendl;
3201 send_latest(op, e);
3202 }
3203
3204 // pg_created
3205 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3206 {
3207 op->mark_osdmon_event(__func__);
3208 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3209 dout(10) << __func__ << " " << *m << dendl;
3210 auto session = op->get_session();
3211 mon->no_reply(op);
3212 if (!session) {
3213 dout(10) << __func__ << ": no monitor session!" << dendl;
3214 return true;
3215 }
3216 if (!session->is_capable("osd", MON_CAP_X)) {
3217 derr << __func__ << " received from entity "
3218 << "with insufficient privileges " << session->caps << dendl;
3219 return true;
3220 }
3221 // always forward the "created!" to the leader
3222 return false;
3223 }
3224
3225 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3226 {
3227 op->mark_osdmon_event(__func__);
3228 auto m = static_cast<MOSDPGCreated*>(op->get_req());
3229 dout(10) << __func__ << " " << *m << dendl;
3230 auto src = m->get_orig_source();
3231 auto from = src.num();
3232 if (!src.is_osd() ||
3233 !mon->osdmon()->osdmap.is_up(from) ||
3234 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3235 m->get_orig_source_addrs())) {
3236 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3237 return false;
3238 }
3239 pending_created_pgs.push_back(m->pgid);
3240 return true;
3241 }
3242
3243 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3244 {
3245 op->mark_osdmon_event(__func__);
3246 auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
3247 dout(10) << __func__ << " " << *m << dendl;
3248 const pg_pool_t *pi;
3249 auto session = op->get_session();
3250 if (!session) {
3251 dout(10) << __func__ << ": no monitor session!" << dendl;
3252 goto ignore;
3253 }
3254 if (!session->is_capable("osd", MON_CAP_X)) {
3255 derr << __func__ << " received from entity "
3256 << "with insufficient privileges " << session->caps << dendl;
3257 goto ignore;
3258 }
3259 pi = osdmap.get_pg_pool(m->pgid.pool());
3260 if (!pi) {
3261 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3262 goto ignore;
3263 }
3264 if (pi->get_pg_num() <= m->pgid.ps()) {
3265 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3266 goto ignore;
3267 }
3268 if (pi->get_pg_num() != m->pgid.ps() + 1) {
3269 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
3270 goto ignore;
3271 }
3272 if (pi->get_pg_num_pending() > m->pgid.ps()) {
3273 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
3274 goto ignore;
3275 }
3276 return false;
3277
3278 ignore:
3279 mon->no_reply(op);
3280 return true;
3281 }
3282
// Apply a pg merge readiness report.  If the pool's pg_num[_pending]
// changed since the osd sent the message, retry after the next proposal;
// otherwise either complete the merge (dec_pg_num) or back it off.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = static_cast<MOSDPGReadyToMerge*>(op->get_req());
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending pool if one exists, else the committed pool
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // optionally (for testing) bounce pg_num back up to cancel the merge
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3340
3341
3342 // -------------
3343 // pg_temp changes
3344
3345 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
3346 {
3347 MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
3348 dout(10) << "preprocess_pgtemp " << *m << dendl;
3349 mempool::osdmap::vector<int> empty;
3350 int from = m->get_orig_source().num();
3351 size_t ignore_cnt = 0;
3352
3353 // check caps
3354 MonSession *session = op->get_session();
3355 if (!session)
3356 goto ignore;
3357 if (!session->is_capable("osd", MON_CAP_X)) {
3358 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
3359 << session->caps << dendl;
3360 goto ignore;
3361 }
3362
3363 if (!osdmap.is_up(from) ||
3364 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3365 dout(7) << "ignoring pgtemp message from down "
3366 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3367 << dendl;
3368 goto ignore;
3369 }
3370
3371 if (m->forced) {
3372 return false;
3373 }
3374
3375 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
3376 dout(20) << " " << p->first
3377 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
3378 << " -> " << p->second << dendl;
3379
3380 // does the pool exist?
3381 if (!osdmap.have_pg_pool(p->first.pool())) {
3382 /*
3383 * 1. If the osdmap does not have the pool, it means the pool has been
3384 * removed in-between the osd sending this message and us handling it.
3385 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
3386 * not exist in the pending either, as the osds would not send a
3387 * message about a pool they know nothing about (yet).
3388 * 3. However, if the pool does exist in the pending, then it must be a
3389 * new pool, and not relevant to this message (see 1).
3390 */
3391 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
3392 << ": pool has been removed" << dendl;
3393 ignore_cnt++;
3394 continue;
3395 }
3396
3397 int acting_primary = -1;
3398 osdmap.pg_to_up_acting_osds(
3399 p->first, nullptr, nullptr, nullptr, &acting_primary);
3400 if (acting_primary != from) {
3401 /* If the source isn't the primary based on the current osdmap, we know
3402 * that the interval changed and that we can discard this message.
3403 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
3404 * which of two pg temp mappings on the same pg is more recent.
3405 */
3406 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
3407 << ": primary has changed" << dendl;
3408 ignore_cnt++;
3409 continue;
3410 }
3411
3412 // removal?
3413 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
3414 osdmap.primary_temp->count(p->first)))
3415 return false;
3416 // change?
3417 // NOTE: we assume that this will clear pg_primary, so consider
3418 // an existing pg_primary field to imply a change
3419 if (p->second.size() &&
3420 (osdmap.pg_temp->count(p->first) == 0 ||
3421 osdmap.pg_temp->get(p->first) != p->second ||
3422 osdmap.primary_temp->count(p->first)))
3423 return false;
3424 }
3425
3426 // should we ignore all the pgs?
3427 if (ignore_cnt == m->pg_temp.size())
3428 goto ignore;
3429
3430 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
3431 _reply_map(op, m->map_epoch);
3432 return true;
3433
3434 ignore:
3435 return true;
3436 }
3437
3438 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
3439 {
3440 epoch_t old_up_thru = osdmap.get_up_thru(from);
3441 auto ut = pending_inc.new_up_thru.find(from);
3442 if (ut != pending_inc.new_up_thru.end()) {
3443 old_up_thru = ut->second;
3444 }
3445 if (up_thru > old_up_thru) {
3446 // set up_thru too, so the osd doesn't have to ask again
3447 pending_inc.new_up_thru[from] = up_thru;
3448 }
3449 }
3450
// Record the pg_temp mappings from an MOSDPGTemp message in pending_inc,
// skipping pools that are gone or pending removal, and bump the osd's
// up_thru so it doesn't need a separate alive round-trip.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MOSDPGTemp *m = static_cast<MOSDPGTemp*>(op->get_req());
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
3486
3487
3488 // ---
3489
// Check whether every snap in an MRemoveSnaps message is already
// reflected in the committed osdmap; if so the message can be dropped
// without a proposal.  Returns false when prepare_remove_snaps() must
// update the pending map.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	CEPH_ENTITY_TYPE_MON,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snap newer than snap_seq, or not yet in removed_snaps, still
      // needs to be recorded by prepare_remove_snaps()
      if (*p > pi->get_snap_seq() ||
	  !pi->removed_snaps.contains(*p))
	return false;
    }
  }

 ignore:
  return true;
}
3532
// Fold the reported removed snaps into the pending pool state: for every
// snap not yet recorded, add it to removed_snaps, advance snap_seq if
// needed, bump snap_epoch, and record it in new_removed_snaps.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MRemoveSnaps *m = static_cast<MRemoveSnaps*>(op->get_req());
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (map<int, vector<snapid_t> >::iterator p = m->snaps.begin();
       p != m->snaps.end();
       ++p) {

    if (!osdmap.have_pg_pool(p->first)) {
      dout(10) << " ignoring removed_snaps " << p->second << " on non-existent pool " << p->first << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[p->first];
    for (vector<snapid_t>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      // only act if the snap is recorded neither in the committed pool
      // nor in the already-pending copy of the pool
      if (!pi.removed_snaps.contains(*q) &&
	  (!pending_inc.new_pools.count(p->first) ||
	   !pending_inc.new_pools[p->first].removed_snaps.contains(*q))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(p->first, &pi);
	newpi->removed_snaps.insert(*q);
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	dout(10) << " pool " << p->first << " removed_snaps added " << *q
		 << " (now " << newpi->removed_snaps << ")" << dendl;
	if (*q > newpi->get_snap_seq()) {
	  dout(10) << " pool " << p->first << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << *q << dendl;
	  newpi->set_snap_seq(*q);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	pending_inc.new_removed_snaps[p->first].insert(*q);
      }
    }
  }
  return true;
}
3572
3573 // osd beacon
3574 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
3575 {
3576 op->mark_osdmon_event(__func__);
3577 // check caps
3578 auto session = op->get_session();
3579 mon->no_reply(op);
3580 if (!session) {
3581 dout(10) << __func__ << " no monitor session!" << dendl;
3582 return true;
3583 }
3584 if (!session->is_capable("osd", MON_CAP_X)) {
3585 derr << __func__ << " received from entity "
3586 << "with insufficient privileges " << session->caps << dendl;
3587 return true;
3588 }
3589 // Always forward the beacon to the leader, even if they are the same as
3590 // the old one. The leader will mark as down osds that haven't sent
3591 // beacon for a few minutes.
3592 return false;
3593 }
3594
3595 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
3596 {
3597 op->mark_osdmon_event(__func__);
3598 const auto beacon = static_cast<MOSDBeacon*>(op->get_req());
3599 const auto src = beacon->get_orig_source();
3600 dout(10) << __func__ << " " << *beacon
3601 << " from " << src << dendl;
3602 int from = src.num();
3603
3604 if (!src.is_osd() ||
3605 !osdmap.is_up(from) ||
3606 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
3607 if (src.is_osd() && !osdmap.is_up(from)) {
3608 // share some new maps with this guy in case it may not be
3609 // aware of its own deadness...
3610 send_latest(op, beacon->version+1);
3611 }
3612 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
3613 return false;
3614 }
3615
3616 last_osd_report[from] = ceph_clock_now();
3617 osd_epochs[from] = beacon->version;
3618
3619 for (const auto& pg : beacon->pgs) {
3620 last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
3621 }
3622 return false;
3623 }
3624
3625 // ---------------
3626 // map helpers
3627
3628 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
3629 {
3630 op->mark_osdmon_event(__func__);
3631 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
3632 << " start " << start << dendl;
3633 if (start == 0)
3634 send_full(op);
3635 else
3636 send_incremental(op, start);
3637 }
3638
3639
3640 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
3641 {
3642 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
3643 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
3644 r->oldest_map = get_first_committed();
3645 r->newest_map = osdmap.get_epoch();
3646 return r;
3647 }
3648
// Build an MOSDMap containing incremental maps for epochs [from..to].
// If no incremental is stored for an epoch, the full map for that epoch
// is included instead; one of the two must exist or we abort.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // no incremental stored; fall back to the full map for this epoch
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
	//else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort(); // we should have all maps.
      }
    }
  }
  return m;
}
3682
3683 void OSDMonitor::send_full(MonOpRequestRef op)
3684 {
3685 op->mark_osdmon_event(__func__);
3686 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
3687 mon->send_reply(op, build_latest_full(op->get_session()->con_features));
3688 }
3689
3690 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
3691 {
3692 op->mark_osdmon_event(__func__);
3693
3694 MonSession *s = op->get_session();
3695 ceph_assert(s);
3696
3697 if (s->proxy_con) {
3698 // oh, we can tell the other mon to do it
3699 dout(10) << __func__ << " asking proxying mon to send_incremental from "
3700 << first << dendl;
3701 MRoute *r = new MRoute(s->proxy_tid, NULL);
3702 r->send_osdmap_first = first;
3703 s->proxy_con->send_message(r);
3704 op->mark_event("reply: send routed send_osdmap_first reply");
3705 } else {
3706 // do it ourselves
3707 send_incremental(first, s, false, op);
3708 }
3709 }
3710
// Send incremental maps [first..current] to a session, encoded with the
// peer's features.  If |first| predates our oldest committed map, start
// with a base full map plus the removed-snaps ranges for the gap.  When
// |req| is set we reply through the op tracker and send only a single
// message; |onetime| likewise limits us to one MOSDMap message.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  if (first <= session->osd_epoch) {
    // the session already has maps through osd_epoch; skip those
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range starts before our oldest map: send a base full map
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    // share removed snaps during the gap
    get_removed_snaps_range(first, m->oldest_map, &m->gap_removed_snaps);

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // reply-mode: one message only
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // cap each message at osd_map_message_max epochs
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps. it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
3776
3777 void OSDMonitor::get_removed_snaps_range(
3778 epoch_t start, epoch_t end,
3779 mempool::osdmap::map<int64_t,OSDMap::snap_interval_set_t> *gap_removed_snaps)
3780 {
3781 // we only care about pools that exist now.
3782 for (auto& p : osdmap.get_pools()) {
3783 auto& t = (*gap_removed_snaps)[p.first];
3784 for (epoch_t epoch = start; epoch < end; ++epoch) {
3785 string k = make_snap_epoch_key(p.first, epoch);
3786 bufferlist v;
3787 mon->store->get(OSD_SNAP_PREFIX, k, v);
3788 if (v.length()) {
3789 auto q = v.cbegin();
3790 OSDMap::snap_interval_set_t snaps;
3791 decode(snaps, q);
3792 t.union_of(snaps);
3793 }
3794 }
3795 dout(10) << __func__ << " " << p.first << " " << t << dendl;
3796 }
3797 }
3798
// Fetch incremental map |ver| encoded with the current quorum's features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
3803
// Re-encode an incremental map in place with (a subset of) the given
// feature bits, also re-encoding any embedded full map or crush map.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
3831
3832 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
3833 {
3834 OSDMap m;
3835 auto q = bl.cbegin();
3836 m.decode(q);
3837 // always encode with subset of osdmap's canonical features
3838 uint64_t f = features & m.get_encoding_features();
3839 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
3840 << dendl;
3841 bl.clear();
3842 m.encode(bl, f | CEPH_FEATURE_RESERVED);
3843 }
3844
// Fetch incremental map |ver| encoded for |features|, caching the result
// keyed by the significant feature subset so repeated requests avoid
// re-encoding.  Returns 0 on success or a negative errno from the store.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon->get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add({ver, significant_features}, bl);
  return 0;
}
3866
3867 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
3868 {
3869 bufferlist inc_bl;
3870 int err = get_version(ver, inc_bl);
3871 ceph_assert(err == 0);
3872 ceph_assert(inc_bl.length());
3873
3874 auto p = inc_bl.cbegin();
3875 inc.decode(p);
3876 dout(10) << __func__ << " "
3877 << " epoch " << inc.epoch
3878 << " inc_crc " << inc.inc_crc
3879 << " full_crc " << inc.full_crc
3880 << " encode_features " << inc.encode_features << dendl;
3881 return 0;
3882 }
3883
// Rebuild the full OSDMap for epoch @ver when the stored full map has been
// pruned: find the closest pinned (or cached) full map at or below @ver,
// then replay the intervening incrementals on top of it.  The rebuilt map
// is encoded into @bl.  Returns 0 on success, -ENOENT if no pinned map at
// or below @ver exists.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    // should not happen; dump the pinned set before the assert fires
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map newer than the pinned one: fewer incrementals
  // to replay below.
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    // fall back to the pinned full map in the store; it must exist.
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  // replay incrementals (closest_pinned, ver]; remember the last
  // incremental's encode_features for the final encode below.
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
3983
// Convenience overload: fetch the full map @ver encoded for the current
// monitor quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon->get_quorum_con_features(), bl);
}
3988
3989 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
3990 bufferlist& bl)
3991 {
3992 uint64_t significant_features = OSDMap::get_significant_features(features);
3993 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
3994 return 0;
3995 }
3996 int ret = PaxosService::get_version_full(ver, bl);
3997 if (ret == -ENOENT) {
3998 // build map?
3999 ret = get_full_from_pinned_map(ver, bl);
4000 }
4001 if (ret < 0) {
4002 return ret;
4003 }
4004 // NOTE: this check is imprecise; the OSDMap encoding features may
4005 // be a subset of the latest mon quorum features, but worst case we
4006 // reencode once and then cache the (identical) result under both
4007 // feature masks.
4008 if (significant_features !=
4009 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4010 reencode_full_map(bl, features);
4011 }
4012 full_osd_cache.add({ver, significant_features}, bl);
4013 return 0;
4014 }
4015
4016 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4017 {
4018 dout(10) << "blacklist " << av << " until " << until << dendl;
4019 for (auto a : av.v) {
4020 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4021 a.set_type(entity_addr_t::TYPE_ANY);
4022 } else {
4023 a.set_type(entity_addr_t::TYPE_LEGACY);
4024 }
4025 pending_inc.new_blacklist[a] = until;
4026 }
4027 return pending_inc.epoch;
4028 }
4029
4030 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4031 {
4032 if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
4033 a.set_type(entity_addr_t::TYPE_ANY);
4034 } else {
4035 a.set_type(entity_addr_t::TYPE_LEGACY);
4036 }
4037 dout(10) << "blacklist " << a << " until " << until << dendl;
4038 pending_inc.new_blacklist[a] = until;
4039 return pending_inc.epoch;
4040 }
4041
4042
4043 void OSDMonitor::check_osdmap_subs()
4044 {
4045 dout(10) << __func__ << dendl;
4046 if (!osdmap.get_epoch()) {
4047 return;
4048 }
4049 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4050 if (osdmap_subs == mon->session_map.subs.end()) {
4051 return;
4052 }
4053 auto p = osdmap_subs->second->begin();
4054 while (!p.end()) {
4055 auto sub = *p;
4056 ++p;
4057 check_osdmap_sub(sub);
4058 }
4059 }
4060
4061 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4062 {
4063 dout(10) << __func__ << " " << sub << " next " << sub->next
4064 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4065 if (sub->next <= osdmap.get_epoch()) {
4066 if (sub->next >= 1)
4067 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4068 else
4069 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4070 if (sub->onetime)
4071 mon->session_map.remove_sub(sub);
4072 else
4073 sub->next = osdmap.get_epoch() + 1;
4074 }
4075 }
4076
4077 void OSDMonitor::check_pg_creates_subs()
4078 {
4079 if (!osdmap.get_num_up_osds()) {
4080 return;
4081 }
4082 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4083 mon->with_session_map([this](const MonSessionMap& session_map) {
4084 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4085 if (pg_creates_subs == session_map.subs.end()) {
4086 return;
4087 }
4088 for (auto sub : *pg_creates_subs->second) {
4089 check_pg_creates_sub(sub);
4090 }
4091 });
4092 }
4093
4094 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4095 {
4096 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4097 ceph_assert(sub->type == "osd_pg_creates");
4098 // only send these if the OSD is up. we will check_subs() when they do
4099 // come up so they will get the creates then.
4100 if (sub->session->name.is_osd() &&
4101 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4102 sub->next = send_pg_creates(sub->session->name.num(),
4103 sub->session->con.get(),
4104 sub->next);
4105 }
4106 }
4107
4108 void OSDMonitor::do_application_enable(int64_t pool_id,
4109 const std::string &app_name,
4110 const std::string &app_key,
4111 const std::string &app_value)
4112 {
4113 ceph_assert(paxos->is_plugged() && is_writeable());
4114
4115 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4116 << dendl;
4117
4118 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
4119
4120 auto pp = osdmap.get_pg_pool(pool_id);
4121 ceph_assert(pp != nullptr);
4122
4123 pg_pool_t p = *pp;
4124 if (pending_inc.new_pools.count(pool_id)) {
4125 p = pending_inc.new_pools[pool_id];
4126 }
4127
4128 if (app_key.empty()) {
4129 p.application_metadata.insert({app_name, {}});
4130 } else {
4131 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4132 }
4133 p.last_change = pending_inc.epoch;
4134 pending_inc.new_pools[pool_id] = p;
4135 }
4136
4137 unsigned OSDMonitor::scan_for_creating_pgs(
4138 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4139 const mempool::osdmap::set<int64_t>& removed_pools,
4140 utime_t modified,
4141 creating_pgs_t* creating_pgs) const
4142 {
4143 unsigned queued = 0;
4144 for (auto& p : pools) {
4145 int64_t poolid = p.first;
4146 if (creating_pgs->created_pools.count(poolid)) {
4147 dout(10) << __func__ << " already created " << poolid << dendl;
4148 continue;
4149 }
4150 const pg_pool_t& pool = p.second;
4151 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4152 pool.get_type(), pool.get_size());
4153 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4154 continue;
4155
4156 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4157 const auto created = pool.get_last_change();
4158 if (last_scan_epoch && created <= last_scan_epoch) {
4159 dout(10) << __func__ << " no change in pool " << poolid
4160 << " " << pool << dendl;
4161 continue;
4162 }
4163 if (removed_pools.count(poolid)) {
4164 dout(10) << __func__ << " pool is being removed: " << poolid
4165 << " " << pool << dendl;
4166 continue;
4167 }
4168 dout(10) << __func__ << " queueing pool create for " << poolid
4169 << " " << pool << dendl;
4170 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4171 created, modified);
4172 queued++;
4173 }
4174 return queued;
4175 }
4176
// Recompute creating_pgs_by_osd_epoch from creating_pgs and the current
// pg-to-osd mapping: for each pg still being created, determine its acting
// primary and the epoch to advertise the create at, keeping the old epoch
// when the target osd has not changed so subscribers don't get re-sent
// creates they already have.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      // e.g. the pool was deleted after the create was queued
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    auto mapped = pg.second.first;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we already advertised
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4224
// Send pg-create messages for osd.@osd over @con, covering epochs >= @next.
// Uses MOSDPGCreate2 for nautilus+ clusters and the legacy MOSDPGCreate
// otherwise.  Returns the epoch the subscription is now current through
// (last sent epoch + 1), or @next unchanged if nothing was sent.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // legacy message format for clusters not yet requiring nautilus
  bool old = osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.first, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.second);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, create->second);
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.first << dendl;
    }
  }
  // exactly one of m/oldm is populated when there was anything to send
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4289
4290 // TICK
4291
4292
// Periodic maintenance: refresh the osdmap manifest, and (leader only)
// mark unresponsive osds down, auto-out osds that have been down past the
// grace period, expire blacklist entries, prune purged snaps, update pool
// status, and propose a new map epoch if anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending_inc, which only the leader may do.
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if they stopped sending beacons
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long this osd has been down
      ++i;  // advance now; we may erase `o` below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
			    << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep it in down_pending_out
      }

      // osd was marked out (or is no longer down+in): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
4435
// Check every existing, up osd against @last_osd_report and queue a
// mark-down in pending_inc for any that have not reported within
// mon_osd_report_timeout seconds.  Returns true if any osd was newly
// marked down (caller should propose).  @last_osd_report is also pruned
// of osds that no longer exist.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int,utime_t> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon->get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i] = now;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second;
      if (diff > timeo) {
	mon->clog->info() << "osd." << i << " marked down after no beacon for "
			  << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to be XOR-applied when the
	// incremental is consumed, so setting CEPH_OSD_UP here flips the
	// up bit and marks the osd *down* -- confirm against
	// OSDMap::Incremental apply semantics.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
4474
4475 static void dump_cpu_list(Formatter *f, const char *name,
4476 const string& strlist)
4477 {
4478 cpu_set_t cpu_set;
4479 size_t cpu_set_size;
4480 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
4481 return;
4482 }
4483 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
4484 f->open_array_section(name);
4485 for (auto cpu : cpus) {
4486 f->dump_int("cpu", cpu);
4487 }
4488 f->close_section();
4489 }
4490
4491 void OSDMonitor::dump_info(Formatter *f)
4492 {
4493 f->open_object_section("osdmap");
4494 osdmap.dump(f);
4495 f->close_section();
4496
4497 f->open_array_section("osd_metadata");
4498 for (int i=0; i<osdmap.get_max_osd(); ++i) {
4499 if (osdmap.exists(i)) {
4500 f->open_object_section("osd");
4501 f->dump_unsigned("id", i);
4502 dump_osd_metadata(i, f, NULL);
4503 f->close_section();
4504 }
4505 }
4506 f->close_section();
4507
4508 f->dump_unsigned("osdmap_first_committed", get_first_committed());
4509 f->dump_unsigned("osdmap_last_committed", get_last_committed());
4510
4511 f->open_object_section("crushmap");
4512 osdmap.crush->dump(f);
4513 f->close_section();
4514
4515 if (has_osdmap_manifest) {
4516 f->open_object_section("osdmap_manifest");
4517 osdmap_manifest.dump(f);
4518 f->close_section();
4519 }
4520 }
4521
namespace {
  // All properties queryable via "osd pool get".  Enumerator order is
  // load-bearing for the std::set-based choice filtering below; do not
  // reorder.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of @first that are not in @second (set minus).
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> result;
      for (const auto& choice : first) {
	if (second.count(choice) == 0) {
	  // insertion hint: choices arrive in ascending order
	  result.insert(result.end(), choice);
	}
      }
      return result;
    }
}
4555
4556
4557 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
4558 {
4559 op->mark_osdmon_event(__func__);
4560 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
4561 int r = 0;
4562 bufferlist rdata;
4563 stringstream ss, ds;
4564
4565 cmdmap_t cmdmap;
4566 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
4567 string rs = ss.str();
4568 mon->reply_command(op, -EINVAL, rs, get_last_committed());
4569 return true;
4570 }
4571
4572 MonSession *session = op->get_session();
4573 if (!session) {
4574 derr << __func__ << " no session" << dendl;
4575 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
4576 return true;
4577 }
4578
4579 string prefix;
4580 cmd_getval(cct, cmdmap, "prefix", prefix);
4581
4582 string format;
4583 cmd_getval(cct, cmdmap, "format", format, string("plain"));
4584 boost::scoped_ptr<Formatter> f(Formatter::create(format));
4585
4586 if (prefix == "osd stat") {
4587 osdmap.print_summary(f.get(), ds, "", true);
4588 if (f)
4589 f->flush(rdata);
4590 else
4591 rdata.append(ds);
4592 }
4593 else if (prefix == "osd dump" ||
4594 prefix == "osd tree" ||
4595 prefix == "osd tree-from" ||
4596 prefix == "osd ls" ||
4597 prefix == "osd getmap" ||
4598 prefix == "osd getcrushmap" ||
4599 prefix == "osd ls-tree") {
4600 string val;
4601
4602 epoch_t epoch = 0;
4603 int64_t epochnum;
4604 cmd_getval(cct, cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
4605 epoch = epochnum;
4606
4607 bufferlist osdmap_bl;
4608 int err = get_version_full(epoch, osdmap_bl);
4609 if (err == -ENOENT) {
4610 r = -ENOENT;
4611 ss << "there is no map for epoch " << epoch;
4612 goto reply;
4613 }
4614 ceph_assert(err == 0);
4615 ceph_assert(osdmap_bl.length());
4616
4617 OSDMap *p;
4618 if (epoch == osdmap.get_epoch()) {
4619 p = &osdmap;
4620 } else {
4621 p = new OSDMap;
4622 p->decode(osdmap_bl);
4623 }
4624
4625 auto sg = make_scope_guard([&] {
4626 if (p != &osdmap) {
4627 delete p;
4628 }
4629 });
4630
4631 if (prefix == "osd dump") {
4632 stringstream ds;
4633 if (f) {
4634 f->open_object_section("osdmap");
4635 p->dump(f.get());
4636 f->close_section();
4637 f->flush(ds);
4638 } else {
4639 p->print(ds);
4640 }
4641 rdata.append(ds);
4642 if (!f)
4643 ds << " ";
4644 } else if (prefix == "osd ls") {
4645 if (f) {
4646 f->open_array_section("osds");
4647 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4648 if (osdmap.exists(i)) {
4649 f->dump_int("osd", i);
4650 }
4651 }
4652 f->close_section();
4653 f->flush(ds);
4654 } else {
4655 bool first = true;
4656 for (int i = 0; i < osdmap.get_max_osd(); i++) {
4657 if (osdmap.exists(i)) {
4658 if (!first)
4659 ds << "\n";
4660 first = false;
4661 ds << i;
4662 }
4663 }
4664 }
4665 rdata.append(ds);
4666 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
4667 string bucket;
4668 if (prefix == "osd tree-from") {
4669 cmd_getval(cct, cmdmap, "bucket", bucket);
4670 if (!osdmap.crush->name_exists(bucket)) {
4671 ss << "bucket '" << bucket << "' does not exist";
4672 r = -ENOENT;
4673 goto reply;
4674 }
4675 int id = osdmap.crush->get_item_id(bucket);
4676 if (id >= 0) {
4677 ss << "\"" << bucket << "\" is not a bucket";
4678 r = -EINVAL;
4679 goto reply;
4680 }
4681 }
4682
4683 vector<string> states;
4684 cmd_getval(cct, cmdmap, "states", states);
4685 unsigned filter = 0;
4686 for (auto& s : states) {
4687 if (s == "up") {
4688 filter |= OSDMap::DUMP_UP;
4689 } else if (s == "down") {
4690 filter |= OSDMap::DUMP_DOWN;
4691 } else if (s == "in") {
4692 filter |= OSDMap::DUMP_IN;
4693 } else if (s == "out") {
4694 filter |= OSDMap::DUMP_OUT;
4695 } else if (s == "destroyed") {
4696 filter |= OSDMap::DUMP_DESTROYED;
4697 } else {
4698 ss << "unrecognized state '" << s << "'";
4699 r = -EINVAL;
4700 goto reply;
4701 }
4702 }
4703 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
4704 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
4705 ss << "cannot specify both 'in' and 'out'";
4706 r = -EINVAL;
4707 goto reply;
4708 }
4709 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
4710 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
4711 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
4712 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
4713 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
4714 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
4715 ss << "can specify only one of 'up', 'down' and 'destroyed'";
4716 r = -EINVAL;
4717 goto reply;
4718 }
4719 if (f) {
4720 f->open_object_section("tree");
4721 p->print_tree(f.get(), NULL, filter, bucket);
4722 f->close_section();
4723 f->flush(ds);
4724 } else {
4725 p->print_tree(NULL, &ds, filter, bucket);
4726 }
4727 rdata.append(ds);
4728 } else if (prefix == "osd getmap") {
4729 rdata.append(osdmap_bl);
4730 ss << "got osdmap epoch " << p->get_epoch();
4731 } else if (prefix == "osd getcrushmap") {
4732 p->crush->encode(rdata, mon->get_quorum_con_features());
4733 ss << p->get_crush_version();
4734 } else if (prefix == "osd ls-tree") {
4735 string bucket_name;
4736 cmd_getval(cct, cmdmap, "name", bucket_name);
4737 set<int> osds;
4738 r = p->get_osds_by_bucket_name(bucket_name, &osds);
4739 if (r == -ENOENT) {
4740 ss << "\"" << bucket_name << "\" does not exist";
4741 goto reply;
4742 } else if (r < 0) {
4743 ss << "can not parse bucket name:\"" << bucket_name << "\"";
4744 goto reply;
4745 }
4746
4747 if (f) {
4748 f->open_array_section("osds");
4749 for (auto &i : osds) {
4750 if (osdmap.exists(i)) {
4751 f->dump_int("osd", i);
4752 }
4753 }
4754 f->close_section();
4755 f->flush(ds);
4756 } else {
4757 bool first = true;
4758 for (auto &i : osds) {
4759 if (osdmap.exists(i)) {
4760 if (!first)
4761 ds << "\n";
4762 first = false;
4763 ds << i;
4764 }
4765 }
4766 }
4767
4768 rdata.append(ds);
4769 }
4770 } else if (prefix == "osd getmaxosd") {
4771 if (f) {
4772 f->open_object_section("getmaxosd");
4773 f->dump_unsigned("epoch", osdmap.get_epoch());
4774 f->dump_int("max_osd", osdmap.get_max_osd());
4775 f->close_section();
4776 f->flush(rdata);
4777 } else {
4778 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
4779 rdata.append(ds);
4780 }
4781 } else if (prefix == "osd utilization") {
4782 string out;
4783 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
4784 if (f)
4785 f->flush(rdata);
4786 else
4787 rdata.append(out);
4788 r = 0;
4789 goto reply;
4790 } else if (prefix == "osd find") {
4791 int64_t osd;
4792 if (!cmd_getval(cct, cmdmap, "id", osd)) {
4793 ss << "unable to parse osd id value '"
4794 << cmd_vartype_stringify(cmdmap["id"]) << "'";
4795 r = -EINVAL;
4796 goto reply;
4797 }
4798 if (!osdmap.exists(osd)) {
4799 ss << "osd." << osd << " does not exist";
4800 r = -ENOENT;
4801 goto reply;
4802 }
4803 string format;
4804 cmd_getval(cct, cmdmap, "format", format);
4805 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
4806 f->open_object_section("osd_location");
4807 f->dump_int("osd", osd);
4808 f->dump_object("addrs", osdmap.get_addrs(osd));
4809 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
4810
4811 // try to identify host, pod/container name, etc.
4812 map<string,string> m;
4813 load_metadata(osd, m, nullptr);
4814 if (auto p = m.find("hostname"); p != m.end()) {
4815 f->dump_string("host", p->second);
4816 }
4817 for (auto& k : {
4818 "pod_name", "pod_namespace", // set by rook
4819 "container_name" // set by ceph-ansible
4820 }) {
4821 if (auto p = m.find(k); p != m.end()) {
4822 f->dump_string(k, p->second);
4823 }
4824 }
4825
4826 // crush is helpful too
4827 f->open_object_section("crush_location");
4828 map<string,string> loc = osdmap.crush->get_full_location(osd);
4829 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
4830 f->dump_string(p->first.c_str(), p->second);
4831 f->close_section();
4832 f->close_section();
4833 f->flush(rdata);
  } else if (prefix == "osd metadata") {
    // Dump daemon-reported metadata for one osd (if "id" was given) or for
    // every existing osd.  Always formatted (defaults to json-pretty).
    int64_t osd = -1;
    if (cmd_vartype_stringify(cmdmap["id"]).size() &&
	!cmd_getval(cct, cmdmap, "id", osd)) {
      // "id" was present but did not parse as an integer.
      ss << "unable to parse osd id value '"
	 << cmd_vartype_stringify(cmdmap["id"]) << "'";
      r = -EINVAL;
      goto reply;
    }
    if (osd >= 0 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      r = -ENOENT;
      goto reply;
    }
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    // Note: this shadows the function-level formatter with a fresh one that
    // always exists (json-pretty fallback), since this command is always
    // machine-readable output.
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    if (osd >= 0) {
      // Single-osd form: errors from dump_osd_metadata are fatal here.
      f->open_object_section("osd_metadata");
      f->dump_unsigned("id", osd);
      r = dump_osd_metadata(osd, f.get(), &ss);
      if (r < 0)
	goto reply;
      f->close_section();
    } else {
      // All-osds form: tolerate missing/invalid metadata for individual
      // daemons so one bad osd does not hide everyone else's data.
      r = 0;
      f->open_array_section("osd_metadata");
      for (int i=0; i<osdmap.get_max_osd(); ++i) {
	if (osdmap.exists(i)) {
	  f->open_object_section("osd");
	  f->dump_unsigned("id", i);
	  r = dump_osd_metadata(i, f.get(), NULL);
	  if (r == -EINVAL || r == -ENOENT) {
	    // Drop error, continue to get other daemons' metadata
	    dout(4) << "No metadata for osd." << i << dendl;
	    r = 0;
	  } else if (r < 0) {
	    // Unexpected error
	    goto reply;
	  }
	  f->close_section();
	}
      }
      f->close_section();
    }
    f->flush(rdata);
  } else if (prefix == "osd versions") {
    // Histogram of ceph_version strings across all osds.
    if (!f)
      f.reset(Formatter::create("json-pretty"));
    count_metadata("ceph_version", f.get());
    f->flush(rdata);
    r = 0;
  } else if (prefix == "osd count-metadata") {
    // Histogram of an arbitrary metadata property across all osds.
    if (!f)
      f.reset(Formatter::create("json-pretty"));
    string field;
    cmd_getval(cct, cmdmap, "property", field);
    count_metadata(field, f.get());
    f->flush(rdata);
    r = 0;
  } else if (prefix == "osd numa-status") {
    // Report NUMA affinity info (network/objectstore/overall node, cpu list)
    // for each osd, either as formatted output or as a plain-text table.
    TextTable tbl;
    if (f) {
      f->open_array_section("osds");
    } else {
      tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
      tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
      tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
      tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
      tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
      tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
    }
    for (int i=0; i<osdmap.get_max_osd(); ++i) {
      if (osdmap.exists(i)) {
	map<string,string> m;
	ostringstream err;
	if (load_metadata(i, m, &err) < 0) {
	  // No stored metadata for this osd; skip it silently.
	  continue;
	}
	string host;
	auto p = m.find("hostname");
	if (p != m.end()) {
	  host = p->second;
	}
	if (f) {
	  f->open_object_section("osd");
	  f->dump_int("osd", i);
	  f->dump_string("host", host);
	  // Single-node values are dumped as ints when present.
	  for (auto n : { "network_numa_node", "objectstore_numa_node",
		"numa_node" }) {
	    p = m.find(n);
	    if (p != m.end()) {
	      f->dump_int(n, atoi(p->second.c_str()));
	    }
	  }
	  // Multi-node values are comma-separated lists in the metadata.
	  for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
	    p = m.find(n);
	    if (p != m.end()) {
	      list<string> ls = get_str_list(p->second, ",");
	      f->open_array_section(n);
	      for (auto node : ls) {
		f->dump_int("node", atoi(node.c_str()));
	      }
	      f->close_section();
	    }
	  }
	  for (auto n : { "numa_node_cpus" }) {
	    p = m.find(n);
	    if (p != m.end()) {
	      dump_cpu_list(f.get(), n, p->second);
	    }
	  }
	  f->close_section();
	} else {
	  // Plain-text table: missing fields render as "-".
	  tbl << i;
	  tbl << host;
	  p = m.find("network_numa_nodes");
	  if (p != m.end()) {
	    tbl << p->second;
	  } else {
	    tbl << "-";
	  }
	  p = m.find("objectstore_numa_nodes");
	  if (p != m.end()) {
	    tbl << p->second;
	  } else {
	    tbl << "-";
	  }
	  // AFFINITY and CPUS are only shown when both are known.
	  p = m.find("numa_node");
	  auto q = m.find("numa_node_cpus");
	  if (p != m.end() && q != m.end()) {
	    tbl << p->second;
	    tbl << q->second;
	  } else {
	    tbl << "-";
	    tbl << "-";
	  }
	  tbl << TextTable::endrow;
	}
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    } else {
      rdata.append(stringify(tbl));
    }
  } else if (prefix == "osd map") {
    // Map a (pool, object[, namespace]) triple to its PG and the up/acting
    // osd sets (with primaries) in the current osdmap.
    string poolstr, objstr, namespacestr;
    cmd_getval(cct, cmdmap, "pool", poolstr);
    cmd_getval(cct, cmdmap, "object", objstr);
    cmd_getval(cct, cmdmap, "nspace", namespacestr);

    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "pool " << poolstr << " does not exist";
      r = -ENOENT;
      goto reply;
    }
    object_locator_t oloc(pool, namespacestr);
    object_t oid(objstr);
    // raw pg (hash-derived) vs the actual pg after pg_num masking.
    pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
    pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
    vector<int> up, acting;
    int up_p, acting_p;
    osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

    // Display name includes the namespace prefix when one was given.
    string fullobjname;
    if (!namespacestr.empty())
      fullobjname = namespacestr + string("/") + oid.name;
    else
      fullobjname = oid.name;
    if (f) {
      f->open_object_section("osd_map");
      f->dump_unsigned("epoch", osdmap.get_epoch());
      f->dump_string("pool", poolstr);
      f->dump_int("pool_id", pool);
      f->dump_stream("objname") << fullobjname;
      f->dump_stream("raw_pgid") << pgid;
      f->dump_stream("pgid") << mpgid;
      f->open_array_section("up");
      for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
	f->dump_int("osd", *p);
      f->close_section();
      f->dump_int("up_primary", up_p);
      f->open_array_section("acting");
      for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
	f->dump_int("osd", *p);
      f->close_section();
      f->dump_int("acting_primary", acting_p);
      f->close_section(); // osd_map
      f->flush(rdata);
    } else {
      ds << "osdmap e" << osdmap.get_epoch()
	 << " pool '" << poolstr << "' (" << pool << ")"
	 << " object '" << fullobjname << "' ->"
	 << " pg " << pgid << " (" << mpgid << ")"
	 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
	 << pg_vector_string(acting) << ", p" << acting_p << ")";
      rdata.append(ds);
    }

  } else if (prefix == "pg map") {
    // Map an explicit pgid to its up/acting osd sets.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      r = -EINVAL;
      goto reply;
    }
    vector<int> up, acting;
    if (!osdmap.have_pg_pool(pgid.pool())) {
      ss << "pg '" << pgidstr << "' does not exist";
      r = -ENOENT;
      goto reply;
    }
    pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
    osdmap.pg_to_up_acting_osds(pgid, up, acting);
    if (f) {
      f->open_object_section("pg_map");
      f->dump_unsigned("epoch", osdmap.get_epoch());
      f->dump_stream("raw_pgid") << pgid;
      f->dump_stream("pgid") << mpgid;
      f->open_array_section("up");
      for (auto osd : up) {
	f->dump_int("up_osd", osd);
      }
      f->close_section();
      f->open_array_section("acting");
      for (auto osd : acting) {
	f->dump_int("acting_osd", osd);
      }
      f->close_section();
      f->close_section();
      f->flush(rdata);
    } else {
      ds << "osdmap e" << osdmap.get_epoch()
	 << " pg " << pgid << " (" << mpgid << ")"
	 << " -> up " << up << " acting " << acting;
      rdata.append(ds);
    }
    goto reply;

  } else if (prefix == "osd lspools") {
    // List pool ids and names; plain output separates entries with newlines
    // (no trailing newline after the last one).
    if (f)
      f->open_array_section("pools");
    for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
	 p != osdmap.pools.end();
	 ++p) {
      if (f) {
	f->open_object_section("pool");
	f->dump_int("poolnum", p->first);
	f->dump_string("poolname", osdmap.pool_name[p->first]);
	f->close_section();
      } else {
	ds << p->first << ' ' << osdmap.pool_name[p->first];
	if (next(p) != osdmap.pools.end()) {
	  ds << '\n';
	}
      }
    }
    if (f) {
      f->close_section();
      f->flush(ds);
    }
    rdata.append(ds);
  } else if (prefix == "osd blacklist ls") {
    // List all blacklisted client addresses and their expiry times.
    if (f)
      f->open_array_section("blacklist");

    for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
	 p != osdmap.blacklist.end();
	 ++p) {
      if (f) {
	f->open_object_section("entry");
	f->dump_string("addr", p->first.get_legacy_str());
	f->dump_stream("until") << p->second;
	f->close_section();
      } else {
	// NB: this local ss intentionally shadows the outer status stream;
	// it is only used to render one "addr until" line.
	stringstream ss;
	string s;
	ss << p->first << " " << p->second;
	getline(ss, s);
	s += "\n";
	rdata.append(s);
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    }
    // Summary count goes to the (outer) status stream, not rdata.
    ss << "listed " << osdmap.blacklist.size() << " entries";

  } else if (prefix == "osd pool ls") {
    // List pool names; with "detail", include full pool settings.
    string detail;
    cmd_getval(cct, cmdmap, "detail", detail);
    if (!f && detail == "detail") {
      // Plain-text detailed listing delegates to OSDMap::print_pools.
      ostringstream ss;
      osdmap.print_pools(ss);
      rdata.append(ss.str());
    } else {
      if (f)
	f->open_array_section("pools");
      for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
	   it != osdmap.get_pools().end();
	   ++it) {
	if (f) {
	  if (detail == "detail") {
	    f->open_object_section("pool");
	    f->dump_string("pool_name", osdmap.get_pool_name(it->first));
	    it->second.dump(f.get());
	    f->close_section();
	  } else {
	    f->dump_string("pool_name", osdmap.get_pool_name(it->first));
	  }
	} else {
	  rdata.append(osdmap.get_pool_name(it->first) + "\n");
	}
      }
      if (f) {
	f->close_section();
	f->flush(rdata);
      }
    }

  } else if (prefix == "osd crush get-tunable") {
    // Read a single crush tunable; only straw_calc_version is exposed here.
    string tunable;
    cmd_getval(cct, cmdmap, "tunable", tunable);
    ostringstream rss;
    if (f)
      f->open_object_section("tunable");
    if (tunable == "straw_calc_version") {
      if (f)
	f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
      else
	rss << osdmap.crush->get_straw_calc_version() << "\n";
    } else {
      // Unknown tunable name.  Note: if f was set, the "tunable" section
      // opened above is abandoned without being closed before goto reply.
      r = -EINVAL;
      goto reply;
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    } else {
      rdata.append(rss.str());
    }
    r = 0;

  } else if (prefix == "osd pool get") {
    // Report one pool property ("var") or every applicable property
    // (var == "all") for the named pool.  Tier-only and erasure-only
    // properties are filtered out / rejected based on the pool's type.
    string poolstr;
    cmd_getval(cct, cmdmap, "pool", poolstr);
    int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
    if (pool < 0) {
      ss << "unrecognized pool '" << poolstr << "'";
      r = -ENOENT;
      goto reply;
    }

    const pg_pool_t *p = osdmap.get_pg_pool(pool);
    string var;
    cmd_getval(cct, cmdmap, "var", var);

    // Mapping from user-facing property name to internal choice enum.
    typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
    const choices_map_t ALL_CHOICES = {
      {"size", SIZE},
      {"min_size", MIN_SIZE},
      {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
      {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
      {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
      {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
      {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
      {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
      {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
      {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
      {"use_gmt_hitset", USE_GMT_HITSET},
      {"target_max_objects", TARGET_MAX_OBJECTS},
      {"target_max_bytes", TARGET_MAX_BYTES},
      {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
      {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
      {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
      {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
      {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
      {"erasure_code_profile", ERASURE_CODE_PROFILE},
      {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
      {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
      {"fast_read", FAST_READ},
      {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
      {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
      {"scrub_min_interval", SCRUB_MIN_INTERVAL},
      {"scrub_max_interval", SCRUB_MAX_INTERVAL},
      {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
      {"recovery_priority", RECOVERY_PRIORITY},
      {"recovery_op_priority", RECOVERY_OP_PRIORITY},
      {"scrub_priority", SCRUB_PRIORITY},
      {"compression_mode", COMPRESSION_MODE},
      {"compression_algorithm", COMPRESSION_ALGORITHM},
      {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
      {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
      {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
      {"csum_type", CSUM_TYPE},
      {"csum_max_block", CSUM_MAX_BLOCK},
      {"csum_min_block", CSUM_MIN_BLOCK},
      {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
      {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
      {"pg_num_min", PG_NUM_MIN},
      {"target_size_bytes", TARGET_SIZE_BYTES},
      {"target_size_ratio", TARGET_SIZE_RATIO},
      {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
    };

    typedef std::set<osd_pool_get_choices> choices_set_t;

    // Properties that only make sense on cache-tier pools.
    const choices_set_t ONLY_TIER_CHOICES = {
      HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
      TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
      CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
      CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
      MIN_READ_RECENCY_FOR_PROMOTE,
      MIN_WRITE_RECENCY_FOR_PROMOTE,
      HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
    };
    // Properties that only make sense on erasure-coded pools.
    const choices_set_t ONLY_ERASURE_CHOICES = {
      EC_OVERWRITES, ERASURE_CODE_PROFILE
    };

    choices_set_t selected_choices;
    if (var == "all") {
      for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
	  it != ALL_CHOICES.end(); ++it) {
	selected_choices.insert(it->second);
      }

      if(!p->is_tier()) {
	selected_choices = subtract_second_from_first(selected_choices,
						      ONLY_TIER_CHOICES);
      }

      if(!p->is_erasure()) {
	selected_choices = subtract_second_from_first(selected_choices,
						      ONLY_ERASURE_CHOICES);
      }
    } else /* var != "all" */ {
      choices_map_t::const_iterator found = ALL_CHOICES.find(var);
      // NOTE(review): 'found' is dereferenced without an end() check.
      // Presumably the command schema restricts "var" to the known names so
      // find() cannot fail here — confirm against the MonCommands table.
      osd_pool_get_choices selected = found->second;

      if (!p->is_tier() &&
	  ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
	ss << "pool '" << poolstr
	   << "' is not a tier pool: variable not applicable";
	r = -EACCES;
	goto reply;
      }

      if (!p->is_erasure() &&
	  ONLY_ERASURE_CHOICES.find(selected)
	  != ONLY_ERASURE_CHOICES.end()) {
	ss << "pool '" << poolstr
	   << "' is not a erasure pool: variable not applicable";
	r = -EACCES;
	goto reply;
      }

      // Explicitly-requested pool_opts that were never set are an error
      // (in the "all" case they are simply skipped by opts.is_set below).
      if (pool_opts_t::is_opt_name(var) &&
	  !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
	ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
	r = -ENOENT;
	goto reply;
      }

      selected_choices.insert(selected);
    }

    if (f) {
      // Formatted output: one section with a key per selected property.
      f->open_object_section("pool");
      f->dump_string("pool", poolstr);
      f->dump_int("pool_id", pool);
      for(choices_set_t::const_iterator it = selected_choices.begin();
	  it != selected_choices.end(); ++it) {
	// Reverse-lookup the user-facing name for this choice.
	choices_map_t::const_iterator i;
	for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
	  if (i->second == *it) {
	    break;
	  }
	}
	ceph_assert(i != ALL_CHOICES.end());
	switch(*it) {
	  case PG_NUM:
	    f->dump_int("pg_num", p->get_pg_num());
	    break;
	  case PGP_NUM:
	    f->dump_int("pgp_num", p->get_pgp_num());
	    break;
	  case SIZE:
	    f->dump_int("size", p->get_size());
	    break;
	  case MIN_SIZE:
	    f->dump_int("min_size", p->get_min_size());
	    break;
	  case CRUSH_RULE:
	    // Prefer the rule's name; fall back to the numeric id if the
	    // rule no longer exists in the crush map.
	    if (osdmap.crush->rule_exists(p->get_crush_rule())) {
	      f->dump_string("crush_rule", osdmap.crush->get_rule_name(
			       p->get_crush_rule()));
	    } else {
	      f->dump_string("crush_rule", stringify(p->get_crush_rule()));
	    }
	    break;
	  case EC_OVERWRITES:
	    f->dump_bool("allow_ec_overwrites",
			 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
	    break;
	  case PG_AUTOSCALE_MODE:
	    f->dump_string("pg_autoscale_mode",
			   pg_pool_t::get_pg_autoscale_mode_name(
			     p->pg_autoscale_mode));
	    break;
	  case HASHPSPOOL:
	  case NODELETE:
	  case NOPGCHANGE:
	  case NOSIZECHANGE:
	  case WRITE_FADVISE_DONTNEED:
	  case NOSCRUB:
	  case NODEEP_SCRUB:
	    // All simple boolean pool flags share one lookup by name.
	    f->dump_bool(i->first.c_str(),
			 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
	    break;
	  case HIT_SET_PERIOD:
	    f->dump_int("hit_set_period", p->hit_set_period);
	    break;
	  case HIT_SET_COUNT:
	    f->dump_int("hit_set_count", p->hit_set_count);
	    break;
	  case HIT_SET_TYPE:
	    f->dump_string("hit_set_type",
			   HitSet::get_type_name(p->hit_set_params.get_type()));
	    break;
	  case HIT_SET_FPP:
	    {
	      // Only meaningful for bloom hit sets; for an explicit request
	      // on another type this is an error, for "all" it is skipped.
	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
		BloomHitSet::Params *bloomp =
		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
		f->dump_float("hit_set_fpp", bloomp->get_fpp());
	      } else if(var != "all") {
		f->close_section();
		ss << "hit set is not of type Bloom; " <<
		  "invalid to get a false positive rate!";
		r = -EINVAL;
		goto reply;
	      }
	    }
	    break;
	  case USE_GMT_HITSET:
	    f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
	    break;
	  case TARGET_MAX_OBJECTS:
	    f->dump_unsigned("target_max_objects", p->target_max_objects);
	    break;
	  case TARGET_MAX_BYTES:
	    f->dump_unsigned("target_max_bytes", p->target_max_bytes);
	    break;
	  case CACHE_TARGET_DIRTY_RATIO:
	    // Ratios are stored as micro-units; dump both raw and float form.
	    f->dump_unsigned("cache_target_dirty_ratio_micro",
			     p->cache_target_dirty_ratio_micro);
	    f->dump_float("cache_target_dirty_ratio",
			  ((float)p->cache_target_dirty_ratio_micro/1000000));
	    break;
	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
	    f->dump_unsigned("cache_target_dirty_high_ratio_micro",
			     p->cache_target_dirty_high_ratio_micro);
	    f->dump_float("cache_target_dirty_high_ratio",
			  ((float)p->cache_target_dirty_high_ratio_micro/1000000));
	    break;
	  case CACHE_TARGET_FULL_RATIO:
	    f->dump_unsigned("cache_target_full_ratio_micro",
			     p->cache_target_full_ratio_micro);
	    f->dump_float("cache_target_full_ratio",
			  ((float)p->cache_target_full_ratio_micro/1000000));
	    break;
	  case CACHE_MIN_FLUSH_AGE:
	    f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
	    break;
	  case CACHE_MIN_EVICT_AGE:
	    f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
	    break;
	  case ERASURE_CODE_PROFILE:
	    f->dump_string("erasure_code_profile", p->erasure_code_profile);
	    break;
	  case MIN_READ_RECENCY_FOR_PROMOTE:
	    f->dump_int("min_read_recency_for_promote",
			p->min_read_recency_for_promote);
	    break;
	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
	    f->dump_int("min_write_recency_for_promote",
			p->min_write_recency_for_promote);
	    break;
	  case FAST_READ:
	    f->dump_int("fast_read", p->fast_read);
	    break;
	  case HIT_SET_GRADE_DECAY_RATE:
	    f->dump_int("hit_set_grade_decay_rate",
			p->hit_set_grade_decay_rate);
	    break;
	  case HIT_SET_SEARCH_LAST_N:
	    f->dump_int("hit_set_search_last_n",
			p->hit_set_search_last_n);
	    break;
	  case SCRUB_MIN_INTERVAL:
	  case SCRUB_MAX_INTERVAL:
	  case DEEP_SCRUB_INTERVAL:
	  case RECOVERY_PRIORITY:
	  case RECOVERY_OP_PRIORITY:
	  case SCRUB_PRIORITY:
	  case COMPRESSION_MODE:
	  case COMPRESSION_ALGORITHM:
	  case COMPRESSION_REQUIRED_RATIO:
	  case COMPRESSION_MAX_BLOB_SIZE:
	  case COMPRESSION_MIN_BLOB_SIZE:
	  case CSUM_TYPE:
	  case CSUM_MAX_BLOCK:
	  case CSUM_MIN_BLOCK:
	  case FINGERPRINT_ALGORITHM:
	  case PG_NUM_MIN:
	  case TARGET_SIZE_BYTES:
	  case TARGET_SIZE_RATIO:
	  case PG_AUTOSCALE_BIAS:
	    // Everything stored in pool_opts_t; CSUM_TYPE is special-cased so
	    // the numeric value is rendered as its symbolic checksum name.
	    pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
	    if (p->opts.is_set(key)) {
	      if(*it == CSUM_TYPE) {
		int64_t val;
		p->opts.get(pool_opts_t::CSUM_TYPE, &val);
		f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
	      } else {
		p->opts.dump(i->first, f.get());
	      }
	    }
	    break;
	}
      }
      f->close_section();
      f->flush(rdata);
    } else /* !f */ {
      // Plain-text output: one "name: value" line per selected property,
      // appended to rdata after each iteration (ss is reused as scratch).
      for(choices_set_t::const_iterator it = selected_choices.begin();
	  it != selected_choices.end(); ++it) {
	choices_map_t::const_iterator i;
	switch(*it) {
	  case PG_NUM:
	    ss << "pg_num: " << p->get_pg_num() << "\n";
	    break;
	  case PGP_NUM:
	    ss << "pgp_num: " << p->get_pgp_num() << "\n";
	    break;
	  case SIZE:
	    ss << "size: " << p->get_size() << "\n";
	    break;
	  case MIN_SIZE:
	    ss << "min_size: " << p->get_min_size() << "\n";
	    break;
	  case CRUSH_RULE:
	    if (osdmap.crush->rule_exists(p->get_crush_rule())) {
	      ss << "crush_rule: " << osdmap.crush->get_rule_name(
		p->get_crush_rule()) << "\n";
	    } else {
	      ss << "crush_rule: " << p->get_crush_rule() << "\n";
	    }
	    break;
	  case PG_AUTOSCALE_MODE:
	    ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
	      p->pg_autoscale_mode) <<"\n";
	    break;
	  case HIT_SET_PERIOD:
	    ss << "hit_set_period: " << p->hit_set_period << "\n";
	    break;
	  case HIT_SET_COUNT:
	    ss << "hit_set_count: " << p->hit_set_count << "\n";
	    break;
	  case HIT_SET_TYPE:
	    ss << "hit_set_type: " <<
	      HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
	    break;
	  case HIT_SET_FPP:
	    {
	      // Same bloom-only restriction as in the formatted path.
	      if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
		BloomHitSet::Params *bloomp =
		  static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
		ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
	      } else if(var != "all") {
		ss << "hit set is not of type Bloom; " <<
		  "invalid to get a false positive rate!";
		r = -EINVAL;
		goto reply;
	      }
	    }
	    break;
	  case USE_GMT_HITSET:
	    ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
	    break;
	  case TARGET_MAX_OBJECTS:
	    ss << "target_max_objects: " << p->target_max_objects << "\n";
	    break;
	  case TARGET_MAX_BYTES:
	    ss << "target_max_bytes: " << p->target_max_bytes << "\n";
	    break;
	  case CACHE_TARGET_DIRTY_RATIO:
	    ss << "cache_target_dirty_ratio: "
	       << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
	    break;
	  case CACHE_TARGET_DIRTY_HIGH_RATIO:
	    ss << "cache_target_dirty_high_ratio: "
	       << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
	    break;
	  case CACHE_TARGET_FULL_RATIO:
	    ss << "cache_target_full_ratio: "
	       << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
	    break;
	  case CACHE_MIN_FLUSH_AGE:
	    ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
	    break;
	  case CACHE_MIN_EVICT_AGE:
	    ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
	    break;
	  case ERASURE_CODE_PROFILE:
	    ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
	    break;
	  case MIN_READ_RECENCY_FOR_PROMOTE:
	    ss << "min_read_recency_for_promote: " <<
	      p->min_read_recency_for_promote << "\n";
	    break;
	  case HIT_SET_GRADE_DECAY_RATE:
	    ss << "hit_set_grade_decay_rate: " <<
	      p->hit_set_grade_decay_rate << "\n";
	    break;
	  case HIT_SET_SEARCH_LAST_N:
	    ss << "hit_set_search_last_n: " <<
	      p->hit_set_search_last_n << "\n";
	    break;
	  case EC_OVERWRITES:
	    ss << "allow_ec_overwrites: " <<
	      (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
	      "\n";
	    break;
	  case HASHPSPOOL:
	  case NODELETE:
	  case NOPGCHANGE:
	  case NOSIZECHANGE:
	  case WRITE_FADVISE_DONTNEED:
	  case NOSCRUB:
	  case NODEEP_SCRUB:
	    // Reverse-lookup the flag's user-facing name for the label.
	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
	      if (i->second == *it)
		break;
	    }
	    ceph_assert(i != ALL_CHOICES.end());
	    ss << i->first << ": " <<
	      (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
	       "true" : "false") << "\n";
	    break;
	  case MIN_WRITE_RECENCY_FOR_PROMOTE:
	    ss << "min_write_recency_for_promote: " <<
	      p->min_write_recency_for_promote << "\n";
	    break;
	  case FAST_READ:
	    ss << "fast_read: " << p->fast_read << "\n";
	    break;
	  case SCRUB_MIN_INTERVAL:
	  case SCRUB_MAX_INTERVAL:
	  case DEEP_SCRUB_INTERVAL:
	  case RECOVERY_PRIORITY:
	  case RECOVERY_OP_PRIORITY:
	  case SCRUB_PRIORITY:
	  case COMPRESSION_MODE:
	  case COMPRESSION_ALGORITHM:
	  case COMPRESSION_REQUIRED_RATIO:
	  case COMPRESSION_MAX_BLOB_SIZE:
	  case COMPRESSION_MIN_BLOB_SIZE:
	  case CSUM_TYPE:
	  case CSUM_MAX_BLOCK:
	  case CSUM_MIN_BLOCK:
	  case FINGERPRINT_ALGORITHM:
	  case PG_NUM_MIN:
	  case TARGET_SIZE_BYTES:
	  case TARGET_SIZE_RATIO:
	  case PG_AUTOSCALE_BIAS:
	    // pool_opts_t-backed values, mirroring the formatted path.
	    for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
	      if (i->second == *it)
		break;
	    }
	    ceph_assert(i != ALL_CHOICES.end());
	    {
	      pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
	      if (p->opts.is_set(key)) {
		if(key == pool_opts_t::CSUM_TYPE) {
		  int64_t val;
		  p->opts.get(key, &val);
		  ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
		} else {
		  ss << i->first << ": " << p->opts.get(key) << "\n";
		}
	      }
	    }
	    break;
	}
	rdata.append(ss.str());
	ss.str("");
      }
    }
    r = 0;
  } else if (prefix == "osd pool get-quota") {
    // Report the object and byte quotas for a pool; 0 means unlimited
    // ("N/A" in plain-text output).
    string pool_name;
    cmd_getval(cct, cmdmap, "pool", pool_name);

    int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
    if (poolid < 0) {
      ceph_assert(poolid == -ENOENT);
      ss << "unrecognized pool '" << pool_name << "'";
      r = -ENOENT;
      goto reply;
    }
    const pg_pool_t *p = osdmap.get_pg_pool(poolid);

    if (f) {
      f->open_object_section("pool_quotas");
      f->dump_string("pool_name", pool_name);
      f->dump_unsigned("pool_id", poolid);
      f->dump_unsigned("quota_max_objects", p->quota_max_objects);
      f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
      f->close_section();
      f->flush(rdata);
    } else {
      stringstream rs;
      rs << "quotas for pool '" << pool_name << "':\n"
	 << "  max objects: ";
      if (p->quota_max_objects == 0)
	rs << "N/A";
      else
	rs << si_u_t(p->quota_max_objects) << " objects";
      rs << "\n"
	 << "  max bytes  : ";
      if (p->quota_max_bytes == 0)
	rs << "N/A";
      else
	rs << byte_u_t(p->quota_max_bytes);
      rdata.append(rs.str());
    }
    rdata.append("\n");
    r = 0;
  } else if (prefix == "osd crush rule list" ||
	     prefix == "osd crush rule ls") {
    // List the names of all crush rules.
    if (f) {
      f->open_array_section("rules");
      osdmap.crush->list_rules(f.get());
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream ss;
      osdmap.crush->list_rules(&ss);
      rdata.append(ss.str());
    }
  } else if (prefix == "osd crush rule ls-by-class") {
    // List crush rules that reference the given device class.
    string class_name;
    cmd_getval(cct, cmdmap, "class", class_name);
    if (class_name.empty()) {
      ss << "no class specified";
      r = -EINVAL;
      goto reply;
    }
    set<int> rules;
    r = osdmap.crush->get_rules_by_class(class_name, &rules);
    if (r < 0) {
      ss << "failed to get rules by class '" << class_name << "'";
      goto reply;
    }
    if (f) {
      f->open_array_section("rules");
      for (auto &rule: rules) {
	f->dump_string("name", osdmap.crush->get_rule_name(rule));
      }
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream rs;
      for (auto &rule: rules) {
	rs << osdmap.crush->get_rule_name(rule) << "\n";
      }
      rdata.append(rs.str());
    }
  } else if (prefix == "osd crush rule dump") {
    // Dump one named rule, or all rules when no name is given.
    // Always formatted output (json-pretty fallback).
    string name;
    cmd_getval(cct, cmdmap, "name", name);
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    if (name == "") {
      f->open_array_section("rules");
      osdmap.crush->dump_rules(f.get());
      f->close_section();
    } else {
      int ruleno = osdmap.crush->get_rule_id(name);
      if (ruleno < 0) {
	ss << "unknown crush rule '" << name << "'";
	r = ruleno;
	goto reply;
      }
      osdmap.crush->dump_rule(ruleno, f.get());
    }
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  } else if (prefix == "osd crush dump") {
    // Dump the entire crush map (always formatted, json-pretty fallback).
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    f->open_object_section("crush_map");
    osdmap.crush->dump(f.get());
    f->close_section();
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  } else if (prefix == "osd crush show-tunables") {
    // Dump the current crush tunables (always formatted).
    string format;
    cmd_getval(cct, cmdmap, "format", format);
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    f->open_object_section("crush_map_tunables");
    osdmap.crush->dump_tunables(f.get());
    f->close_section();
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  } else if (prefix == "osd crush tree") {
    // Render the crush hierarchy; --show-shadow also includes the
    // per-device-class shadow trees.
    string shadow;
    cmd_getval(cct, cmdmap, "shadow", shadow);
    bool show_shadow = shadow == "--show-shadow";
    boost::scoped_ptr<Formatter> f(Formatter::create(format));
    if (f) {
      f->open_object_section("crush_tree");
      osdmap.crush->dump_tree(nullptr,
			      f.get(),
			      osdmap.get_pool_names(),
			      show_shadow);
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream ss;
      osdmap.crush->dump_tree(&ss,
			      nullptr,
			      osdmap.get_pool_names(),
			      show_shadow);
      rdata.append(ss.str());
    }
  } else if (prefix == "osd crush ls") {
    // List the direct children of a crush node; for a device (id >= 0)
    // the result is the device itself.
    string name;
    if (!cmd_getval(cct, cmdmap, "node", name)) {
      ss << "no node specified";
      r = -EINVAL;
      goto reply;
    }
    if (!osdmap.crush->name_exists(name)) {
      ss << "node '" << name << "' does not exist";
      r = -ENOENT;
      goto reply;
    }
    int id = osdmap.crush->get_item_id(name);
    list<int> result;
    if (id >= 0) {
      result.push_back(id);
    } else {
      // Negative ids are buckets; enumerate their members.
      int num = osdmap.crush->get_bucket_size(id);
      for (int i = 0; i < num; ++i) {
	result.push_back(osdmap.crush->get_bucket_item(id, i));
      }
    }
    if (f) {
      f->open_array_section("items");
      for (auto i : result) {
	f->dump_string("item", osdmap.crush->get_item_name(i));
      }
      f->close_section();
      f->flush(rdata);
    } else {
      ostringstream ss;
      for (auto i : result) {
	ss << osdmap.crush->get_item_name(i) << "\n";
      }
      rdata.append(ss.str());
    }
    r = 0;
  } else if (prefix == "osd crush class ls") {
    // List all known device classes (always formatted).
    boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
    f->open_array_section("crush_classes");
    for (auto i : osdmap.crush->class_name)
      f->dump_string("class", i.second);
    f->close_section();
    f->flush(rdata);
  } else if (prefix == "osd crush class ls-osd") {
    // List the osd ids belonging to a device class.
    string name;
    cmd_getval(cct, cmdmap, "class", name);
    set<int> osds;
    osdmap.crush->get_devices_by_class(name, &osds);
    if (f) {
      f->open_array_section("osds");
      for (auto &osd: osds)
	f->dump_int("osd", osd);
      f->close_section();
      f->flush(rdata);
    } else {
      // One id per line, no trailing newline.
      bool first = true;
      for (auto &osd : osds) {
	if (!first)
	  ds << "\n";
	first = false;
	ds << osd;
      }
      rdata.append(ds);
    }
  } else if (prefix == "osd crush get-device-class") {
    // Report the device class of each requested osd id (empty string when
    // the osd has no class assigned).
    vector<string> idvec;
    cmd_getval(cct, cmdmap, "ids", idvec);
    map<int, string> class_by_osd;
    for (auto& id : idvec) {
      ostringstream ts;
      long osd = parse_osd_id(id.c_str(), &ts);
      if (osd < 0) {
	ss << "unable to parse osd id:'" << id << "'";
	r = -EINVAL;
	goto reply;
      }
      auto device_class = osdmap.crush->get_item_class(osd);
      if (device_class)
	class_by_osd[osd] = device_class;
      else
	class_by_osd[osd] = ""; // no class
    }
    if (f) {
      f->open_array_section("osd_device_classes");
      for (auto& i : class_by_osd) {
	f->open_object_section("osd_device_class");
	f->dump_int("osd", i.first);
	f->dump_string("device_class", i.second);
	f->close_section();
      }
      f->close_section();
      f->flush(rdata);
    } else {
      if (class_by_osd.size() == 1) {
	// for single input, make a clean output
	ds << class_by_osd.begin()->second;
      } else {
	// note that we do not group osds by class here
	for (auto it = class_by_osd.begin();
	     it != class_by_osd.end();
	     it++) {
	  ds << "osd." << it->first << ' ' << it->second;
	  if (next(it) != class_by_osd.end())
	    ds << '\n';
	}
      }
      rdata.append(ds);
    }
5894 } else if (prefix == "osd erasure-code-profile ls") {
5895 const auto &profiles = osdmap.get_erasure_code_profiles();
5896 if (f)
5897 f->open_array_section("erasure-code-profiles");
5898 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
5899 if (f)
5900 f->dump_string("profile", i->first.c_str());
5901 else
5902 rdata.append(i->first + "\n");
5903 }
5904 if (f) {
5905 f->close_section();
5906 ostringstream rs;
5907 f->flush(rs);
5908 rs << "\n";
5909 rdata.append(rs.str());
5910 }
5911 } else if (prefix == "osd crush weight-set ls") {
5912 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5913 if (f) {
5914 f->open_array_section("weight_sets");
5915 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5916 f->dump_string("pool", "(compat)");
5917 }
5918 for (auto& i : osdmap.crush->choose_args) {
5919 if (i.first >= 0) {
5920 f->dump_string("pool", osdmap.get_pool_name(i.first));
5921 }
5922 }
5923 f->close_section();
5924 f->flush(rdata);
5925 } else {
5926 ostringstream rs;
5927 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
5928 rs << "(compat)\n";
5929 }
5930 for (auto& i : osdmap.crush->choose_args) {
5931 if (i.first >= 0) {
5932 rs << osdmap.get_pool_name(i.first) << "\n";
5933 }
5934 }
5935 rdata.append(rs.str());
5936 }
5937 } else if (prefix == "osd crush weight-set dump") {
5938 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5939 "json-pretty"));
5940 osdmap.crush->dump_choose_args(f.get());
5941 f->flush(rdata);
5942 } else if (prefix == "osd erasure-code-profile get") {
5943 string name;
5944 cmd_getval(cct, cmdmap, "name", name);
5945 if (!osdmap.has_erasure_code_profile(name)) {
5946 ss << "unknown erasure code profile '" << name << "'";
5947 r = -ENOENT;
5948 goto reply;
5949 }
5950 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
5951 if (f)
5952 f->open_object_section("profile");
5953 for (map<string,string>::const_iterator i = profile.begin();
5954 i != profile.end();
5955 ++i) {
5956 if (f)
5957 f->dump_string(i->first.c_str(), i->second.c_str());
5958 else
5959 rdata.append(i->first + "=" + i->second + "\n");
5960 }
5961 if (f) {
5962 f->close_section();
5963 ostringstream rs;
5964 f->flush(rs);
5965 rs << "\n";
5966 rdata.append(rs.str());
5967 }
5968 } else if (prefix == "osd pool application get") {
5969 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
5970 "json-pretty"));
5971 string pool_name;
5972 cmd_getval(cct, cmdmap, "pool", pool_name);
5973 string app;
5974 cmd_getval(cct, cmdmap, "app", app);
5975 string key;
5976 cmd_getval(cct, cmdmap, "key", key);
5977
5978 if (pool_name.empty()) {
5979 // all
5980 f->open_object_section("pools");
5981 for (const auto &pool : osdmap.pools) {
5982 std::string name("<unknown>");
5983 const auto &pni = osdmap.pool_name.find(pool.first);
5984 if (pni != osdmap.pool_name.end())
5985 name = pni->second;
5986 f->open_object_section(name.c_str());
5987 for (auto &app_pair : pool.second.application_metadata) {
5988 f->open_object_section(app_pair.first.c_str());
5989 for (auto &kv_pair : app_pair.second) {
5990 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
5991 }
5992 f->close_section();
5993 }
5994 f->close_section(); // name
5995 }
5996 f->close_section(); // pools
5997 f->flush(rdata);
5998 } else {
5999 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6000 if (pool < 0) {
6001 ss << "unrecognized pool '" << pool_name << "'";
6002 r = -ENOENT;
6003 goto reply;
6004 }
6005 auto p = osdmap.get_pg_pool(pool);
6006 // filter by pool
6007 if (app.empty()) {
6008 f->open_object_section(pool_name.c_str());
6009 for (auto &app_pair : p->application_metadata) {
6010 f->open_object_section(app_pair.first.c_str());
6011 for (auto &kv_pair : app_pair.second) {
6012 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6013 }
6014 f->close_section(); // application
6015 }
6016 f->close_section(); // pool_name
6017 f->flush(rdata);
6018 goto reply;
6019 }
6020
6021 auto app_it = p->application_metadata.find(app);
6022 if (app_it == p->application_metadata.end()) {
6023 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6024 r = -ENOENT;
6025 goto reply;
6026 }
6027 // filter by pool + app
6028 if (key.empty()) {
6029 f->open_object_section(app_it->first.c_str());
6030 for (auto &kv_pair : app_it->second) {
6031 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6032 }
6033 f->close_section(); // application
6034 f->flush(rdata);
6035 goto reply;
6036 }
6037 // filter by pool + app + key
6038 auto key_it = app_it->second.find(key);
6039 if (key_it == app_it->second.end()) {
6040 ss << "application '" << app << "' on pool '" << pool_name
6041 << "' does not have key '" << key << "'";
6042 r = -ENOENT;
6043 goto reply;
6044 }
6045 ss << key_it->second << "\n";
6046 rdata.append(ss.str());
6047 ss.str("");
6048 }
6049 } else if (prefix == "osd get-require-min-compat-client") {
6050 ss << ceph_release_name(osdmap.require_min_compat_client) << std::endl;
6051 rdata.append(ss.str());
6052 ss.str("");
6053 goto reply;
6054 } else if (prefix == "osd pool application enable" ||
6055 prefix == "osd pool application disable" ||
6056 prefix == "osd pool application set" ||
6057 prefix == "osd pool application rm") {
6058 bool changed = false;
6059 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6060 if (r != 0) {
6061 // Error, reply.
6062 goto reply;
6063 } else if (changed) {
6064 // Valid mutation, proceed to prepare phase
6065 return false;
6066 } else {
6067 // Idempotent case, reply
6068 goto reply;
6069 }
6070 } else {
6071 // try prepare update
6072 return false;
6073 }
6074
6075 reply:
6076 string rs;
6077 getline(ss, rs);
6078 mon->reply_command(op, r, rs, rdata, get_last_committed());
6079 return true;
6080 }
6081
6082 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6083 {
6084 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6085 osdmap.get_pg_pool(pool_id));
6086 ceph_assert(pool);
6087 pool->set_flag(flags);
6088 }
6089
6090 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6091 {
6092 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6093 osdmap.get_pg_pool(pool_id));
6094 ceph_assert(pool);
6095 pool->unset_flag(flags);
6096 }
6097
6098 string OSDMonitor::make_snap_epoch_key(int64_t pool, epoch_t epoch)
6099 {
6100 char k[80];
6101 snprintf(k, sizeof(k), "removed_epoch_%llu_%08lx",
6102 (unsigned long long)pool, (unsigned long)epoch);
6103 return k;
6104 }
6105
6106 string OSDMonitor::make_snap_key(int64_t pool, snapid_t snap)
6107 {
6108 char k[80];
6109 snprintf(k, sizeof(k), "removed_snap_%llu_%016llx",
6110 (unsigned long long)pool, (unsigned long long)snap);
6111 return k;
6112 }
6113
6114
6115 string OSDMonitor::make_snap_key_value(
6116 int64_t pool, snapid_t snap, snapid_t num,
6117 epoch_t epoch, bufferlist *v)
6118 {
6119 // encode the *last* epoch in the key so that we can use forward
6120 // iteration only to search for an epoch in an interval.
6121 encode(snap, *v);
6122 encode(snap + num, *v);
6123 encode(epoch, *v);
6124 return make_snap_key(pool, snap + num - 1);
6125 }
6126
6127 string OSDMonitor::make_snap_purged_key(int64_t pool, snapid_t snap)
6128 {
6129 char k[80];
6130 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6131 (unsigned long long)pool, (unsigned long long)snap);
6132 return k;
6133 }
6134 string OSDMonitor::make_snap_purged_key_value(
6135 int64_t pool, snapid_t snap, snapid_t num,
6136 epoch_t epoch, bufferlist *v)
6137 {
6138 // encode the *last* epoch in the key so that we can use forward
6139 // iteration only to search for an epoch in an interval.
6140 encode(snap, *v);
6141 encode(snap + num, *v);
6142 encode(epoch, *v);
6143 return make_snap_purged_key(pool, snap + num - 1);
6144 }
6145
// Look up the previously pruned snap interval (if any) that contains
// 'snap' for the given pool.  On success returns 0 and fills
// *begin/*end with the interval bounds ([begin, end), end exclusive);
// returns -ENOENT when no recorded interval covers 'snap'.
int OSDMonitor::lookup_pruned_snap(int64_t pool, snapid_t snap,
				   snapid_t *begin, snapid_t *end)
{
  // Keys embed the *last* snap of each interval (see make_snap_key_value),
  // so lower_bound on the key for 'snap' lands on the first interval
  // whose tail is >= snap, i.e. the only candidate that could contain it.
  string k = make_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    return -ENOENT;
  }
  // NOTE(review): this tests that the key starts with OSD_SNAP_PREFIX
  // (the store column prefix) rather than the "removed_snap_<pool>_"
  // key prefix; confirm it actually rejects keys belonging to other
  // pools/record types as intended.
  if (it->key().find(OSD_SNAP_PREFIX) != 0) {
    return -ENOENT;
  }
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  // the candidate interval may still not cover 'snap'
  if (snap < *begin || snap >= *end) {
    return -ENOENT;
  }
  return 0;
}
6167
// Fold snaps that the OSDs report as fully purged (via the mgr digest)
// into pending_inc.new_purged_snaps, capped at mon_max_snap_prune_per_epoch
// entries per epoch.  Returns true if anything was staged for pruning
// (i.e. the pending map changed), false otherwise.
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a fresh digest from the mgr to know what the OSDs have purged
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  // purged-snap tracking only exists from mimic onward
  if (osdmap.require_osd_release < CEPH_RELEASE_MIMIC) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // cap the amount of work done per epoch; 0 in conf means "no limit",
  // which we translate into a large finite batch
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    OSDMap::snap_interval_set_t to_prune;
    // running projection of the total, so the max_prune cap accounts for
    // intervals accumulated from earlier pools as well
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_pruned_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already pruned " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	ceph_assert(pbegin > begin);
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      OSDMap::snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
6250
6251 bool OSDMonitor::update_pools_status()
6252 {
6253 if (!mon->mgrstatmon()->is_readable())
6254 return false;
6255
6256 bool ret = false;
6257
6258 auto& pools = osdmap.get_pools();
6259 for (auto it = pools.begin(); it != pools.end(); ++it) {
6260 const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
6261 if (!pstat)
6262 continue;
6263 const object_stat_sum_t& sum = pstat->stats.sum;
6264 const pg_pool_t &pool = it->second;
6265 const string& pool_name = osdmap.get_pool_name(it->first);
6266
6267 bool pool_is_full =
6268 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
6269 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
6270
6271 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
6272 if (pool_is_full)
6273 continue;
6274
6275 mon->clog->info() << "pool '" << pool_name
6276 << "' no longer out of quota; removing NO_QUOTA flag";
6277 // below we cancel FLAG_FULL too, we'll set it again in
6278 // OSDMonitor::encode_pending if it still fails the osd-full checking.
6279 clear_pool_flags(it->first,
6280 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
6281 ret = true;
6282 } else {
6283 if (!pool_is_full)
6284 continue;
6285
6286 if (pool.quota_max_bytes > 0 &&
6287 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
6288 mon->clog->warn() << "pool '" << pool_name << "' is full"
6289 << " (reached quota's max_bytes: "
6290 << byte_u_t(pool.quota_max_bytes) << ")";
6291 }
6292 if (pool.quota_max_objects > 0 &&
6293 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
6294 mon->clog->warn() << "pool '" << pool_name << "' is full"
6295 << " (reached quota's max_objects: "
6296 << pool.quota_max_objects << ")";
6297 }
6298 // set both FLAG_FULL_QUOTA and FLAG_FULL
6299 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
6300 // since FLAG_FULL should always take precedence
6301 set_pool_flags(it->first,
6302 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
6303 clear_pool_flags(it->first,
6304 pg_pool_t::FLAG_NEARFULL |
6305 pg_pool_t::FLAG_BACKFILLFULL);
6306 ret = true;
6307 }
6308 }
6309 return ret;
6310 }
6311
6312 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
6313 {
6314 op->mark_osdmon_event(__func__);
6315 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
6316 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
6317 MonSession *session = op->get_session();
6318 if (!session)
6319 return -EPERM;
6320 string erasure_code_profile;
6321 stringstream ss;
6322 string rule_name;
6323 int ret = 0;
6324 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
6325 0, 0, 0, 0, 0, 0.0,
6326 erasure_code_profile,
6327 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, &ss);
6328
6329 if (ret < 0) {
6330 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
6331 }
6332 return ret;
6333 }
6334
6335 int OSDMonitor::crush_rename_bucket(const string& srcname,
6336 const string& dstname,
6337 ostream *ss)
6338 {
6339 int ret;
6340 //
6341 // Avoid creating a pending crush if it does not already exists and
6342 // the rename would fail.
6343 //
6344 if (!_have_pending_crush()) {
6345 ret = _get_stable_crush().can_rename_bucket(srcname,
6346 dstname,
6347 ss);
6348 if (ret)
6349 return ret;
6350 }
6351
6352 CrushWrapper newcrush;
6353 _get_pending_crush(newcrush);
6354
6355 ret = newcrush.rename_bucket(srcname,
6356 dstname,
6357 ss);
6358 if (ret)
6359 return ret;
6360
6361 pending_inc.crush.clear();
6362 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6363 *ss << "renamed bucket " << srcname << " into " << dstname;
6364 return 0;
6365 }
6366
6367 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
6368 {
6369 string replacement = "";
6370
6371 if (plugin == "jerasure_generic" ||
6372 plugin == "jerasure_sse3" ||
6373 plugin == "jerasure_sse4" ||
6374 plugin == "jerasure_neon") {
6375 replacement = "jerasure";
6376 } else if (plugin == "shec_generic" ||
6377 plugin == "shec_sse3" ||
6378 plugin == "shec_sse4" ||
6379 plugin == "shec_neon") {
6380 replacement = "shec";
6381 }
6382
6383 if (replacement != "") {
6384 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
6385 << plugin << " that has been deprecated. Please use "
6386 << replacement << " instead." << dendl;
6387 }
6388 }
6389
6390 int OSDMonitor::normalize_profile(const string& profilename,
6391 ErasureCodeProfile &profile,
6392 bool force,
6393 ostream *ss)
6394 {
6395 ErasureCodeInterfaceRef erasure_code;
6396 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6397 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
6398 check_legacy_ec_plugin(plugin->second, profilename);
6399 int err = instance.factory(plugin->second,
6400 g_conf().get_val<std::string>("erasure_code_dir"),
6401 profile, &erasure_code, ss);
6402 if (err) {
6403 return err;
6404 }
6405
6406 err = erasure_code->init(profile, ss);
6407 if (err) {
6408 return err;
6409 }
6410
6411 auto it = profile.find("stripe_unit");
6412 if (it != profile.end()) {
6413 string err_str;
6414 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
6415 if (!err_str.empty()) {
6416 *ss << "could not parse stripe_unit '" << it->second
6417 << "': " << err_str << std::endl;
6418 return -EINVAL;
6419 }
6420 uint32_t data_chunks = erasure_code->get_data_chunk_count();
6421 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
6422 if (chunk_size != stripe_unit) {
6423 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
6424 << "alignment. Would be padded to " << chunk_size
6425 << std::endl;
6426 return -EINVAL;
6427 }
6428 if ((stripe_unit % 4096) != 0 && !force) {
6429 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
6430 << "use --force to override this check" << std::endl;
6431 return -EINVAL;
6432 }
6433 }
6434 return 0;
6435 }
6436
6437 int OSDMonitor::crush_rule_create_erasure(const string &name,
6438 const string &profile,
6439 int *rule,
6440 ostream *ss)
6441 {
6442 int ruleid = osdmap.crush->get_rule_id(name);
6443 if (ruleid != -ENOENT) {
6444 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
6445 return -EEXIST;
6446 }
6447
6448 CrushWrapper newcrush;
6449 _get_pending_crush(newcrush);
6450
6451 ruleid = newcrush.get_rule_id(name);
6452 if (ruleid != -ENOENT) {
6453 *rule = newcrush.get_rule_mask_ruleset(ruleid);
6454 return -EALREADY;
6455 } else {
6456 ErasureCodeInterfaceRef erasure_code;
6457 int err = get_erasure_code(profile, &erasure_code, ss);
6458 if (err) {
6459 *ss << "failed to load plugin using profile " << profile << std::endl;
6460 return err;
6461 }
6462
6463 err = erasure_code->create_rule(name, newcrush, ss);
6464 erasure_code.reset();
6465 if (err < 0)
6466 return err;
6467 *rule = err;
6468 pending_inc.crush.clear();
6469 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
6470 return 0;
6471 }
6472 }
6473
6474 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
6475 ErasureCodeInterfaceRef *erasure_code,
6476 ostream *ss) const
6477 {
6478 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
6479 return -EAGAIN;
6480 ErasureCodeProfile profile =
6481 osdmap.get_erasure_code_profile(erasure_code_profile);
6482 ErasureCodeProfile::const_iterator plugin =
6483 profile.find("plugin");
6484 if (plugin == profile.end()) {
6485 *ss << "cannot determine the erasure code plugin"
6486 << " because there is no 'plugin' entry in the erasure_code_profile "
6487 << profile << std::endl;
6488 return -EINVAL;
6489 }
6490 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
6491 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
6492 return instance.factory(plugin->second,
6493 g_conf().get_val<std::string>("erasure_code_dir"),
6494 profile, erasure_code, ss);
6495 }
6496
6497 int OSDMonitor::check_cluster_features(uint64_t features,
6498 stringstream &ss)
6499 {
6500 stringstream unsupported_ss;
6501 int unsupported_count = 0;
6502 if ((mon->get_quorum_con_features() & features) != features) {
6503 unsupported_ss << "the monitor cluster";
6504 ++unsupported_count;
6505 }
6506
6507 set<int32_t> up_osds;
6508 osdmap.get_up_osds(up_osds);
6509 for (set<int32_t>::iterator it = up_osds.begin();
6510 it != up_osds.end(); ++it) {
6511 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
6512 if ((xi.features & features) != features) {
6513 if (unsupported_count > 0)
6514 unsupported_ss << ", ";
6515 unsupported_ss << "osd." << *it;
6516 unsupported_count ++;
6517 }
6518 }
6519
6520 if (unsupported_count > 0) {
6521 ss << "features " << features << " unsupported by: "
6522 << unsupported_ss.str();
6523 return -ENOTSUP;
6524 }
6525
6526 // check pending osd state, too!
6527 for (map<int32_t,osd_xinfo_t>::const_iterator p =
6528 pending_inc.new_xinfo.begin();
6529 p != pending_inc.new_xinfo.end(); ++p) {
6530 const osd_xinfo_t &xi = p->second;
6531 if ((xi.features & features) != features) {
6532 dout(10) << __func__ << " pending osd." << p->first
6533 << " features are insufficient; retry" << dendl;
6534 return -EAGAIN;
6535 }
6536 }
6537
6538 return 0;
6539 }
6540
// Check whether a proposed crush map is safe to commit: apply it to a
// scratch copy of the osdmap and verify that (a) the resulting map does
// not demand a newer client than the configured require_min_compat_client
// and (b) the mon quorum and all up OSDs support the features the new
// map requires.  Returns false (with an explanation in ss) if the crush
// change must be rejected.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a throwaway map with the new crush applied; the real
  // pending_inc/osdmap are left untouched
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client > 0) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << ceph_release_name(mv)
	 << " but require_min_compat_client is "
	 << ceph_release_name(newmap.require_min_compat_client);
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
6574
6575 bool OSDMonitor::erasure_code_profile_in_use(
6576 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
6577 const string &profile,
6578 ostream *ss)
6579 {
6580 bool found = false;
6581 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
6582 p != pools.end();
6583 ++p) {
6584 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
6585 *ss << osdmap.pool_name[p->first] << " ";
6586 found = true;
6587 }
6588 }
6589 if (found) {
6590 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
6591 }
6592 return found;
6593 }
6594
6595 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
6596 map<string,string> *erasure_code_profile_map,
6597 ostream *ss)
6598 {
6599 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
6600 get_json_str_map,
6601 *ss,
6602 erasure_code_profile_map,
6603 true);
6604 if (r)
6605 return r;
6606 ceph_assert((*erasure_code_profile_map).count("plugin"));
6607 string default_plugin = (*erasure_code_profile_map)["plugin"];
6608 map<string,string> user_map;
6609 for (vector<string>::const_iterator i = erasure_code_profile.begin();
6610 i != erasure_code_profile.end();
6611 ++i) {
6612 size_t equal = i->find('=');
6613 if (equal == string::npos) {
6614 user_map[*i] = string();
6615 (*erasure_code_profile_map)[*i] = string();
6616 } else {
6617 const string key = i->substr(0, equal);
6618 equal++;
6619 const string value = i->substr(equal);
6620 if (key.find("ruleset-") == 0) {
6621 *ss << "property '" << key << "' is no longer supported; try "
6622 << "'crush-" << key.substr(8) << "' instead";
6623 return -EINVAL;
6624 }
6625 user_map[key] = value;
6626 (*erasure_code_profile_map)[key] = value;
6627 }
6628 }
6629
6630 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
6631 (*erasure_code_profile_map) = user_map;
6632
6633 return 0;
6634 }
6635
6636 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
6637 const string &erasure_code_profile,
6638 uint8_t repl_size,
6639 unsigned *size, unsigned *min_size,
6640 ostream *ss)
6641 {
6642 int err = 0;
6643 switch (pool_type) {
6644 case pg_pool_t::TYPE_REPLICATED:
6645 if (repl_size == 0) {
6646 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
6647 }
6648 *size = repl_size;
6649 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
6650 break;
6651 case pg_pool_t::TYPE_ERASURE:
6652 {
6653 ErasureCodeInterfaceRef erasure_code;
6654 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
6655 if (err == 0) {
6656 *size = erasure_code->get_chunk_count();
6657 *min_size =
6658 erasure_code->get_data_chunk_count() +
6659 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
6660 assert(*min_size <= *size);
6661 assert(*min_size >= erasure_code->get_data_chunk_count());
6662 }
6663 }
6664 break;
6665 default:
6666 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
6667 err = -EINVAL;
6668 break;
6669 }
6670 return err;
6671 }
6672
// Compute the stripe width for a new pool.
//
// Replicated pools do not stripe: *stripe_width is left untouched.
// For erasure pools the width is data_chunks * chunk_size, where the
// stripe unit is taken from the profile's "stripe_unit" entry when
// present, else from osd_pool_erasure_code_stripe_unit.
//
// @return 0 on success, negative errno on failure (unknown pool type,
//         or the erasure profile could not be loaded)
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
	// a stored profile's stripe_unit is expected to have been
	// validated when the profile was created (normalize_profile),
	// hence a hard assert rather than an error return here
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
6711
/**
 * Resolve the CRUSH rule to use for a new pool.
 *
 * If *crush_rule is already >= 0 it is only validated for existence;
 * otherwise a rule is selected (replicated pools) or created (erasure
 * pools) according to the pool type.
 *
 * @param pool_type pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
 * @param erasure_code_profile profile used when creating an erasure rule
 * @param rule_name explicit rule name, or "" for the default
 * @param crush_rule in: rule id, or <0 to select one; out: resolved id
 * @param ss human-readable error message, if any
 * @return 0 on success; -EAGAIN when a newly created or still-pending
 *         rule must be proposed/committed first; negative errno on error
 */
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule is already committed, safe to use right away
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just check it exists
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
6773
6774 int OSDMonitor::get_crush_rule(const string &rule_name,
6775 int *crush_rule,
6776 ostream *ss)
6777 {
6778 int ret;
6779 ret = osdmap.crush->get_rule_id(rule_name);
6780 if (ret != -ENOENT) {
6781 // found it, use it
6782 *crush_rule = ret;
6783 } else {
6784 CrushWrapper newcrush;
6785 _get_pending_crush(newcrush);
6786
6787 ret = newcrush.get_rule_id(rule_name);
6788 if (ret != -ENOENT) {
6789 // found it, wait for it to be proposed
6790 dout(20) << __func__ << ": rule " << rule_name
6791 << " try again" << dendl;
6792 return -EAGAIN;
6793 } else {
6794 // Cannot find it , return error
6795 *ss << "specified rule " << rule_name << " doesn't exist";
6796 return ret;
6797 }
6798 }
6799 return 0;
6800 }
6801
6802 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
6803 {
6804 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
6805 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
6806 auto max_pgs = max_pgs_per_osd * num_osds;
6807 uint64_t projected = 0;
6808 if (pool < 0) {
6809 projected += pg_num * size;
6810 }
6811 for (const auto& i : osdmap.get_pools()) {
6812 if (i.first == pool) {
6813 projected += pg_num * size;
6814 } else {
6815 projected += i.second.get_pg_num_target() * i.second.get_size();
6816 }
6817 }
6818 if (projected > max_pgs) {
6819 if (pool >= 0) {
6820 *ss << "pool id " << pool;
6821 }
6822 *ss << " pg_num " << pg_num << " size " << size
6823 << " would mean " << projected
6824 << " total pgs, which exceeds max " << max_pgs
6825 << " (mon_max_pg_per_osd " << max_pgs_per_osd
6826 << " * num_in_osds " << num_osds << ")";
6827 return -ERANGE;
6828 }
6829 return 0;
6830 }
6831
6832 /**
6833 * @param name The name of the new pool
6834 * @param crush_rule The crush rule to use. If <0, will use the system default
6835 * @param crush_rule_name The crush rule to use, if crush_rulset <0
6836 * @param pg_num The pg_num to use. If set to 0, will use the system default
6837 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
6838 * @param repl_size Replication factor, or 0 for default
6839 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
6840 * @param pool_type TYPE_ERASURE, or TYPE_REP
6841 * @param expected_num_objects expected number of objects on the pool
6842 * @param fast_read fast read type.
6843 * @param ss human readable error message, if any.
6844 *
6845 * @return 0 on success, negative errno on failure.
6846 */
int OSDMonitor::prepare_new_pool(string& name,
                                 int crush_rule,
                                 const string &crush_rule_name,
                                 unsigned pg_num, unsigned pgp_num,
                                 unsigned pg_num_min,
                                 const uint64_t repl_size,
                                 const uint64_t target_size_bytes,
                                 const float target_size_ratio,
                                 const string &erasure_code_profile,
                                 const unsigned pool_type,
                                 const uint64_t expected_num_objects,
                                 FastReadType fast_read,
                                 ostream *ss)
{
  // A pool must have a non-empty name.
  if (name.length() == 0)
    return -EINVAL;
  // Fill in configured defaults for any pg/pgp counts left at 0 by the
  // caller; pgp_num falls back to pg_num if still unset.
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  // Validate the requested counts against the configured ceiling and
  // against each other (pgp_num may never exceed pg_num).
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read is an erasure-coded-pool feature; reject it for replicated.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // Resolve (or create) the crush rule this pool will use; crush_rule is
  // updated in place when the rule is looked up by name.
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
                              crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf()->mon_osd_crush_smoke_test) {
    // Run a bounded mapping test (x in [0,50]) against the pending crush
    // map in a forked child to catch broken rules before committing.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
               << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // Derive size/min_size from the replica count or the EC profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
                        &size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Make sure the cluster-wide PG budget allows this many new PGs.
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the tri-state fast_read request into a concrete bool for
  // erasure-coded pools (replicated pools always leave it off).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf()->osd_pool_default_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // If a pool with this name is already pending creation in this
  // proposal, treat the request as idempotent and succeed.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate a new pool id past the current maximum.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // Seed pool flags from the configured defaults.
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // CREATING blocks pg_num changes until the initial PGs exist.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
    // Fall back to mode 0 if the configured name is unrecognized.
    pi->pg_autoscale_mode = m >= 0 ? m : 0;
  }
  // Start with at most mon_osd_max_initial_pgs actual PGs; the target
  // keeps the full requested pg_num and the mgr grows toward it later.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults; ratios are stored in micro units (1e6 = 100%).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7041
7042 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7043 {
7044 op->mark_osdmon_event(__func__);
7045 ostringstream ss;
7046 if (pending_inc.new_flags < 0)
7047 pending_inc.new_flags = osdmap.get_flags();
7048 pending_inc.new_flags |= flag;
7049 ss << OSDMap::get_flag_string(flag) << " is set";
7050 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7051 get_last_committed() + 1));
7052 return true;
7053 }
7054
7055 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7056 {
7057 op->mark_osdmon_event(__func__);
7058 ostringstream ss;
7059 if (pending_inc.new_flags < 0)
7060 pending_inc.new_flags = osdmap.get_flags();
7061 pending_inc.new_flags &= ~flag;
7062 ss << OSDMap::get_flag_string(flag) << " is unset";
7063 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7064 get_last_committed() + 1));
7065 return true;
7066 }
7067
/**
 * Handle 'osd pool set <pool> <var> <val>': validate the requested
 * change against the (possibly already-pending) pool state and stage
 * the updated pg_pool_t in pending_inc.
 *
 * @param cmdmap parsed command arguments ("pool", "var", "val", ...).
 * @param ss human readable result/error message.
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cct, cmdmap, "var", var);

  // Start from the committed pool, but prefer an already-pending copy so
  // multiple changes in one proposal compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor).  parse out int or float values from the
  // string as needed.  however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;
  int64_t n = 0;
  double f = 0;
  int64_t uf = 0;  // micro-f
  cmd_getval(cct, cmdmap, "val", val);

  // parse string as both int and float; different fields use different types.
  // interr/floaterr are non-empty when the respective parse failed.
  n = strict_strtoll(val.c_str(), 10, &interr);
  f = strict_strtod(val.c_str(), &floaterr);
  uf = llrintl(f * (double)1000000.0);

  // Cache-tier-only settings may not be changed on a pool that is not a
  // tier.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    // More replicas means more PG instances; re-check the PG budget.
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // Keep the invariant min_size <= size.
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools the lower bound is k, the number of data chunks.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the actual pg_num (normally driven by the mgr).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      // Splitting.
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      // Merging: only stage pg_num_pending; the actual decrease happens
      // once the PGs are ready to merge.
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
	          g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      // Growing: check the PG budget and warn about cache-pool splits.
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cct,cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < CEPH_RELEASE_NAUTILUS) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    // set targets; mgr will adjust pg_num_actual and pgp_num later.
    // make pgp_num track pg_num if it already matches.  if it is set
    // differently, leave it different and let the user control it
    // manually.
    if (p.get_pg_num_target() == p.get_pgp_num_target()) {
      p.set_pgp_num_target(n);
    }
    p.set_pg_num_target(n);
  } else if (var == "pgp_num_actual") {
    // Directly adjust the actual pgp_num (normally driven by the mgr).
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    p.set_pgp_num_target(n);
  } else if (var == "pg_autoscale_mode") {
    n = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (n < 0) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    p.pg_autoscale_mode = n;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    // The rule must be usable with this pool's type and size.
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // Simple boolean pool flags.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    // Toggling hashpspool remaps every PG, so require explicit consent.
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    bool force = false;
    cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    // False-positive probability only applies to bloom hit sets.
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // One-way switch: once on GMT hit sets there is no going back.
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      // One-way switch as well.
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    // Stored in micro units (uf = f * 1e6).
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic pool options stored in p.opts; "unset" removes the option.
    // Each option first gets value validation, then is stored below
    // according to its declared type (STR/INT/DOUBLE).
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Store (or remove) the validated option by its declared type.
    // Note: a value of 0 (or 0.0) unsets INT/DOUBLE options.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // Stage the modified pool in the pending incremental.
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
7663
// Prepare-phase entry point for 'osd pool application ...' commands:
// applies the change to the pending pool state (modified flag not needed).
int OSDMonitor::prepare_command_pool_application(const string &prefix,
						 const cmdmap_t& cmdmap,
						 stringstream& ss)
{
  return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
}
7670
// Preprocess-phase entry point for 'osd pool application ...' commands:
// only validates and reports via *modified whether the command would
// change pool state; does not touch pending_inc.
int OSDMonitor::preprocess_command_pool_application(const string &prefix,
						    const cmdmap_t& cmdmap,
						    stringstream& ss,
						    bool *modified)
{
  return _command_pool_application(prefix, cmdmap, ss, modified, false);
}
7678
7679
7680 /**
7681 * Common logic for preprocess and prepare phases of pool application
7682 * tag commands. In preprocess mode we're only detecting invalid
7683 * commands, and determining whether it was a modification or a no-op.
7684 * In prepare mode we're actually updating the pending state.
7685 */
7686 int OSDMonitor::_command_pool_application(const string &prefix,
7687 const cmdmap_t& cmdmap,
7688 stringstream& ss,
7689 bool *modified,
7690 bool preparing)
7691 {
7692 string pool_name;
7693 cmd_getval(cct, cmdmap, "pool", pool_name);
7694 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
7695 if (pool < 0) {
7696 ss << "unrecognized pool '" << pool_name << "'";
7697 return -ENOENT;
7698 }
7699
7700 pg_pool_t p = *osdmap.get_pg_pool(pool);
7701 if (preparing) {
7702 if (pending_inc.new_pools.count(pool)) {
7703 p = pending_inc.new_pools[pool];
7704 }
7705 }
7706
7707 string app;
7708 cmd_getval(cct, cmdmap, "app", app);
7709 bool app_exists = (p.application_metadata.count(app) > 0);
7710
7711 string key;
7712 cmd_getval(cct, cmdmap, "key", key);
7713 if (key == "all") {
7714 ss << "key cannot be 'all'";
7715 return -EINVAL;
7716 }
7717
7718 string value;
7719 cmd_getval(cct, cmdmap, "value", value);
7720 if (value == "all") {
7721 ss << "value cannot be 'all'";
7722 return -EINVAL;
7723 }
7724
7725 if (boost::algorithm::ends_with(prefix, "enable")) {
7726 if (app.empty()) {
7727 ss << "application name must be provided";
7728 return -EINVAL;
7729 }
7730
7731 if (p.is_tier()) {
7732 ss << "application must be enabled on base tier";
7733 return -EINVAL;
7734 }
7735
7736 bool force = false;
7737 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7738
7739 if (!app_exists && !p.application_metadata.empty() && !force) {
7740 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
7741 << "application; pass --yes-i-really-mean-it to proceed anyway";
7742 return -EPERM;
7743 }
7744
7745 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
7746 ss << "too many enabled applications on pool '" << pool_name << "'; "
7747 << "max " << MAX_POOL_APPLICATIONS;
7748 return -EINVAL;
7749 }
7750
7751 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
7752 ss << "application name '" << app << "' too long; max length "
7753 << MAX_POOL_APPLICATION_LENGTH;
7754 return -EINVAL;
7755 }
7756
7757 if (!app_exists) {
7758 p.application_metadata[app] = {};
7759 }
7760 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
7761
7762 } else if (boost::algorithm::ends_with(prefix, "disable")) {
7763 bool force = false;
7764 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
7765
7766 if (!force) {
7767 ss << "Are you SURE? Disabling an application within a pool might result "
7768 << "in loss of application functionality; pass "
7769 << "--yes-i-really-mean-it to proceed anyway";
7770 return -EPERM;
7771 }
7772
7773 if (!app_exists) {
7774 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7775 << "'";
7776 return 0; // idempotent
7777 }
7778
7779 p.application_metadata.erase(app);
7780 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
7781
7782 } else if (boost::algorithm::ends_with(prefix, "set")) {
7783 if (p.is_tier()) {
7784 ss << "application metadata must be set on base tier";
7785 return -EINVAL;
7786 }
7787
7788 if (!app_exists) {
7789 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7790 << "'";
7791 return -ENOENT;
7792 }
7793
7794 string key;
7795 cmd_getval(cct, cmdmap, "key", key);
7796
7797 if (key.empty()) {
7798 ss << "key must be provided";
7799 return -EINVAL;
7800 }
7801
7802 auto &app_keys = p.application_metadata[app];
7803 if (app_keys.count(key) == 0 &&
7804 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
7805 ss << "too many keys set for application '" << app << "' on pool '"
7806 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
7807 return -EINVAL;
7808 }
7809
7810 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
7811 ss << "key '" << app << "' too long; max length "
7812 << MAX_POOL_APPLICATION_LENGTH;
7813 return -EINVAL;
7814 }
7815
7816 string value;
7817 cmd_getval(cct, cmdmap, "value", value);
7818 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
7819 ss << "value '" << value << "' too long; max length "
7820 << MAX_POOL_APPLICATION_LENGTH;
7821 return -EINVAL;
7822 }
7823
7824 p.application_metadata[app][key] = value;
7825 ss << "set application '" << app << "' key '" << key << "' to '"
7826 << value << "' on pool '" << pool_name << "'";
7827 } else if (boost::algorithm::ends_with(prefix, "rm")) {
7828 if (!app_exists) {
7829 ss << "application '" << app << "' is not enabled on pool '" << pool_name
7830 << "'";
7831 return -ENOENT;
7832 }
7833
7834 string key;
7835 cmd_getval(cct, cmdmap, "key", key);
7836 auto it = p.application_metadata[app].find(key);
7837 if (it == p.application_metadata[app].end()) {
7838 ss << "application '" << app << "' on pool '" << pool_name
7839 << "' does not have key '" << key << "'";
7840 return 0; // idempotent
7841 }
7842
7843 p.application_metadata[app].erase(it);
7844 ss << "removed application '" << app << "' key '" << key << "' on pool '"
7845 << pool_name << "'";
7846 } else {
7847 ceph_abort();
7848 }
7849
7850 if (preparing) {
7851 p.last_change = pending_inc.epoch;
7852 pending_inc.new_pools[pool] = p;
7853 }
7854
7855 // Because we fell through this far, we didn't hit no-op cases,
7856 // so pool was definitely modified
7857 if (modified != nullptr) {
7858 *modified = true;
7859 }
7860
7861 return 0;
7862 }
7863
7864 int OSDMonitor::_prepare_command_osd_crush_remove(
7865 CrushWrapper &newcrush,
7866 int32_t id,
7867 int32_t ancestor,
7868 bool has_ancestor,
7869 bool unlink_only)
7870 {
7871 int err = 0;
7872
7873 if (has_ancestor) {
7874 err = newcrush.remove_item_under(cct, id, ancestor,
7875 unlink_only);
7876 } else {
7877 err = newcrush.remove_item(cct, id, unlink_only);
7878 }
7879 return err;
7880 }
7881
7882 void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
7883 {
7884 pending_inc.crush.clear();
7885 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7886 }
7887
7888 int OSDMonitor::prepare_command_osd_crush_remove(
7889 CrushWrapper &newcrush,
7890 int32_t id,
7891 int32_t ancestor,
7892 bool has_ancestor,
7893 bool unlink_only)
7894 {
7895 int err = _prepare_command_osd_crush_remove(
7896 newcrush, id, ancestor,
7897 has_ancestor, unlink_only);
7898
7899 if (err < 0)
7900 return err;
7901
7902 ceph_assert(err == 0);
7903 do_osd_crush_remove(newcrush);
7904
7905 return 0;
7906 }
7907
7908 int OSDMonitor::prepare_command_osd_remove(int32_t id)
7909 {
7910 if (osdmap.is_up(id)) {
7911 return -EBUSY;
7912 }
7913
7914 pending_inc.new_state[id] = osdmap.get_state(id);
7915 pending_inc.new_uuid[id] = uuid_d();
7916 pending_metadata_rm.insert(id);
7917 pending_metadata.erase(id);
7918
7919 return 0;
7920 }
7921
7922 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
7923 {
7924 ceph_assert(existing_id);
7925 *existing_id = -1;
7926
7927 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
7928 if (!osdmap.exists(i) &&
7929 pending_inc.new_up_client.count(i) == 0 &&
7930 (pending_inc.new_state.count(i) == 0 ||
7931 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
7932 *existing_id = i;
7933 return -1;
7934 }
7935 }
7936
7937 if (pending_inc.new_max_osd < 0) {
7938 return osdmap.get_max_osd();
7939 }
7940 return pending_inc.new_max_osd;
7941 }
7942
// Stage the creation of an osd in the pending incremental.
//
// Preconditions: the (id, uuid) pair has already been vetted by
// validate_osd_create() or equivalent -- this function asserts rather
// than returning errors.
//
// @param id            requested osd id, or -1 to have one chosen
// @param uuid          osd uuid; may be zero for legacy `osd create`
// @param device_class  if non-empty, also set the osd's crush device class
// @param new_id [out]  the id actually used; always set to >= 0 on return
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; reuse its id (must match `id` if given)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a hole in the id space; exactly one of existing/allocated
    // may be valid at a time
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    // _allocate_osd_id must report a hole or a fresh id
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage a crush update that records the osd's device class
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8031
8032 int OSDMonitor::validate_osd_create(
8033 const int32_t id,
8034 const uuid_d& uuid,
8035 const bool check_osd_exists,
8036 int32_t* existing_id,
8037 stringstream& ss)
8038 {
8039
8040 dout(10) << __func__ << " id " << id << " uuid " << uuid
8041 << " check_osd_exists " << check_osd_exists << dendl;
8042
8043 ceph_assert(existing_id);
8044
8045 if (id < 0 && uuid.is_zero()) {
8046 // we have nothing to validate
8047 *existing_id = -1;
8048 return 0;
8049 } else if (uuid.is_zero()) {
8050 // we have an id but we will ignore it - because that's what
8051 // `osd create` does.
8052 return 0;
8053 }
8054
8055 /*
8056 * This function will be used to validate whether we are able to
8057 * create a new osd when the `uuid` is specified.
8058 *
8059 * It will be used by both `osd create` and `osd new`, as the checks
8060 * are basically the same when it pertains to osd id and uuid validation.
8061 * However, `osd create` presumes an `uuid` is optional, for legacy
8062 * reasons, while `osd new` requires the `uuid` to be provided. This
8063 * means that `osd create` will not be idempotent if an `uuid` is not
8064 * provided, but we will always guarantee the idempotency of `osd new`.
8065 */
8066
8067 ceph_assert(!uuid.is_zero());
8068 if (pending_inc.identify_osd(uuid) >= 0) {
8069 // osd is about to exist
8070 return -EAGAIN;
8071 }
8072
8073 int32_t i = osdmap.identify_osd(uuid);
8074 if (i >= 0) {
8075 // osd already exists
8076 if (id >= 0 && i != id) {
8077 ss << "uuid " << uuid << " already in use for different id " << i;
8078 return -EEXIST;
8079 }
8080 // return a positive errno to distinguish between a blocking error
8081 // and an error we consider to not be a problem (i.e., this would be
8082 // an idempotent operation).
8083 *existing_id = i;
8084 return EEXIST;
8085 }
8086 // i < 0
8087 if (id >= 0) {
8088 if (pending_inc.new_state.count(id)) {
8089 // osd is about to exist
8090 return -EAGAIN;
8091 }
8092 // we may not care if an osd exists if we are recreating a previously
8093 // destroyed osd.
8094 if (check_osd_exists && osdmap.exists(id)) {
8095 ss << "id " << id << " already in use and does not match uuid "
8096 << uuid;
8097 return -EINVAL;
8098 }
8099 }
8100 return 0;
8101 }
8102
8103 int OSDMonitor::prepare_command_osd_create(
8104 const int32_t id,
8105 const uuid_d& uuid,
8106 int32_t* existing_id,
8107 stringstream& ss)
8108 {
8109 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8110 ceph_assert(existing_id);
8111 if (osdmap.is_destroyed(id)) {
8112 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8113 "instead.";
8114 return -EINVAL;
8115 }
8116
8117 if (uuid.is_zero()) {
8118 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8119 }
8120
8121 return validate_osd_create(id, uuid, true, existing_id, ss);
8122 }
8123
// Handle `osd new`: create a brand-new osd, or recreate a previously
// destroyed one, staging osdmap, auth, and (optionally) config-key
// updates.
//
// @param op      the originating monitor op (paxos must be plugged)
// @param cmdmap  parsed command arguments (uuid required, id optional)
// @param params  optional secrets/attributes: cephx_secret,
//                cephx_lockbox_secret, dmcrypt_key, crush_device_class
// @param ss      human-readable output / error detail
// @param f       optional formatter for structured output
// @return 0 on staged creation, positive EEXIST when fully idempotent
//         (nothing staged), negative errno on error.
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cct, cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // a hole in the id space was found instead of a fresh id
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    // positive EEXIST: caller knows nothing was staged
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the two was supplied; they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // incremental state bits toggle the current state when applied, so
    // OR-ing DESTROYED here clears the destroyed flag on this osd
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
8388
8389 bool OSDMonitor::prepare_command(MonOpRequestRef op)
8390 {
8391 op->mark_osdmon_event(__func__);
8392 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8393 stringstream ss;
8394 cmdmap_t cmdmap;
8395 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
8396 string rs = ss.str();
8397 mon->reply_command(op, -EINVAL, rs, get_last_committed());
8398 return true;
8399 }
8400
8401 MonSession *session = op->get_session();
8402 if (!session) {
8403 derr << __func__ << " no session" << dendl;
8404 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
8405 return true;
8406 }
8407
8408 return prepare_command_impl(op, cmdmap);
8409 }
8410
8411 static int parse_reweights(CephContext *cct,
8412 const cmdmap_t& cmdmap,
8413 const OSDMap& osdmap,
8414 map<int32_t, uint32_t>* weights)
8415 {
8416 string weights_str;
8417 if (!cmd_getval(cct, cmdmap, "weights", weights_str)) {
8418 return -EINVAL;
8419 }
8420 std::replace(begin(weights_str), end(weights_str), '\'', '"');
8421 json_spirit::mValue json_value;
8422 if (!json_spirit::read(weights_str, json_value)) {
8423 return -EINVAL;
8424 }
8425 if (json_value.type() != json_spirit::obj_type) {
8426 return -EINVAL;
8427 }
8428 const auto obj = json_value.get_obj();
8429 try {
8430 for (auto& osd_weight : obj) {
8431 auto osd_id = std::stoi(osd_weight.first);
8432 if (!osdmap.exists(osd_id)) {
8433 return -ENOENT;
8434 }
8435 if (osd_weight.second.type() != json_spirit::str_type) {
8436 return -EINVAL;
8437 }
8438 auto weight = std::stoul(osd_weight.second.get_str());
8439 weights->insert({osd_id, weight});
8440 }
8441 } catch (const std::logic_error& e) {
8442 return -EINVAL;
8443 }
8444 return 0;
8445 }
8446
// Stage destruction of osd `id`: revoke its auth entities, drop its
// config-key data, and mark it DESTROYED in the pending incremental.
// Idempotent per service: auth/config-key state that is already gone is
// tolerated.  The caller is responsible for proposing the change.
//
// @return 0 on success (or already destroyed), -ENOENT if the osd does
//         not exist at all, other negative errno on auth failure.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // track, per service, whether there is actually anything left to remove
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // no auth entities for this osd; nothing to revoke
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // -ENOENT is the only failure validate_osd_destroy can report here;
    // it means there is no config-key data to remove
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // mark destroyed and wipe the uuid in the pending osdmap
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
8518
// Purge osd `id`: remove it from crush, destroy it (auth + config keys),
// and remove it from the osdmap, all staged into the pending incremental.
// The osd must be down.  The caller is responsible for proposing.
//
// @return 0 on success, -ENOENT if there was nothing to purge at all,
//         other negative errno on failure (with nothing staged).
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: attempt the crush removal on a scratch copy; staging is
  // deferred to step 4 so a later failure cannot leave a partial update
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush; may already have been purged
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: revoke auth/config-key state and mark DESTROYED
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: now stage the crush update prepared in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
8587
8588 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
8589 const cmdmap_t& cmdmap)
8590 {
8591 op->mark_osdmon_event(__func__);
8592 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
8593 bool ret = false;
8594 stringstream ss;
8595 string rs;
8596 bufferlist rdata;
8597 int err = 0;
8598
8599 string format;
8600 cmd_getval(cct, cmdmap, "format", format, string("plain"));
8601 boost::scoped_ptr<Formatter> f(Formatter::create(format));
8602
8603 string prefix;
8604 cmd_getval(cct, cmdmap, "prefix", prefix);
8605
8606 int64_t osdid;
8607 string osd_name;
8608 bool osdid_present = false;
8609 if (prefix != "osd pg-temp" &&
8610 prefix != "osd pg-upmap" &&
8611 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
8612 osdid_present = cmd_getval(cct, cmdmap, "id", osdid);
8613 }
8614 if (osdid_present) {
8615 ostringstream oss;
8616 oss << "osd." << osdid;
8617 osd_name = oss.str();
8618 }
8619
8620 // Even if there's a pending state with changes that could affect
8621 // a command, considering that said state isn't yet committed, we
8622 // just don't care about those changes if the command currently being
8623 // handled acts as a no-op against the current committed state.
8624 // In a nutshell, we assume this command happens *before*.
8625 //
8626 // Let me make this clearer:
8627 //
8628 // - If we have only one client, and that client issues some
8629 // operation that would conflict with this operation but is
8630 // still on the pending state, then we would be sure that said
8631 // operation wouldn't have returned yet, so the client wouldn't
8632 // issue this operation (unless the client didn't wait for the
8633 // operation to finish, and that would be the client's own fault).
8634 //
8635 // - If we have more than one client, each client will observe
8636 // whatever is the state at the moment of the commit. So, if we
8637 // have two clients, one issuing an unlink and another issuing a
8638 // link, and if the link happens while the unlink is still on the
8639 // pending state, from the link's point-of-view this is a no-op.
8640 // If different clients are issuing conflicting operations and
8641 // they care about that, then the clients should make sure they
8642 // enforce some kind of concurrency mechanism -- from our
8643 // perspective that's what Douglas Adams would call an SEP.
8644 //
8645 // This should be used as a general guideline for most commands handled
8646 // in this function. Adapt as you see fit, but please bear in mind that
8647 // this is the expected behavior.
8648
8649
8650 if (prefix == "osd setcrushmap" ||
8651 (prefix == "osd crush set" && !osdid_present)) {
8652 if (pending_inc.crush.length()) {
8653 dout(10) << __func__ << " waiting for pending crush update " << dendl;
8654 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
8655 return true;
8656 }
8657 dout(10) << "prepare_command setting new crush map" << dendl;
8658 bufferlist data(m->get_data());
8659 CrushWrapper crush;
8660 try {
8661 auto bl = data.cbegin();
8662 crush.decode(bl);
8663 }
8664 catch (const std::exception &e) {
8665 err = -EINVAL;
8666 ss << "Failed to parse crushmap: " << e.what();
8667 goto reply;
8668 }
8669
8670 int64_t prior_version = 0;
8671 if (cmd_getval(cct, cmdmap, "prior_version", prior_version)) {
8672 if (prior_version == osdmap.get_crush_version() - 1) {
8673 // see if we are a resend of the last update. this is imperfect
8674 // (multiple racing updaters may not both get reliable success)
8675 // but we expect crush updaters (via this interface) to be rare-ish.
8676 bufferlist current, proposed;
8677 osdmap.crush->encode(current, mon->get_quorum_con_features());
8678 crush.encode(proposed, mon->get_quorum_con_features());
8679 if (current.contents_equal(proposed)) {
8680 dout(10) << __func__
8681 << " proposed matches current and version equals previous"
8682 << dendl;
8683 err = 0;
8684 ss << osdmap.get_crush_version();
8685 goto reply;
8686 }
8687 }
8688 if (prior_version != osdmap.get_crush_version()) {
8689 err = -EPERM;
8690 ss << "prior_version " << prior_version << " != crush version "
8691 << osdmap.get_crush_version();
8692 goto reply;
8693 }
8694 }
8695
8696 if (crush.has_legacy_rule_ids()) {
8697 err = -EINVAL;
8698 ss << "crush maps with ruleset != ruleid are no longer allowed";
8699 goto reply;
8700 }
8701 if (!validate_crush_against_features(&crush, ss)) {
8702 err = -EINVAL;
8703 goto reply;
8704 }
8705
8706 err = osdmap.validate_crush_rules(&crush, &ss);
8707 if (err < 0) {
8708 goto reply;
8709 }
8710
8711 if (g_conf()->mon_osd_crush_smoke_test) {
8712 // sanity check: test some inputs to make sure this map isn't
8713 // totally broken
8714 dout(10) << " testing map" << dendl;
8715 stringstream ess;
8716 CrushTester tester(crush, ess);
8717 tester.set_min_x(0);
8718 tester.set_max_x(50);
8719 auto start = ceph::coarse_mono_clock::now();
8720 int r = tester.test_with_fork(g_conf()->mon_lease);
8721 auto duration = ceph::coarse_mono_clock::now() - start;
8722 if (r < 0) {
8723 dout(10) << " tester.test_with_fork returns " << r
8724 << ": " << ess.str() << dendl;
8725 ss << "crush smoke test failed with " << r << ": " << ess.str();
8726 err = r;
8727 goto reply;
8728 }
8729 dout(10) << __func__ << " crush somke test duration: "
8730 << duration << ", result: " << ess.str() << dendl;
8731 }
8732
8733 pending_inc.crush = data;
8734 ss << osdmap.get_crush_version() + 1;
8735 goto update;
8736
8737 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
8738 CrushWrapper newcrush;
8739 _get_pending_crush(newcrush);
8740 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
8741 int bid = -1 - b;
8742 if (newcrush.bucket_exists(bid) &&
8743 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
8744 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
8745 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
8746 }
8747 }
8748 if (!validate_crush_against_features(&newcrush, ss)) {
8749 err = -EINVAL;
8750 goto reply;
8751 }
8752 pending_inc.crush.clear();
8753 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8754 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
8755 get_last_committed() + 1));
8756 return true;
8757 } else if (prefix == "osd crush set-device-class") {
8758 string device_class;
8759 if (!cmd_getval(cct, cmdmap, "class", device_class)) {
8760 err = -EINVAL; // no value!
8761 goto reply;
8762 }
8763
8764 bool stop = false;
8765 vector<string> idvec;
8766 cmd_getval(cct, cmdmap, "ids", idvec);
8767 CrushWrapper newcrush;
8768 _get_pending_crush(newcrush);
8769 set<int> updated;
8770 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8771 set<int> osds;
8772 // wildcard?
8773 if (j == 0 &&
8774 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8775 osdmap.get_all_osds(osds);
8776 stop = true;
8777 } else {
8778 // try traditional single osd way
8779 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8780 if (osd < 0) {
8781 // ss has reason for failure
8782 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8783 err = -EINVAL;
8784 continue;
8785 }
8786 osds.insert(osd);
8787 }
8788
8789 for (auto &osd : osds) {
8790 if (!osdmap.exists(osd)) {
8791 ss << "osd." << osd << " does not exist. ";
8792 continue;
8793 }
8794
8795 ostringstream oss;
8796 oss << "osd." << osd;
8797 string name = oss.str();
8798
8799 if (newcrush.get_max_devices() < osd + 1) {
8800 newcrush.set_max_devices(osd + 1);
8801 }
8802 string action;
8803 if (newcrush.item_exists(osd)) {
8804 action = "updating";
8805 } else {
8806 action = "creating";
8807 newcrush.set_item_name(osd, name);
8808 }
8809
8810 dout(5) << action << " crush item id " << osd << " name '" << name
8811 << "' device_class '" << device_class << "'"
8812 << dendl;
8813 err = newcrush.update_device_class(osd, device_class, name, &ss);
8814 if (err < 0) {
8815 goto reply;
8816 }
8817 if (err == 0 && !_have_pending_crush()) {
8818 if (!stop) {
8819 // for single osd only, wildcard makes too much noise
8820 ss << "set-device-class item id " << osd << " name '" << name
8821 << "' device_class '" << device_class << "': no change. ";
8822 }
8823 } else {
8824 updated.insert(osd);
8825 }
8826 }
8827 }
8828
8829 if (!updated.empty()) {
8830 pending_inc.crush.clear();
8831 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8832 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
8833 getline(ss, rs);
8834 wait_for_finished_proposal(op,
8835 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8836 return true;
8837 }
8838
8839 } else if (prefix == "osd crush rm-device-class") {
8840 bool stop = false;
8841 vector<string> idvec;
8842 cmd_getval(cct, cmdmap, "ids", idvec);
8843 CrushWrapper newcrush;
8844 _get_pending_crush(newcrush);
8845 set<int> updated;
8846
8847 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
8848 set<int> osds;
8849
8850 // wildcard?
8851 if (j == 0 &&
8852 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
8853 osdmap.get_all_osds(osds);
8854 stop = true;
8855 } else {
8856 // try traditional single osd way
8857 long osd = parse_osd_id(idvec[j].c_str(), &ss);
8858 if (osd < 0) {
8859 // ss has reason for failure
8860 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
8861 err = -EINVAL;
8862 goto reply;
8863 }
8864 osds.insert(osd);
8865 }
8866
8867 for (auto &osd : osds) {
8868 if (!osdmap.exists(osd)) {
8869 ss << "osd." << osd << " does not exist. ";
8870 continue;
8871 }
8872
8873 auto class_name = newcrush.get_item_class(osd);
8874 if (!class_name) {
8875 ss << "osd." << osd << " belongs to no class, ";
8876 continue;
8877 }
8878 // note that we deliberately do not verify class_is_in_use here,
8879 // in case the device is misclassified and the user wants
8880 // to forcibly reset its class...
8881
8882 err = newcrush.remove_device_class(cct, osd, &ss);
8883 if (err < 0) {
8884 // ss has reason for failure
8885 goto reply;
8886 }
8887 updated.insert(osd);
8888 }
8889 }
8890
8891 if (!updated.empty()) {
8892 pending_inc.crush.clear();
8893 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8894 ss << "done removing class of osd(s): " << updated;
8895 getline(ss, rs);
8896 wait_for_finished_proposal(op,
8897 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
8898 return true;
8899 }
8900 } else if (prefix == "osd crush class create") {
8901 string device_class;
8902 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8903 err = -EINVAL; // no value!
8904 goto reply;
8905 }
8906 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8907 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8908 << "luminous' before using crush device classes";
8909 err = -EPERM;
8910 goto reply;
8911 }
8912 if (!_have_pending_crush() &&
8913 _get_stable_crush().class_exists(device_class)) {
8914 ss << "class '" << device_class << "' already exists";
8915 goto reply;
8916 }
8917 CrushWrapper newcrush;
8918 _get_pending_crush(newcrush);
8919 if (newcrush.class_exists(device_class)) {
8920 ss << "class '" << device_class << "' already exists";
8921 goto update;
8922 }
8923 int class_id = newcrush.get_or_create_class_id(device_class);
8924 pending_inc.crush.clear();
8925 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
8926 ss << "created class " << device_class << " with id " << class_id
8927 << " to crush map";
8928 goto update;
8929 } else if (prefix == "osd crush class rm") {
8930 string device_class;
8931 if (!cmd_getval(g_ceph_context, cmdmap, "class", device_class)) {
8932 err = -EINVAL; // no value!
8933 goto reply;
8934 }
8935 if (osdmap.require_osd_release < CEPH_RELEASE_LUMINOUS) {
8936 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
8937 << "luminous' before using crush device classes";
8938 err = -EPERM;
8939 goto reply;
8940 }
8941
8942 if (!osdmap.crush->class_exists(device_class)) {
8943 err = 0;
8944 goto reply;
8945 }
8946
8947 CrushWrapper newcrush;
8948 _get_pending_crush(newcrush);
8949 if (!newcrush.class_exists(device_class)) {
8950 err = 0; // make command idempotent
8951 goto wait;
8952 }
8953 int class_id = newcrush.get_class_id(device_class);
8954 stringstream ts;
8955 if (newcrush.class_is_in_use(class_id, &ts)) {
8956 err = -EBUSY;
8957 ss << "class '" << device_class << "' " << ts.str();
8958 goto reply;
8959 }
8960
8961 // check if class is used by any erasure-code-profiles
8962 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
8963 osdmap.get_erasure_code_profiles();
8964 auto ec_profiles = pending_inc.get_erasure_code_profiles();
8965 #ifdef HAVE_STDLIB_MAP_SPLICING
8966 ec_profiles.merge(old_ec_profiles);
8967 #else
8968 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
8969 make_move_iterator(end(old_ec_profiles)));
8970 #endif
8971 list<string> referenced_by;
8972 for (auto &i: ec_profiles) {
8973 for (auto &j: i.second) {
8974 if ("crush-device-class" == j.first && device_class == j.second) {
8975 referenced_by.push_back(i.first);
8976 }
8977 }
8978 }
8979 if (!referenced_by.empty()) {
8980 err = -EBUSY;
8981 ss << "class '" << device_class
8982 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
8983 goto reply;
8984 }
8985
8986 set<int> osds;
8987 newcrush.get_devices_by_class(device_class, &osds);
8988 for (auto& p: osds) {
8989 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
8990 if (err < 0) {
8991 // ss has reason for failure
8992 goto reply;
8993 }
8994 }
8995
8996 if (osds.empty()) {
8997 // empty class, remove directly
8998 err = newcrush.remove_class_name(device_class);
8999 if (err < 0) {
9000 ss << "class '" << device_class << "' cannot be removed '"
9001 << cpp_strerror(err) << "'";
9002 goto reply;
9003 }
9004 }
9005
9006 pending_inc.crush.clear();
9007 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9008 ss << "removed class " << device_class << " with id " << class_id
9009 << " from crush map";
9010 goto update;
9011 } else if (prefix == "osd crush class rename") {
9012 string srcname, dstname;
9013 if (!cmd_getval(cct, cmdmap, "srcname", srcname)) {
9014 err = -EINVAL;
9015 goto reply;
9016 }
9017 if (!cmd_getval(cct, cmdmap, "dstname", dstname)) {
9018 err = -EINVAL;
9019 goto reply;
9020 }
9021
9022 CrushWrapper newcrush;
9023 _get_pending_crush(newcrush);
9024 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
9025 // suppose this is a replay and return success
9026 // so command is idempotent
9027 ss << "already renamed to '" << dstname << "'";
9028 err = 0;
9029 goto reply;
9030 }
9031
9032 err = newcrush.rename_class(srcname, dstname);
9033 if (err < 0) {
9034 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
9035 << cpp_strerror(err);
9036 goto reply;
9037 }
9038
9039 pending_inc.crush.clear();
9040 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9041 ss << "rename class '" << srcname << "' to '" << dstname << "'";
9042 goto update;
9043 } else if (prefix == "osd crush add-bucket") {
9044 // osd crush add-bucket <name> <type>
9045 string name, typestr;
9046 vector<string> argvec;
9047 cmd_getval(cct, cmdmap, "name", name);
9048 cmd_getval(cct, cmdmap, "type", typestr);
9049 cmd_getval(cct, cmdmap, "args", argvec);
9050 map<string,string> loc;
9051 if (!argvec.empty()) {
9052 CrushWrapper::parse_loc_map(argvec, &loc);
9053 dout(0) << "will create and move bucket '" << name
9054 << "' to location " << loc << dendl;
9055 }
9056
9057 if (!_have_pending_crush() &&
9058 _get_stable_crush().name_exists(name)) {
9059 ss << "bucket '" << name << "' already exists";
9060 goto reply;
9061 }
9062
9063 CrushWrapper newcrush;
9064 _get_pending_crush(newcrush);
9065
9066 if (newcrush.name_exists(name)) {
9067 ss << "bucket '" << name << "' already exists";
9068 goto update;
9069 }
9070 int type = newcrush.get_type_id(typestr);
9071 if (type < 0) {
9072 ss << "type '" << typestr << "' does not exist";
9073 err = -EINVAL;
9074 goto reply;
9075 }
9076 if (type == 0) {
9077 ss << "type '" << typestr << "' is for devices, not buckets";
9078 err = -EINVAL;
9079 goto reply;
9080 }
9081 int bucketno;
9082 err = newcrush.add_bucket(0, 0,
9083 CRUSH_HASH_DEFAULT, type, 0, NULL,
9084 NULL, &bucketno);
9085 if (err < 0) {
9086 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
9087 goto reply;
9088 }
9089 err = newcrush.set_item_name(bucketno, name);
9090 if (err < 0) {
9091 ss << "error setting bucket name to '" << name << "'";
9092 goto reply;
9093 }
9094
9095 if (!loc.empty()) {
9096 if (!newcrush.check_item_loc(cct, bucketno, loc,
9097 (int *)NULL)) {
9098 err = newcrush.move_bucket(cct, bucketno, loc);
9099 if (err < 0) {
9100 ss << "error moving bucket '" << name << "' to location " << loc;
9101 goto reply;
9102 }
9103 } else {
9104 ss << "no need to move item id " << bucketno << " name '" << name
9105 << "' to location " << loc << " in crush map";
9106 }
9107 }
9108
9109 pending_inc.crush.clear();
9110 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9111 if (loc.empty()) {
9112 ss << "added bucket " << name << " type " << typestr
9113 << " to crush map";
9114 } else {
9115 ss << "added bucket " << name << " type " << typestr
9116 << " to location " << loc;
9117 }
9118 goto update;
9119 } else if (prefix == "osd crush rename-bucket") {
9120 string srcname, dstname;
9121 cmd_getval(cct, cmdmap, "srcname", srcname);
9122 cmd_getval(cct, cmdmap, "dstname", dstname);
9123
9124 err = crush_rename_bucket(srcname, dstname, &ss);
9125 if (err == -EALREADY) // equivalent to success for idempotency
9126 err = 0;
9127 if (err)
9128 goto reply;
9129 else
9130 goto update;
9131 } else if (prefix == "osd crush weight-set create" ||
9132 prefix == "osd crush weight-set create-compat") {
9133 CrushWrapper newcrush;
9134 _get_pending_crush(newcrush);
9135 int64_t pool;
9136 int positions;
9137 if (newcrush.has_non_straw2_buckets()) {
9138 ss << "crush map contains one or more bucket(s) that are not straw2";
9139 err = -EPERM;
9140 goto reply;
9141 }
9142 if (prefix == "osd crush weight-set create") {
9143 if (osdmap.require_min_compat_client > 0 &&
9144 osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
9145 ss << "require_min_compat_client "
9146 << ceph_release_name(osdmap.require_min_compat_client)
9147 << " < luminous, which is required for per-pool weight-sets. "
9148 << "Try 'ceph osd set-require-min-compat-client luminous' "
9149 << "before using the new interface";
9150 err = -EPERM;
9151 goto reply;
9152 }
9153 string poolname, mode;
9154 cmd_getval(cct, cmdmap, "pool", poolname);
9155 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9156 if (pool < 0) {
9157 ss << "pool '" << poolname << "' not found";
9158 err = -ENOENT;
9159 goto reply;
9160 }
9161 cmd_getval(cct, cmdmap, "mode", mode);
9162 if (mode != "flat" && mode != "positional") {
9163 ss << "unrecognized weight-set mode '" << mode << "'";
9164 err = -EINVAL;
9165 goto reply;
9166 }
9167 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
9168 } else {
9169 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9170 positions = 1;
9171 }
9172 if (!newcrush.create_choose_args(pool, positions)) {
9173 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
9174 ss << "compat weight-set already created";
9175 } else {
9176 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
9177 << "' already created";
9178 }
9179 goto reply;
9180 }
9181 pending_inc.crush.clear();
9182 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9183 goto update;
9184
9185 } else if (prefix == "osd crush weight-set rm" ||
9186 prefix == "osd crush weight-set rm-compat") {
9187 CrushWrapper newcrush;
9188 _get_pending_crush(newcrush);
9189 int64_t pool;
9190 if (prefix == "osd crush weight-set rm") {
9191 string poolname;
9192 cmd_getval(cct, cmdmap, "pool", poolname);
9193 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9194 if (pool < 0) {
9195 ss << "pool '" << poolname << "' not found";
9196 err = -ENOENT;
9197 goto reply;
9198 }
9199 } else {
9200 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9201 }
9202 newcrush.rm_choose_args(pool);
9203 pending_inc.crush.clear();
9204 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9205 goto update;
9206
9207 } else if (prefix == "osd crush weight-set reweight" ||
9208 prefix == "osd crush weight-set reweight-compat") {
9209 string poolname, item;
9210 vector<double> weight;
9211 cmd_getval(cct, cmdmap, "pool", poolname);
9212 cmd_getval(cct, cmdmap, "item", item);
9213 cmd_getval(cct, cmdmap, "weight", weight);
9214 CrushWrapper newcrush;
9215 _get_pending_crush(newcrush);
9216 int64_t pool;
9217 if (prefix == "osd crush weight-set reweight") {
9218 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
9219 if (pool < 0) {
9220 ss << "pool '" << poolname << "' not found";
9221 err = -ENOENT;
9222 goto reply;
9223 }
9224 if (!newcrush.have_choose_args(pool)) {
9225 ss << "no weight-set for pool '" << poolname << "'";
9226 err = -ENOENT;
9227 goto reply;
9228 }
9229 auto arg_map = newcrush.choose_args_get(pool);
9230 int positions = newcrush.get_choose_args_positions(arg_map);
9231 if (weight.size() != (size_t)positions) {
9232 ss << "must specify exact " << positions << " weight values";
9233 err = -EINVAL;
9234 goto reply;
9235 }
9236 } else {
9237 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
9238 if (!newcrush.have_choose_args(pool)) {
9239 ss << "no backward-compatible weight-set";
9240 err = -ENOENT;
9241 goto reply;
9242 }
9243 }
9244 if (!newcrush.name_exists(item)) {
9245 ss << "item '" << item << "' does not exist";
9246 err = -ENOENT;
9247 goto reply;
9248 }
9249 err = newcrush.choose_args_adjust_item_weightf(
9250 cct,
9251 newcrush.choose_args_get(pool),
9252 newcrush.get_item_id(item),
9253 weight,
9254 &ss);
9255 if (err < 0) {
9256 goto reply;
9257 }
9258 err = 0;
9259 pending_inc.crush.clear();
9260 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9261 goto update;
9262 } else if (osdid_present &&
9263 (prefix == "osd crush set" || prefix == "osd crush add")) {
9264 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
9265 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
9266 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
9267
9268 if (!osdmap.exists(osdid)) {
9269 err = -ENOENT;
9270 ss << osd_name
9271 << " does not exist. Create it before updating the crush map";
9272 goto reply;
9273 }
9274
9275 double weight;
9276 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9277 ss << "unable to parse weight value '"
9278 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9279 err = -EINVAL;
9280 goto reply;
9281 }
9282
9283 string args;
9284 vector<string> argvec;
9285 cmd_getval(cct, cmdmap, "args", argvec);
9286 map<string,string> loc;
9287 CrushWrapper::parse_loc_map(argvec, &loc);
9288
9289 if (prefix == "osd crush set"
9290 && !_get_stable_crush().item_exists(osdid)) {
9291 err = -ENOENT;
9292 ss << "unable to set item id " << osdid << " name '" << osd_name
9293 << "' weight " << weight << " at location " << loc
9294 << ": does not exist";
9295 goto reply;
9296 }
9297
9298 dout(5) << "adding/updating crush item id " << osdid << " name '"
9299 << osd_name << "' weight " << weight << " at location "
9300 << loc << dendl;
9301 CrushWrapper newcrush;
9302 _get_pending_crush(newcrush);
9303
9304 string action;
9305 if (prefix == "osd crush set" ||
9306 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
9307 action = "set";
9308 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
9309 } else {
9310 action = "add";
9311 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
9312 if (err == 0)
9313 err = 1;
9314 }
9315
9316 if (err < 0)
9317 goto reply;
9318
9319 if (err == 0 && !_have_pending_crush()) {
9320 ss << action << " item id " << osdid << " name '" << osd_name
9321 << "' weight " << weight << " at location " << loc << ": no change";
9322 goto reply;
9323 }
9324
9325 pending_inc.crush.clear();
9326 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9327 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
9328 << weight << " at location " << loc << " to crush map";
9329 getline(ss, rs);
9330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9331 get_last_committed() + 1));
9332 return true;
9333
9334 } else if (prefix == "osd crush create-or-move") {
9335 do {
9336 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
9337 if (!osdmap.exists(osdid)) {
9338 err = -ENOENT;
9339 ss << osd_name
9340 << " does not exist. create it before updating the crush map";
9341 goto reply;
9342 }
9343
9344 double weight;
9345 if (!cmd_getval(cct, cmdmap, "weight", weight)) {
9346 ss << "unable to parse weight value '"
9347 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9348 err = -EINVAL;
9349 goto reply;
9350 }
9351
9352 string args;
9353 vector<string> argvec;
9354 cmd_getval(cct, cmdmap, "args", argvec);
9355 map<string,string> loc;
9356 CrushWrapper::parse_loc_map(argvec, &loc);
9357
9358 dout(0) << "create-or-move crush item name '" << osd_name
9359 << "' initial_weight " << weight << " at location " << loc
9360 << dendl;
9361
9362 CrushWrapper newcrush;
9363 _get_pending_crush(newcrush);
9364
9365 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
9366 g_conf()->osd_crush_update_weight_set);
9367 if (err == 0) {
9368 ss << "create-or-move updated item name '" << osd_name
9369 << "' weight " << weight
9370 << " at location " << loc << " to crush map";
9371 break;
9372 }
9373 if (err > 0) {
9374 pending_inc.crush.clear();
9375 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9376 ss << "create-or-move updating item name '" << osd_name
9377 << "' weight " << weight
9378 << " at location " << loc << " to crush map";
9379 getline(ss, rs);
9380 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9381 get_last_committed() + 1));
9382 return true;
9383 }
9384 } while (false);
9385
9386 } else if (prefix == "osd crush move") {
9387 do {
9388 // osd crush move <name> <loc1> [<loc2> ...]
9389 string name;
9390 vector<string> argvec;
9391 cmd_getval(cct, cmdmap, "name", name);
9392 cmd_getval(cct, cmdmap, "args", argvec);
9393 map<string,string> loc;
9394 CrushWrapper::parse_loc_map(argvec, &loc);
9395
9396 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
9397 CrushWrapper newcrush;
9398 _get_pending_crush(newcrush);
9399
9400 if (!newcrush.name_exists(name)) {
9401 err = -ENOENT;
9402 ss << "item " << name << " does not exist";
9403 break;
9404 }
9405 int id = newcrush.get_item_id(name);
9406
9407 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9408 if (id >= 0) {
9409 err = newcrush.create_or_move_item(
9410 cct, id, 0, name, loc,
9411 g_conf()->osd_crush_update_weight_set);
9412 } else {
9413 err = newcrush.move_bucket(cct, id, loc);
9414 }
9415 if (err >= 0) {
9416 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9417 pending_inc.crush.clear();
9418 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9419 getline(ss, rs);
9420 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9421 get_last_committed() + 1));
9422 return true;
9423 }
9424 } else {
9425 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
9426 err = 0;
9427 }
9428 } while (false);
9429 } else if (prefix == "osd crush swap-bucket") {
9430 string source, dest;
9431 cmd_getval(cct, cmdmap, "source", source);
9432 cmd_getval(cct, cmdmap, "dest", dest);
9433
9434 bool force = false;
9435 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", force);
9436
9437 CrushWrapper newcrush;
9438 _get_pending_crush(newcrush);
9439 if (!newcrush.name_exists(source)) {
9440 ss << "source item " << source << " does not exist";
9441 err = -ENOENT;
9442 goto reply;
9443 }
9444 if (!newcrush.name_exists(dest)) {
9445 ss << "dest item " << dest << " does not exist";
9446 err = -ENOENT;
9447 goto reply;
9448 }
9449 int sid = newcrush.get_item_id(source);
9450 int did = newcrush.get_item_id(dest);
9451 int sparent;
9452 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
9453 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
9454 err = -EPERM;
9455 goto reply;
9456 }
9457 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
9458 !force) {
9459 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
9460 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
9461 << "; pass --yes-i-really-mean-it to proceed anyway";
9462 err = -EPERM;
9463 goto reply;
9464 }
9465 int r = newcrush.swap_bucket(cct, sid, did);
9466 if (r < 0) {
9467 ss << "failed to swap bucket contents: " << cpp_strerror(r);
9468 err = r;
9469 goto reply;
9470 }
9471 ss << "swapped bucket of " << source << " to " << dest;
9472 pending_inc.crush.clear();
9473 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9474 wait_for_finished_proposal(op,
9475 new Monitor::C_Command(mon, op, err, ss.str(),
9476 get_last_committed() + 1));
9477 return true;
9478 } else if (prefix == "osd crush link") {
9479 // osd crush link <name> <loc1> [<loc2> ...]
9480 string name;
9481 cmd_getval(cct, cmdmap, "name", name);
9482 vector<string> argvec;
9483 cmd_getval(cct, cmdmap, "args", argvec);
9484 map<string,string> loc;
9485 CrushWrapper::parse_loc_map(argvec, &loc);
9486
9487 // Need an explicit name_exists check because get_item_id returns
9488 // 0 when the name is not found.
9489 int id = osdmap.crush->get_item_id(name);
9490 if (!osdmap.crush->name_exists(name)) {
9491 err = -ENOENT;
9492 ss << "item " << name << " does not exist";
9493 goto reply;
9494 } else {
9495 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
9496 }
9497 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
9498 ss << "no need to move item id " << id << " name '" << name
9499 << "' to location " << loc << " in crush map";
9500 err = 0;
9501 goto reply;
9502 }
9503
9504 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
9505 CrushWrapper newcrush;
9506 _get_pending_crush(newcrush);
9507
9508 if (!newcrush.name_exists(name)) {
9509 err = -ENOENT;
9510 ss << "item " << name << " does not exist";
9511 goto reply;
9512 } else {
9513 int id = newcrush.get_item_id(name);
9514 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
9515 err = newcrush.link_bucket(cct, id, loc);
9516 if (err >= 0) {
9517 ss << "linked item id " << id << " name '" << name
9518 << "' to location " << loc << " in crush map";
9519 pending_inc.crush.clear();
9520 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9521 } else {
9522 ss << "cannot link item id " << id << " name '" << name
9523 << "' to location " << loc;
9524 goto reply;
9525 }
9526 } else {
9527 ss << "no need to move item id " << id << " name '" << name
9528 << "' to location " << loc << " in crush map";
9529 err = 0;
9530 }
9531 }
9532 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
9533 get_last_committed() + 1));
9534 return true;
9535 } else if (prefix == "osd crush rm" ||
9536 prefix == "osd crush remove" ||
9537 prefix == "osd crush unlink") {
9538 do {
9539 // osd crush rm <name> [ancestor]
9540 CrushWrapper newcrush;
9541 _get_pending_crush(newcrush);
9542
9543 string name;
9544 cmd_getval(cct, cmdmap, "name", name);
9545
9546 if (!osdmap.crush->name_exists(name)) {
9547 err = 0;
9548 ss << "device '" << name << "' does not appear in the crush map";
9549 break;
9550 }
9551 if (!newcrush.name_exists(name)) {
9552 err = 0;
9553 ss << "device '" << name << "' does not appear in the crush map";
9554 getline(ss, rs);
9555 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9556 get_last_committed() + 1));
9557 return true;
9558 }
9559 int id = newcrush.get_item_id(name);
9560 int ancestor = 0;
9561
9562 bool unlink_only = prefix == "osd crush unlink";
9563 string ancestor_str;
9564 if (cmd_getval(cct, cmdmap, "ancestor", ancestor_str)) {
9565 if (!newcrush.name_exists(ancestor_str)) {
9566 err = -ENOENT;
9567 ss << "ancestor item '" << ancestor_str
9568 << "' does not appear in the crush map";
9569 break;
9570 }
9571 ancestor = newcrush.get_item_id(ancestor_str);
9572 }
9573
9574 err = prepare_command_osd_crush_remove(
9575 newcrush,
9576 id, ancestor,
9577 (ancestor < 0), unlink_only);
9578
9579 if (err == -ENOENT) {
9580 ss << "item " << id << " does not appear in that position";
9581 err = 0;
9582 break;
9583 }
9584 if (err == 0) {
9585 if (!unlink_only)
9586 pending_inc.new_crush_node_flags[id] = 0;
9587 ss << "removed item id " << id << " name '" << name << "' from crush map";
9588 getline(ss, rs);
9589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9590 get_last_committed() + 1));
9591 return true;
9592 }
9593 } while (false);
9594
9595 } else if (prefix == "osd crush reweight-all") {
9596 CrushWrapper newcrush;
9597 _get_pending_crush(newcrush);
9598
9599 newcrush.reweight(cct);
9600 pending_inc.crush.clear();
9601 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9602 ss << "reweighted crush hierarchy";
9603 getline(ss, rs);
9604 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9605 get_last_committed() + 1));
9606 return true;
9607 } else if (prefix == "osd crush reweight") {
9608 // osd crush reweight <name> <weight>
9609 CrushWrapper newcrush;
9610 _get_pending_crush(newcrush);
9611
9612 string name;
9613 cmd_getval(cct, cmdmap, "name", name);
9614 if (!newcrush.name_exists(name)) {
9615 err = -ENOENT;
9616 ss << "device '" << name << "' does not appear in the crush map";
9617 goto reply;
9618 }
9619
9620 int id = newcrush.get_item_id(name);
9621 if (id < 0) {
9622 ss << "device '" << name << "' is not a leaf in the crush map";
9623 err = -EINVAL;
9624 goto reply;
9625 }
9626 double w;
9627 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9628 ss << "unable to parse weight value '"
9629 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9630 err = -EINVAL;
9631 goto reply;
9632 }
9633
9634 err = newcrush.adjust_item_weightf(cct, id, w,
9635 g_conf()->osd_crush_update_weight_set);
9636 if (err < 0)
9637 goto reply;
9638 pending_inc.crush.clear();
9639 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9640 ss << "reweighted item id " << id << " name '" << name << "' to " << w
9641 << " in crush map";
9642 getline(ss, rs);
9643 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9644 get_last_committed() + 1));
9645 return true;
9646 } else if (prefix == "osd crush reweight-subtree") {
9647 // osd crush reweight-subtree <name> <weight>
9648 CrushWrapper newcrush;
9649 _get_pending_crush(newcrush);
9650
9651 string name;
9652 cmd_getval(cct, cmdmap, "name", name);
9653 if (!newcrush.name_exists(name)) {
9654 err = -ENOENT;
9655 ss << "device '" << name << "' does not appear in the crush map";
9656 goto reply;
9657 }
9658
9659 int id = newcrush.get_item_id(name);
9660 if (id >= 0) {
9661 ss << "device '" << name << "' is not a subtree in the crush map";
9662 err = -EINVAL;
9663 goto reply;
9664 }
9665 double w;
9666 if (!cmd_getval(cct, cmdmap, "weight", w)) {
9667 ss << "unable to parse weight value '"
9668 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
9669 err = -EINVAL;
9670 goto reply;
9671 }
9672
9673 err = newcrush.adjust_subtree_weightf(cct, id, w,
9674 g_conf()->osd_crush_update_weight_set);
9675 if (err < 0)
9676 goto reply;
9677 pending_inc.crush.clear();
9678 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9679 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
9680 << " in crush map";
9681 getline(ss, rs);
9682 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9683 get_last_committed() + 1));
9684 return true;
9685 } else if (prefix == "osd crush tunables") {
9686 CrushWrapper newcrush;
9687 _get_pending_crush(newcrush);
9688
9689 err = 0;
9690 string profile;
9691 cmd_getval(cct, cmdmap, "profile", profile);
9692 if (profile == "legacy" || profile == "argonaut") {
9693 newcrush.set_tunables_legacy();
9694 } else if (profile == "bobtail") {
9695 newcrush.set_tunables_bobtail();
9696 } else if (profile == "firefly") {
9697 newcrush.set_tunables_firefly();
9698 } else if (profile == "hammer") {
9699 newcrush.set_tunables_hammer();
9700 } else if (profile == "jewel") {
9701 newcrush.set_tunables_jewel();
9702 } else if (profile == "optimal") {
9703 newcrush.set_tunables_optimal();
9704 } else if (profile == "default") {
9705 newcrush.set_tunables_default();
9706 } else {
9707 ss << "unrecognized profile '" << profile << "'";
9708 err = -EINVAL;
9709 goto reply;
9710 }
9711
9712 if (!validate_crush_against_features(&newcrush, ss)) {
9713 err = -EINVAL;
9714 goto reply;
9715 }
9716
9717 pending_inc.crush.clear();
9718 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9719 ss << "adjusted tunables profile to " << profile;
9720 getline(ss, rs);
9721 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9722 get_last_committed() + 1));
9723 return true;
9724 } else if (prefix == "osd crush set-tunable") {
9725 CrushWrapper newcrush;
9726 _get_pending_crush(newcrush);
9727
9728 err = 0;
9729 string tunable;
9730 cmd_getval(cct, cmdmap, "tunable", tunable);
9731
9732 int64_t value = -1;
9733 if (!cmd_getval(cct, cmdmap, "value", value)) {
9734 err = -EINVAL;
9735 ss << "failed to parse integer value "
9736 << cmd_vartype_stringify(cmdmap.at("value"));
9737 goto reply;
9738 }
9739
9740 if (tunable == "straw_calc_version") {
9741 if (value != 0 && value != 1) {
9742 ss << "value must be 0 or 1; got " << value;
9743 err = -EINVAL;
9744 goto reply;
9745 }
9746 newcrush.set_straw_calc_version(value);
9747 } else {
9748 ss << "unrecognized tunable '" << tunable << "'";
9749 err = -EINVAL;
9750 goto reply;
9751 }
9752
9753 if (!validate_crush_against_features(&newcrush, ss)) {
9754 err = -EINVAL;
9755 goto reply;
9756 }
9757
9758 pending_inc.crush.clear();
9759 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9760 ss << "adjusted tunable " << tunable << " to " << value;
9761 getline(ss, rs);
9762 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9763 get_last_committed() + 1));
9764 return true;
9765
9766 } else if (prefix == "osd crush rule create-simple") {
9767 string name, root, type, mode;
9768 cmd_getval(cct, cmdmap, "name", name);
9769 cmd_getval(cct, cmdmap, "root", root);
9770 cmd_getval(cct, cmdmap, "type", type);
9771 cmd_getval(cct, cmdmap, "mode", mode);
9772 if (mode == "")
9773 mode = "firstn";
9774
9775 if (osdmap.crush->rule_exists(name)) {
9776 // The name is uniquely associated to a ruleid and the rule it contains
9777 // From the user point of view, the rule is more meaningfull.
9778 ss << "rule " << name << " already exists";
9779 err = 0;
9780 goto reply;
9781 }
9782
9783 CrushWrapper newcrush;
9784 _get_pending_crush(newcrush);
9785
9786 if (newcrush.rule_exists(name)) {
9787 // The name is uniquely associated to a ruleid and the rule it contains.
9788 // From the user's point of view, the rule name is the more meaningful handle.
9789 ss << "rule " << name << " already exists";
9790 err = 0;
9791 } else {
9792 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
9793 pg_pool_t::TYPE_REPLICATED, &ss);
9794 if (ruleno < 0) {
9795 err = ruleno;
9796 goto reply;
9797 }
9798
9799 pending_inc.crush.clear();
9800 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9801 }
9802 getline(ss, rs);
9803 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9804 get_last_committed() + 1));
9805 return true;
9806
9807 } else if (prefix == "osd crush rule create-replicated") {
9808 string name, root, type, device_class;
9809 cmd_getval(cct, cmdmap, "name", name);
9810 cmd_getval(cct, cmdmap, "root", root);
9811 cmd_getval(cct, cmdmap, "type", type);
9812 cmd_getval(cct, cmdmap, "class", device_class);
9813
9814 if (osdmap.crush->rule_exists(name)) {
9815 // The name is uniquely associated to a ruleid and the rule it contains.
9816 // From the user's point of view, the rule name is the more meaningful handle.
9817 ss << "rule " << name << " already exists";
9818 err = 0;
9819 goto reply;
9820 }
9821
9822 CrushWrapper newcrush;
9823 _get_pending_crush(newcrush);
9824
9825 if (newcrush.rule_exists(name)) {
9826 // The name is uniquely associated to a ruleid and the rule it contains
9827 // From the user point of view, the rule is more meaningful.
9828 ss << "rule " << name << " already exists";
9829 err = 0;
9830 } else {
9831 int ruleno = newcrush.add_simple_rule(
9832 name, root, type, device_class,
9833 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
9834 if (ruleno < 0) {
9835 err = ruleno;
9836 goto reply;
9837 }
9838
9839 pending_inc.crush.clear();
9840 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9841 }
9842 getline(ss, rs);
9843 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9844 get_last_committed() + 1));
9845 return true;
9846
9847 } else if (prefix == "osd erasure-code-profile rm") {
9848 string name;
9849 cmd_getval(cct, cmdmap, "name", name);
9850
9851 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
9852 goto wait;
9853
9854 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
9855 err = -EBUSY;
9856 goto reply;
9857 }
9858
9859 if (osdmap.has_erasure_code_profile(name) ||
9860 pending_inc.new_erasure_code_profiles.count(name)) {
9861 if (osdmap.has_erasure_code_profile(name)) {
9862 pending_inc.old_erasure_code_profiles.push_back(name);
9863 } else {
9864 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
9865 pending_inc.new_erasure_code_profiles.erase(name);
9866 }
9867
9868 getline(ss, rs);
9869 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9870 get_last_committed() + 1));
9871 return true;
9872 } else {
9873 ss << "erasure-code-profile " << name << " does not exist";
9874 err = 0;
9875 goto reply;
9876 }
9877
9878 } else if (prefix == "osd erasure-code-profile set") {
9879 string name;
9880 cmd_getval(cct, cmdmap, "name", name);
9881 vector<string> profile;
9882 cmd_getval(cct, cmdmap, "profile", profile);
9883
9884 bool force = false;
9885 cmd_getval(cct, cmdmap, "force", force);
9886
9887 map<string,string> profile_map;
9888 err = parse_erasure_code_profile(profile, &profile_map, &ss);
9889 if (err)
9890 goto reply;
9891 if (profile_map.find("plugin") == profile_map.end()) {
9892 ss << "erasure-code-profile " << profile_map
9893 << " must contain a plugin entry" << std::endl;
9894 err = -EINVAL;
9895 goto reply;
9896 }
9897 string plugin = profile_map["plugin"];
9898
9899 if (pending_inc.has_erasure_code_profile(name)) {
9900 dout(20) << "erasure code profile " << name << " try again" << dendl;
9901 goto wait;
9902 } else {
9903 err = normalize_profile(name, profile_map, force, &ss);
9904 if (err)
9905 goto reply;
9906
9907 if (osdmap.has_erasure_code_profile(name)) {
9908 ErasureCodeProfile existing_profile_map =
9909 osdmap.get_erasure_code_profile(name);
9910 err = normalize_profile(name, existing_profile_map, force, &ss);
9911 if (err)
9912 goto reply;
9913
9914 if (existing_profile_map == profile_map) {
9915 err = 0;
9916 goto reply;
9917 }
9918 if (!force) {
9919 err = -EPERM;
9920 ss << "will not override erasure code profile " << name
9921 << " because the existing profile "
9922 << existing_profile_map
9923 << " is different from the proposed profile "
9924 << profile_map;
9925 goto reply;
9926 }
9927 }
9928
9929 dout(20) << "erasure code profile set " << name << "="
9930 << profile_map << dendl;
9931 pending_inc.set_erasure_code_profile(name, profile_map);
9932 }
9933
9934 getline(ss, rs);
9935 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9936 get_last_committed() + 1));
9937 return true;
9938
9939 } else if (prefix == "osd crush rule create-erasure") {
9940 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
9941 if (err == -EAGAIN)
9942 goto wait;
9943 if (err)
9944 goto reply;
9945 string name, poolstr;
9946 cmd_getval(cct, cmdmap, "name", name);
9947 string profile;
9948 cmd_getval(cct, cmdmap, "profile", profile);
9949 if (profile == "")
9950 profile = "default";
9951 if (profile == "default") {
9952 if (!osdmap.has_erasure_code_profile(profile)) {
9953 if (pending_inc.has_erasure_code_profile(profile)) {
9954 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
9955 goto wait;
9956 }
9957
9958 map<string,string> profile_map;
9959 err = osdmap.get_erasure_code_profile_default(cct,
9960 profile_map,
9961 &ss);
9962 if (err)
9963 goto reply;
9964 err = normalize_profile(name, profile_map, true, &ss);
9965 if (err)
9966 goto reply;
9967 dout(20) << "erasure code profile set " << profile << "="
9968 << profile_map << dendl;
9969 pending_inc.set_erasure_code_profile(profile, profile_map);
9970 goto wait;
9971 }
9972 }
9973
9974 int rule;
9975 err = crush_rule_create_erasure(name, profile, &rule, &ss);
9976 if (err < 0) {
9977 switch(err) {
9978 case -EEXIST: // return immediately
9979 ss << "rule " << name << " already exists";
9980 err = 0;
9981 goto reply;
9982 break;
9983 case -EALREADY: // wait for pending to be proposed
9984 ss << "rule " << name << " already exists";
9985 err = 0;
9986 break;
9987 default: // non recoverable error
9988 goto reply;
9989 break;
9990 }
9991 } else {
9992 ss << "created rule " << name << " at " << rule;
9993 }
9994
9995 getline(ss, rs);
9996 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9997 get_last_committed() + 1));
9998 return true;
9999
10000 } else if (prefix == "osd crush rule rm") {
10001 string name;
10002 cmd_getval(cct, cmdmap, "name", name);
10003
10004 if (!osdmap.crush->rule_exists(name)) {
10005 ss << "rule " << name << " does not exist";
10006 err = 0;
10007 goto reply;
10008 }
10009
10010 CrushWrapper newcrush;
10011 _get_pending_crush(newcrush);
10012
10013 if (!newcrush.rule_exists(name)) {
10014 ss << "rule " << name << " does not exist";
10015 err = 0;
10016 } else {
10017 int ruleno = newcrush.get_rule_id(name);
10018 ceph_assert(ruleno >= 0);
10019
10020 // make sure it is not in use.
10021 // FIXME: this is ok in some situations, but let's not bother with that
10022 // complexity now.
10023 int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
10024 if (osdmap.crush_rule_in_use(ruleset)) {
10025 ss << "crush ruleset " << name << " " << ruleset << " is in use";
10026 err = -EBUSY;
10027 goto reply;
10028 }
10029
10030 err = newcrush.remove_rule(ruleno);
10031 if (err < 0) {
10032 goto reply;
10033 }
10034
10035 pending_inc.crush.clear();
10036 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10037 }
10038 getline(ss, rs);
10039 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10040 get_last_committed() + 1));
10041 return true;
10042
10043 } else if (prefix == "osd crush rule rename") {
10044 string srcname;
10045 string dstname;
10046 cmd_getval(cct, cmdmap, "srcname", srcname);
10047 cmd_getval(cct, cmdmap, "dstname", dstname);
10048 if (srcname.empty() || dstname.empty()) {
10049 ss << "must specify both source rule name and destination rule name";
10050 err = -EINVAL;
10051 goto reply;
10052 }
10053 if (srcname == dstname) {
10054 ss << "destination rule name is equal to source rule name";
10055 err = 0;
10056 goto reply;
10057 }
10058
10059 CrushWrapper newcrush;
10060 _get_pending_crush(newcrush);
10061 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
10062 // srcname does not exist and dstname already exists
10063 // suppose this is a replay and return success
10064 // (so this command is idempotent)
10065 ss << "already renamed to '" << dstname << "'";
10066 err = 0;
10067 goto reply;
10068 }
10069
10070 err = newcrush.rename_rule(srcname, dstname, &ss);
10071 if (err < 0) {
10072 // ss has reason for failure
10073 goto reply;
10074 }
10075 pending_inc.crush.clear();
10076 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
10077 getline(ss, rs);
10078 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10079 get_last_committed() + 1));
10080 return true;
10081
10082 } else if (prefix == "osd setmaxosd") {
10083 int64_t newmax;
10084 if (!cmd_getval(cct, cmdmap, "newmax", newmax)) {
10085 ss << "unable to parse 'newmax' value '"
10086 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
10087 err = -EINVAL;
10088 goto reply;
10089 }
10090
10091 if (newmax > g_conf()->mon_max_osd) {
10092 err = -ERANGE;
10093 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
10094 << g_conf()->mon_max_osd << ")";
10095 goto reply;
10096 }
10097
10098 // Don't allow shrinking OSD number as this will cause data loss
10099 // and may cause kernel crashes.
10100 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
10101 if (newmax < osdmap.get_max_osd()) {
10102 // Check if the OSDs exist between current max and new value.
10103 // If any OSDs exist in that range, don't allow shrinking the number
10104 // of OSDs.
10105 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
10106 if (osdmap.exists(i)) {
10107 err = -EBUSY;
10108 ss << "cannot shrink max_osd to " << newmax
10109 << " because osd." << i << " (and possibly others) still in use";
10110 goto reply;
10111 }
10112 }
10113 }
10114
10115 pending_inc.new_max_osd = newmax;
10116 ss << "set new max_osd = " << pending_inc.new_max_osd;
10117 getline(ss, rs);
10118 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10119 get_last_committed() + 1));
10120 return true;
10121
10122 } else if (prefix == "osd set-full-ratio" ||
10123 prefix == "osd set-backfillfull-ratio" ||
10124 prefix == "osd set-nearfull-ratio") {
10125 double n;
10126 if (!cmd_getval(cct, cmdmap, "ratio", n)) {
10127 ss << "unable to parse 'ratio' value '"
10128 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
10129 err = -EINVAL;
10130 goto reply;
10131 }
10132 if (prefix == "osd set-full-ratio")
10133 pending_inc.new_full_ratio = n;
10134 else if (prefix == "osd set-backfillfull-ratio")
10135 pending_inc.new_backfillfull_ratio = n;
10136 else if (prefix == "osd set-nearfull-ratio")
10137 pending_inc.new_nearfull_ratio = n;
10138 ss << prefix << " " << n;
10139 getline(ss, rs);
10140 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10141 get_last_committed() + 1));
10142 return true;
10143 } else if (prefix == "osd set-require-min-compat-client") {
10144 string v;
10145 cmd_getval(cct, cmdmap, "version", v);
10146 int vno = ceph_release_from_name(v.c_str());
10147 if (vno <= 0) {
10148 ss << "version " << v << " is not recognized";
10149 err = -EINVAL;
10150 goto reply;
10151 }
10152 OSDMap newmap;
10153 newmap.deepish_copy_from(osdmap);
10154 newmap.apply_incremental(pending_inc);
10155 newmap.require_min_compat_client = vno;
10156 auto mvno = newmap.get_min_compat_client();
10157 if (vno < mvno) {
10158 ss << "osdmap current utilizes features that require "
10159 << ceph_release_name(mvno)
10160 << "; cannot set require_min_compat_client below that to "
10161 << ceph_release_name(vno);
10162 err = -EPERM;
10163 goto reply;
10164 }
10165 bool sure = false;
10166 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10167 if (!sure) {
10168 FeatureMap m;
10169 mon->get_combined_feature_map(&m);
10170 uint64_t features = ceph_release_features(vno);
10171 bool first = true;
10172 bool ok = true;
10173 for (int type : {
10174 CEPH_ENTITY_TYPE_CLIENT,
10175 CEPH_ENTITY_TYPE_MDS,
10176 CEPH_ENTITY_TYPE_MGR }) {
10177 auto p = m.m.find(type);
10178 if (p == m.m.end()) {
10179 continue;
10180 }
10181 for (auto& q : p->second) {
10182 uint64_t missing = ~q.first & features;
10183 if (missing) {
10184 if (first) {
10185 ss << "cannot set require_min_compat_client to " << v << ": ";
10186 } else {
10187 ss << "; ";
10188 }
10189 first = false;
10190 ss << q.second << " connected " << ceph_entity_type_name(type)
10191 << "(s) look like " << ceph_release_name(
10192 ceph_release_from_features(q.first))
10193 << " (missing 0x" << std::hex << missing << std::dec << ")";
10194 ok = false;
10195 }
10196 }
10197 }
10198 if (!ok) {
10199 ss << "; add --yes-i-really-mean-it to do it anyway";
10200 err = -EPERM;
10201 goto reply;
10202 }
10203 }
10204 ss << "set require_min_compat_client to " << ceph_release_name(vno);
10205 pending_inc.new_require_min_compat_client = vno;
10206 getline(ss, rs);
10207 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10208 get_last_committed() + 1));
10209 return true;
10210 } else if (prefix == "osd pause") {
10211 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10212
10213 } else if (prefix == "osd unpause") {
10214 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10215
10216 } else if (prefix == "osd set") {
10217 bool sure = false;
10218 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
10219
10220 string key;
10221 cmd_getval(cct, cmdmap, "key", key);
10222 if (key == "full")
10223 return prepare_set_flag(op, CEPH_OSDMAP_FULL);
10224 else if (key == "pause")
10225 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10226 else if (key == "noup")
10227 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
10228 else if (key == "nodown")
10229 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
10230 else if (key == "noout")
10231 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
10232 else if (key == "noin")
10233 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
10234 else if (key == "nobackfill")
10235 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
10236 else if (key == "norebalance")
10237 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
10238 else if (key == "norecover")
10239 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
10240 else if (key == "noscrub")
10241 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
10242 else if (key == "nodeep-scrub")
10243 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10244 else if (key == "notieragent")
10245 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10246 else if (key == "nosnaptrim")
10247 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10248 else if (key == "pglog_hardlimit") {
10249 if (!osdmap.get_num_up_osds() && !sure) {
10250 ss << "Not advisable to continue since no OSDs are up. Pass "
10251 << "--yes-i-really-mean-it if you really wish to continue.";
10252 err = -EPERM;
10253 goto reply;
10254 }
10255 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
10256 // we are reusing a jewel feature bit that was retired in luminous.
10257 if (osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS &&
10258 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
10259 || sure)) {
10260 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
10261 } else {
10262 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
10263 err = -EPERM;
10264 goto reply;
10265 }
10266 } else {
10267 ss << "unrecognized flag '" << key << "'";
10268 err = -EINVAL;
10269 }
10270
10271 } else if (prefix == "osd unset") {
10272 string key;
10273 cmd_getval(cct, cmdmap, "key", key);
10274 if (key == "full")
10275 return prepare_unset_flag(op, CEPH_OSDMAP_FULL);
10276 else if (key == "pause")
10277 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
10278 else if (key == "noup")
10279 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
10280 else if (key == "nodown")
10281 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
10282 else if (key == "noout")
10283 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
10284 else if (key == "noin")
10285 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
10286 else if (key == "nobackfill")
10287 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
10288 else if (key == "norebalance")
10289 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
10290 else if (key == "norecover")
10291 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
10292 else if (key == "noscrub")
10293 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
10294 else if (key == "nodeep-scrub")
10295 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
10296 else if (key == "notieragent")
10297 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
10298 else if (key == "nosnaptrim")
10299 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
10300 else {
10301 ss << "unrecognized flag '" << key << "'";
10302 err = -EINVAL;
10303 }
10304
10305 } else if (prefix == "osd require-osd-release") {
10306 string release;
10307 cmd_getval(cct, cmdmap, "release", release);
10308 bool sure = false;
10309 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
10310 int rel = ceph_release_from_name(release.c_str());
10311 if (rel <= 0) {
10312 ss << "unrecognized release " << release;
10313 err = -EINVAL;
10314 goto reply;
10315 }
10316 if (rel == osdmap.require_osd_release) {
10317 // idempotent
10318 err = 0;
10319 goto reply;
10320 }
10321 ceph_assert(osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
10322 if (!osdmap.get_num_up_osds() && !sure) {
10323 ss << "Not advisable to continue since no OSDs are up. Pass "
10324 << "--yes-i-really-mean-it if you really wish to continue.";
10325 err = -EPERM;
10326 goto reply;
10327 }
10328 if (rel == CEPH_RELEASE_MIMIC) {
10329 if (!mon->monmap->get_required_features().contains_all(
10330 ceph::features::mon::FEATURE_MIMIC)) {
10331 ss << "not all mons are mimic";
10332 err = -EPERM;
10333 goto reply;
10334 }
10335 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
10336 && !sure) {
10337 ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
10338 err = -EPERM;
10339 goto reply;
10340 }
10341 } else if (rel == CEPH_RELEASE_NAUTILUS) {
10342 if (!mon->monmap->get_required_features().contains_all(
10343 ceph::features::mon::FEATURE_NAUTILUS)) {
10344 ss << "not all mons are nautilus";
10345 err = -EPERM;
10346 goto reply;
10347 }
10348 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
10349 && !sure) {
10350 ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
10351 err = -EPERM;
10352 goto reply;
10353 }
10354 } else {
10355 ss << "not supported for this release yet";
10356 err = -EPERM;
10357 goto reply;
10358 }
10359 if (rel < osdmap.require_osd_release) {
10360 ss << "require_osd_release cannot be lowered once it has been set";
10361 err = -EPERM;
10362 goto reply;
10363 }
10364 pending_inc.new_require_osd_release = rel;
10365 goto update;
10366 } else if (prefix == "osd down" ||
10367 prefix == "osd out" ||
10368 prefix == "osd in" ||
10369 prefix == "osd rm") {
10370
10371 bool any = false;
10372 bool stop = false;
10373 bool verbose = true;
10374
10375 vector<string> idvec;
10376 cmd_getval(cct, cmdmap, "ids", idvec);
10377 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10378 set<int> osds;
10379
10380 // wildcard?
10381 if (j == 0 &&
10382 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10383 if (prefix == "osd in") {
10384 // touch out osds only
10385 osdmap.get_out_existing_osds(osds);
10386 } else {
10387 osdmap.get_all_osds(osds);
10388 }
10389 stop = true;
10390 verbose = false; // so the output is less noisy.
10391 } else {
10392 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10393 if (osd < 0) {
10394 ss << "invalid osd id" << osd;
10395 err = -EINVAL;
10396 continue;
10397 } else if (!osdmap.exists(osd)) {
10398 ss << "osd." << osd << " does not exist. ";
10399 continue;
10400 }
10401
10402 osds.insert(osd);
10403 }
10404
10405 for (auto &osd : osds) {
10406 if (prefix == "osd down") {
10407 if (osdmap.is_down(osd)) {
10408 if (verbose)
10409 ss << "osd." << osd << " is already down. ";
10410 } else {
10411 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
10412 ss << "marked down osd." << osd << ". ";
10413 any = true;
10414 }
10415 } else if (prefix == "osd out") {
10416 if (osdmap.is_out(osd)) {
10417 if (verbose)
10418 ss << "osd." << osd << " is already out. ";
10419 } else {
10420 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
10421 if (osdmap.osd_weight[osd]) {
10422 if (pending_inc.new_xinfo.count(osd) == 0) {
10423 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10424 }
10425 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
10426 }
10427 ss << "marked out osd." << osd << ". ";
10428 std::ostringstream msg;
10429 msg << "Client " << op->get_session()->entity_name
10430 << " marked osd." << osd << " out";
10431 if (osdmap.is_up(osd)) {
10432 msg << ", while it was still marked up";
10433 } else {
10434 auto period = ceph_clock_now() - down_pending_out[osd];
10435 msg << ", after it was down for " << int(period.sec())
10436 << " seconds";
10437 }
10438
10439 mon->clog->info() << msg.str();
10440 any = true;
10441 }
10442 } else if (prefix == "osd in") {
10443 if (osdmap.is_in(osd)) {
10444 if (verbose)
10445 ss << "osd." << osd << " is already in. ";
10446 } else {
10447 if (osdmap.osd_xinfo[osd].old_weight > 0) {
10448 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
10449 if (pending_inc.new_xinfo.count(osd) == 0) {
10450 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
10451 }
10452 pending_inc.new_xinfo[osd].old_weight = 0;
10453 } else {
10454 pending_inc.new_weight[osd] = CEPH_OSD_IN;
10455 }
10456 ss << "marked in osd." << osd << ". ";
10457 any = true;
10458 }
10459 } else if (prefix == "osd rm") {
10460 err = prepare_command_osd_remove(osd);
10461
10462 if (err == -EBUSY) {
10463 if (any)
10464 ss << ", ";
10465 ss << "osd." << osd << " is still up; must be down before removal. ";
10466 } else {
10467 ceph_assert(err == 0);
10468 if (any) {
10469 ss << ", osd." << osd;
10470 } else {
10471 ss << "removed osd." << osd;
10472 }
10473 any = true;
10474 }
10475 }
10476 }
10477 }
10478 if (any) {
10479 getline(ss, rs);
10480 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10481 get_last_committed() + 1));
10482 return true;
10483 }
10484 } else if (prefix == "osd set-group" ||
10485 prefix == "osd unset-group" ||
10486 prefix == "osd add-noup" ||
10487 prefix == "osd add-nodown" ||
10488 prefix == "osd add-noin" ||
10489 prefix == "osd add-noout" ||
10490 prefix == "osd rm-noup" ||
10491 prefix == "osd rm-nodown" ||
10492 prefix == "osd rm-noin" ||
10493 prefix == "osd rm-noout") {
10494 bool do_set = prefix == "osd set-group" ||
10495 prefix.find("add") != string::npos;
10496 string flag_str;
10497 unsigned flags = 0;
10498 vector<string> who;
10499 if (prefix == "osd set-group" || prefix == "osd unset-group") {
10500 cmd_getval(cct, cmdmap, "flags", flag_str);
10501 cmd_getval(cct, cmdmap, "who", who);
10502 vector<string> raw_flags;
10503 boost::split(raw_flags, flag_str, boost::is_any_of(","));
10504 for (auto& f : raw_flags) {
10505 if (f == "noup")
10506 flags |= CEPH_OSD_NOUP;
10507 else if (f == "nodown")
10508 flags |= CEPH_OSD_NODOWN;
10509 else if (f == "noin")
10510 flags |= CEPH_OSD_NOIN;
10511 else if (f == "noout")
10512 flags |= CEPH_OSD_NOOUT;
10513 else {
10514 ss << "unrecognized flag '" << f << "', must be one of "
10515 << "{noup,nodown,noin,noout}";
10516 err = -EINVAL;
10517 goto reply;
10518 }
10519 }
10520 } else {
10521 cmd_getval(cct, cmdmap, "ids", who);
10522 if (prefix.find("noup") != string::npos)
10523 flags = CEPH_OSD_NOUP;
10524 else if (prefix.find("nodown") != string::npos)
10525 flags = CEPH_OSD_NODOWN;
10526 else if (prefix.find("noin") != string::npos)
10527 flags = CEPH_OSD_NOIN;
10528 else if (prefix.find("noout") != string::npos)
10529 flags = CEPH_OSD_NOOUT;
10530 else
10531 ceph_assert(0 == "Unreachable!");
10532 }
10533 if (flags == 0) {
10534 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
10535 err = -EINVAL;
10536 goto reply;
10537 }
10538 if (who.empty()) {
10539 ss << "must specify at least one or more targets to set/unset";
10540 err = -EINVAL;
10541 goto reply;
10542 }
10543 set<int> osds;
10544 set<int> crush_nodes;
10545 set<int> device_classes;
10546 for (auto& w : who) {
10547 if (w == "any" || w == "all" || w == "*") {
10548 osdmap.get_all_osds(osds);
10549 break;
10550 }
10551 std::stringstream ts;
10552 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
10553 osds.insert(osd);
10554 } else if (osdmap.crush->name_exists(w)) {
10555 crush_nodes.insert(osdmap.crush->get_item_id(w));
10556 } else if (osdmap.crush->class_exists(w)) {
10557 device_classes.insert(osdmap.crush->get_class_id(w));
10558 } else {
10559 ss << "unable to parse osd id or crush node or device class: "
10560 << "\"" << w << "\". ";
10561 }
10562 }
10563 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
10564 // ss has reason for failure
10565 err = -EINVAL;
10566 goto reply;
10567 }
10568 bool any = false;
10569 for (auto osd : osds) {
10570 if (!osdmap.exists(osd)) {
10571 ss << "osd." << osd << " does not exist. ";
10572 continue;
10573 }
10574 if (do_set) {
10575 if (flags & CEPH_OSD_NOUP) {
10576 any |= osdmap.is_noup_by_osd(osd) ?
10577 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
10578 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
10579 }
10580 if (flags & CEPH_OSD_NODOWN) {
10581 any |= osdmap.is_nodown_by_osd(osd) ?
10582 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
10583 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
10584 }
10585 if (flags & CEPH_OSD_NOIN) {
10586 any |= osdmap.is_noin_by_osd(osd) ?
10587 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
10588 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
10589 }
10590 if (flags & CEPH_OSD_NOOUT) {
10591 any |= osdmap.is_noout_by_osd(osd) ?
10592 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
10593 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
10594 }
10595 } else {
10596 if (flags & CEPH_OSD_NOUP) {
10597 any |= osdmap.is_noup_by_osd(osd) ?
10598 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
10599 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
10600 }
10601 if (flags & CEPH_OSD_NODOWN) {
10602 any |= osdmap.is_nodown_by_osd(osd) ?
10603 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
10604 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
10605 }
10606 if (flags & CEPH_OSD_NOIN) {
10607 any |= osdmap.is_noin_by_osd(osd) ?
10608 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
10609 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
10610 }
10611 if (flags & CEPH_OSD_NOOUT) {
10612 any |= osdmap.is_noout_by_osd(osd) ?
10613 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
10614 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
10615 }
10616 }
10617 }
10618 for (auto& id : crush_nodes) {
10619 auto old_flags = osdmap.get_crush_node_flags(id);
10620 auto& pending_flags = pending_inc.new_crush_node_flags[id];
10621 pending_flags |= old_flags; // adopt existing flags first!
10622 if (do_set) {
10623 pending_flags |= flags;
10624 } else {
10625 pending_flags &= ~flags;
10626 }
10627 any = true;
10628 }
10629 for (auto& id : device_classes) {
10630 auto old_flags = osdmap.get_device_class_flags(id);
10631 auto& pending_flags = pending_inc.new_device_class_flags[id];
10632 pending_flags |= old_flags;
10633 if (do_set) {
10634 pending_flags |= flags;
10635 } else {
10636 pending_flags &= ~flags;
10637 }
10638 any = true;
10639 }
10640 if (any) {
10641 getline(ss, rs);
10642 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
10643 get_last_committed() + 1));
10644 return true;
10645 }
10646 } else if (prefix == "osd pg-temp") {
10647 string pgidstr;
10648 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10649 ss << "unable to parse 'pgid' value '"
10650 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10651 err = -EINVAL;
10652 goto reply;
10653 }
10654 pg_t pgid;
10655 if (!pgid.parse(pgidstr.c_str())) {
10656 ss << "invalid pgid '" << pgidstr << "'";
10657 err = -EINVAL;
10658 goto reply;
10659 }
10660 if (!osdmap.pg_exists(pgid)) {
10661 ss << "pg " << pgid << " does not exist";
10662 err = -ENOENT;
10663 goto reply;
10664 }
10665 if (pending_inc.new_pg_temp.count(pgid)) {
10666 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
10667 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10668 return true;
10669 }
10670
10671 vector<int64_t> id_vec;
10672 vector<int32_t> new_pg_temp;
10673 cmd_getval(cct, cmdmap, "id", id_vec);
10674 if (id_vec.empty()) {
10675 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
10676 ss << "done cleaning up pg_temp of " << pgid;
10677 goto update;
10678 }
10679 for (auto osd : id_vec) {
10680 if (!osdmap.exists(osd)) {
10681 ss << "osd." << osd << " does not exist";
10682 err = -ENOENT;
10683 goto reply;
10684 }
10685 new_pg_temp.push_back(osd);
10686 }
10687
10688 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
10689 if ((int)new_pg_temp.size() < pool_min_size) {
10690 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
10691 << pool_min_size << ")";
10692 err = -EINVAL;
10693 goto reply;
10694 }
10695
10696 int pool_size = osdmap.get_pg_pool_size(pgid);
10697 if ((int)new_pg_temp.size() > pool_size) {
10698 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
10699 << pool_size << ")";
10700 err = -EINVAL;
10701 goto reply;
10702 }
10703
10704 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
10705 new_pg_temp.begin(), new_pg_temp.end());
10706 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
10707 goto update;
10708 } else if (prefix == "osd primary-temp") {
10709 string pgidstr;
10710 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
10711 ss << "unable to parse 'pgid' value '"
10712 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
10713 err = -EINVAL;
10714 goto reply;
10715 }
10716 pg_t pgid;
10717 if (!pgid.parse(pgidstr.c_str())) {
10718 ss << "invalid pgid '" << pgidstr << "'";
10719 err = -EINVAL;
10720 goto reply;
10721 }
10722 if (!osdmap.pg_exists(pgid)) {
10723 ss << "pg " << pgid << " does not exist";
10724 err = -ENOENT;
10725 goto reply;
10726 }
10727
10728 int64_t osd;
10729 if (!cmd_getval(cct, cmdmap, "id", osd)) {
10730 ss << "unable to parse 'id' value '"
10731 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
10732 err = -EINVAL;
10733 goto reply;
10734 }
10735 if (osd != -1 && !osdmap.exists(osd)) {
10736 ss << "osd." << osd << " does not exist";
10737 err = -ENOENT;
10738 goto reply;
10739 }
10740
10741 if (osdmap.require_min_compat_client > 0 &&
10742 osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
10743 ss << "require_min_compat_client "
10744 << ceph_release_name(osdmap.require_min_compat_client)
10745 << " < firefly, which is required for primary-temp";
10746 err = -EPERM;
10747 goto reply;
10748 }
10749
10750 pending_inc.new_primary_temp[pgid] = osd;
10751 ss << "set " << pgid << " primary_temp mapping to " << osd;
10752 goto update;
} else if (prefix == "pg repeer") {
  // Force a PG to re-peer by installing a transient pg_temp mapping that
  // differs from its current acting set; the OSDs will peer and then
  // request the pg_temp be removed again.
  pg_t pgid;
  string pgidstr;
  cmd_getval(cct, cmdmap, "pgid", pgidstr);
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    err = -EINVAL;
    goto reply;
  }
  if (!osdmap.pg_exists(pgid)) {
    ss << "pg '" << pgidstr << "' does not exist";
    err = -ENOENT;
    goto reply;
  }
  vector<int> acting;
  int primary;
  osdmap.pg_to_acting_osds(pgid, &acting, &primary);
  if (primary < 0) {
    err = -EAGAIN;
    ss << "pg currently has no primary";
    goto reply;
  }
  if (acting.size() > 1) {
    // map to just primary; it will map back to what it wants
    pending_inc.new_pg_temp[pgid] = { primary };
  } else {
    // hmm, pick another arbitrary osd to induce a change.  Note
    // that this won't work if there is only one suitable OSD in the cluster.
    bool induced = false;
    for (int candidate = 0; candidate < osdmap.get_max_osd(); ++candidate) {
      if (candidate == primary ||
          !osdmap.is_up(candidate) ||
          !osdmap.exists(candidate)) {
        continue;
      }
      pending_inc.new_pg_temp[pgid] = { primary, candidate };
      induced = true;
      break;
    }
    if (!induced) {
      err = -EAGAIN;
      ss << "not enough up OSDs in the cluster to force repeer";
      goto reply;
    }
  }
  goto update;
} else if (prefix == "osd pg-upmap" ||
           prefix == "osd rm-pg-upmap" ||
           prefix == "osd pg-upmap-items" ||
           prefix == "osd rm-pg-upmap-items") {
  // Explicit pg-upmap management: pin a PG's raw mapping (pg-upmap) or
  // remap individual OSDs within it (pg-upmap-items), and the matching
  // removal commands.  Requires all clients to understand pg-upmap.
  if (osdmap.require_min_compat_client < CEPH_RELEASE_LUMINOUS) {
    ss << "min_compat_client "
       << ceph_release_name(osdmap.require_min_compat_client)
       << " < luminous, which is required for pg-upmap. "
       << "Try 'ceph osd set-require-min-compat-client luminous' "
       << "before using the new interface";
    err = -EPERM;
    goto reply;
  }
  err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
  if (err == -EAGAIN)
    goto wait;
  if (err < 0)
    goto reply;
  string pgidstr;
  if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    err = -EINVAL;
    goto reply;
  }
  pg_t pgid;
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    err = -EINVAL;
    goto reply;
  }
  if (!osdmap.pg_exists(pgid)) {
    ss << "pg " << pgid << " does not exist";
    err = -ENOENT;
    goto reply;
  }
  if (pending_inc.old_pools.count(pgid.pool())) {
    // note: replies with the (non-zero) err as the command return code
    ss << "pool of " << pgid << " is pending removal";
    err = -ENOENT;
    getline(ss, rs);
    wait_for_finished_proposal(op,
      new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
    return true;
  }

  // Normalize the four command prefixes into one enum so the two
  // switches below can share structure.
  enum {
    OP_PG_UPMAP,
    OP_RM_PG_UPMAP,
    OP_PG_UPMAP_ITEMS,
    OP_RM_PG_UPMAP_ITEMS,
  } option;

  if (prefix == "osd pg-upmap") {
    option = OP_PG_UPMAP;
  } else if (prefix == "osd rm-pg-upmap") {
    option = OP_RM_PG_UPMAP;
  } else if (prefix == "osd pg-upmap-items") {
    option = OP_PG_UPMAP_ITEMS;
  } else {
    option = OP_RM_PG_UPMAP_ITEMS;
  }

  // check pending upmap changes: if this pgid already has an un-committed
  // upmap change in pending_inc, retry after the current proposal lands
  // rather than clobbering it.
  switch (option) {
  case OP_PG_UPMAP: // fall through
  case OP_RM_PG_UPMAP:
    if (pending_inc.new_pg_upmap.count(pgid) ||
        pending_inc.old_pg_upmap.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on "
               << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    break;

  case OP_PG_UPMAP_ITEMS: // fall through
  case OP_RM_PG_UPMAP_ITEMS:
    if (pending_inc.new_pg_upmap_items.count(pgid) ||
        pending_inc.old_pg_upmap_items.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on "
               << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    break;

  default:
    ceph_abort_msg("invalid option");
  }

  switch (option) {
  case OP_PG_UPMAP:
    {
      // full mapping: the supplied osd list replaces the PG's raw mapping
      vector<int64_t> id_vec;
      if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
        ss << "unable to parse 'id' value(s) '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "'";
        err = -EINVAL;
        goto reply;
      }

      // mapping must satisfy min_size <= |osds| <= size for the pool
      int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
      if ((int)id_vec.size() < pool_min_size) {
        ss << "num of osds (" << id_vec.size() <<") < pool min size ("
           << pool_min_size << ")";
        err = -EINVAL;
        goto reply;
      }

      int pool_size = osdmap.get_pg_pool_size(pgid);
      if ((int)id_vec.size() > pool_size) {
        ss << "num of osds (" << id_vec.size() <<") > pool size ("
           << pool_size << ")";
        err = -EINVAL;
        goto reply;
      }

      vector<int32_t> new_pg_upmap;
      for (auto osd : id_vec) {
        if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist";
          err = -ENOENT;
          goto reply;
        }
        // silently drop duplicates (with a note in the reply)
        auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
        if (it != new_pg_upmap.end()) {
          ss << "osd." << osd << " already exists, ";
          continue;
        }
        new_pg_upmap.push_back(osd);
      }

      if (new_pg_upmap.empty()) {
        ss << "no valid upmap items(pairs) is specified";
        err = -EINVAL;
        goto reply;
      }

      pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
        new_pg_upmap.begin(), new_pg_upmap.end());
      ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
    }
    break;

  case OP_RM_PG_UPMAP:
    {
      pending_inc.old_pg_upmap.insert(pgid);
      ss << "clear " << pgid << " pg_upmap mapping";
    }
    break;

  case OP_PG_UPMAP_ITEMS:
    {
      // pairwise remap: id_vec is a flat list of (from, to) osd id pairs
      vector<int64_t> id_vec;
      if (!cmd_getval(cct, cmdmap, "id", id_vec)) {
        ss << "unable to parse 'id' value(s) '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "'";
        err = -EINVAL;
        goto reply;
      }

      if (id_vec.size() % 2) {
        ss << "you must specify pairs of osd ids to be remapped";
        err = -EINVAL;
        goto reply;
      }

      int pool_size = osdmap.get_pg_pool_size(pgid);
      if ((int)(id_vec.size() / 2) > pool_size) {
        ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
           << pool_size << ")";
        err = -EINVAL;
        goto reply;
      }

      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      ostringstream items;
      items << "[";
      // note: 'p' advances twice per iteration -- *p++ reads 'from' and
      // the loop's ++p then steps past 'to'; safe because size is even
      for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
        int from = *p++;
        int to = *p;
        if (from == to) {
          ss << "from osd." << from << " == to osd." << to << ", ";
          continue;
        }
        if (!osdmap.exists(from)) {
          ss << "osd." << from << " does not exist";
          err = -ENOENT;
          goto reply;
        }
        if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
          ss << "osd." << to << " does not exist";
          err = -ENOENT;
          goto reply;
        }
        pair<int32_t,int32_t> entry = make_pair(from, to);
        auto it = std::find(new_pg_upmap_items.begin(),
          new_pg_upmap_items.end(), entry);
        if (it != new_pg_upmap_items.end()) {
          ss << "osd." << from << " -> osd." << to << " already exists, ";
          continue;
        }
        new_pg_upmap_items.push_back(entry);
        items << from << "->" << to << ",";
      }
      string out(items.str());
      out.resize(out.size() - 1); // drop last ','
      out += "]";

      if (new_pg_upmap_items.empty()) {
        ss << "no valid upmap items(pairs) is specified";
        err = -EINVAL;
        goto reply;
      }

      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      ss << "set " << pgid << " pg_upmap_items mapping to " << out;
    }
    break;

  case OP_RM_PG_UPMAP_ITEMS:
    {
      pending_inc.old_pg_upmap_items.insert(pgid);
      ss << "clear " << pgid << " pg_upmap_items mapping";
    }
    break;

  default:
    ceph_abort_msg("invalid option");
  }

  goto update;
} else if (prefix == "osd primary-affinity") {
  // Set an OSD's primary-affinity: a [0,1] weight biasing primary
  // selection, stored scaled to CEPH_OSD_MAX_PRIMARY_AFFINITY.
  int64_t id;
  if (!cmd_getval(cct, cmdmap, "id", id)) {
    ss << "invalid osd id value '"
       << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    err = -EINVAL;
    goto reply;
  }
  double w;
  if (!cmd_getval(cct, cmdmap, "weight", w)) {
    ss << "unable to parse 'weight' value '"
       << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
    err = -EINVAL;
    goto reply;
  }
  long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
  if (ww < 0L) {
    ss << "weight must be >= 0";
    err = -EINVAL;
    goto reply;
  }
  // primary-affinity needs clients that understand it (firefly+)
  if (osdmap.require_min_compat_client > 0 &&
      osdmap.require_min_compat_client < CEPH_RELEASE_FIREFLY) {
    ss << "require_min_compat_client "
       << ceph_release_name(osdmap.require_min_compat_client)
       << " < firefly, which is required for primary-affinity";
    err = -EPERM;
    goto reply;
  }
  if (osdmap.exists(id)) {
    pending_inc.new_primary_affinity[id] = ww;
    // Use the std::hex/std::dec manipulators here: streaming the
    // ios::hex/ios::dec fmtflags *constants* (as this used to do) prints
    // their integer values instead of switching the output base.  This
    // now matches the "osd reweight" reply formatting.
    ss << "set osd." << id << " primary-affinity to " << w << " (" << std::hex << ww << std::dec << ")";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
    return true;
  } else {
    ss << "osd." << id << " does not exist";
    err = -ENOENT;
    goto reply;
  }
} else if (prefix == "osd reweight") {
  // Set an OSD's "in" weight (0.0..1.0), stored scaled to CEPH_OSD_IN.
  int64_t osd_id;
  if (!cmd_getval(cct, cmdmap, "id", osd_id)) {
    ss << "unable to parse osd id value '"
       << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    err = -EINVAL;
    goto reply;
  }
  double weight;
  if (!cmd_getval(cct, cmdmap, "weight", weight)) {
    ss << "unable to parse weight value '"
       << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
    err = -EINVAL;
    goto reply;
  }
  long scaled = (int)((double)CEPH_OSD_IN*weight);
  if (scaled < 0L) {
    ss << "weight must be >= 0";
    err = -EINVAL;
    goto reply;
  }
  if (!osdmap.exists(osd_id)) {
    ss << "osd." << osd_id << " does not exist";
    err = -ENOENT;
    goto reply;
  }
  pending_inc.new_weight[osd_id] = scaled;
  ss << "reweighted osd." << osd_id << " to " << weight << " (" << std::hex << scaled << std::dec << ")";
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd reweightn") {
  // Batch form of "osd reweight": apply a map of osd id -> scaled weight
  // parsed from the 'weights' argument in one proposal.
  map<int32_t, uint32_t> weights;
  err = parse_reweights(cct, cmdmap, osdmap, &weights);
  if (err) {
    ss << "unable to parse 'weights' value '"
       << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
    goto reply;
  }
  pending_inc.new_weight.insert(weights.begin(), weights.end());
  // NOTE(review): unlike "osd reweight", no getline(ss, rs) here, so any
  // message accumulated in 'ss' is not copied into the reply string --
  // confirm whether that omission is intentional.
  wait_for_finished_proposal(
    op,
    new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
  return true;
} else if (prefix == "osd lost") {
  // Mark a down OSD as permanently lost at the epoch it went down,
  // allowing recovery to proceed without it.  Destructive; requires
  // explicit confirmation.
  int64_t osd_id;
  if (!cmd_getval(cct, cmdmap, "id", osd_id)) {
    ss << "unable to parse osd id value '"
       << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    err = -EINVAL;
    goto reply;
  }
  bool sure = false;
  cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
  if (!sure) {
    ss << "are you SURE? this might mean real, permanent data loss. pass "
       "--yes-i-really-mean-it if you really do.";
    err = -EPERM;
    goto reply;
  }
  if (!osdmap.exists(osd_id)) {
    ss << "osd." << osd_id << " does not exist";
    err = -ENOENT;
    goto reply;
  }
  if (!osdmap.is_down(osd_id)) {
    ss << "osd." << osd_id << " is not down";
    err = -EBUSY;
    goto reply;
  }
  const epoch_t lost_at = osdmap.get_info(osd_id).down_at;
  pending_inc.new_lost[osd_id] = lost_at;
  ss << "marked osd lost in epoch " << lost_at;
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;

} else if (prefix == "osd destroy-actual" ||
           prefix == "osd purge-actual" ||
           prefix == "osd purge-new") {
  /* Destroying an OSD means that we don't expect to further make use of
   * the OSDs data (which may even become unreadable after this operation),
   * and that we are okay with scrubbing all its cephx keys and config-key
   * data (which may include lockbox keys, thus rendering the osd's data
   * unreadable).
   *
   * The OSD will not be removed. Instead, we will mark it as destroyed,
   * such that a subsequent call to `create` will not reuse the osd id.
   * This will play into being able to recreate the OSD, at the same
   * crush location, with minimal data movement.
   */

  // make sure authmon is writeable.
  if (!mon->authmon()->is_writeable()) {
    dout(10) << __func__ << " waiting for auth mon to be writeable for "
             << "osd destroy" << dendl;
    mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
    return false;
  }

  int64_t id;
  if (!cmd_getval(cct, cmdmap, "id", id)) {
    auto p = cmdmap.find("id");
    if (p == cmdmap.end()) {
      ss << "no osd id specified";
    } else {
      // FIX: close the opening quote around the unparseable value; this
      // previously appended an empty literal ("") leaving the quote open.
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
    }
    err = -EINVAL;
    goto reply;
  }

  bool is_destroy = (prefix == "osd destroy-actual");
  if (!is_destroy) {
    ceph_assert("osd purge-actual" == prefix ||
                "osd purge-new" == prefix);
  }

  bool sure = false;
  cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", sure);
  if (!sure) {
    ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
       << "This will mean real, permanent data loss, as well "
       << "as deletion of cephx and lockbox keys. "
       << "Pass --yes-i-really-mean-it if you really do.";
    err = -EPERM;
    goto reply;
  } else if (!osdmap.exists(id)) {
    ss << "osd." << id << " does not exist";
    err = 0; // idempotent
    goto reply;
  } else if (osdmap.is_up(id)) {
    ss << "osd." << id << " is not `down`.";
    err = -EBUSY;
    goto reply;
  } else if (is_destroy && osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    err = 0;
    goto reply;
  }

  // purge-new only applies to OSDs still in the NEW (pre-boot) state
  if (prefix == "osd purge-new" &&
      (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
    ss << "osd." << id << " is not new";
    err = -EPERM;
    goto reply;
  }

  bool goto_reply = false;

  // plug paxos so the osdmap and authmon changes land in one proposal
  paxos->plug();
  if (is_destroy) {
    err = prepare_command_osd_destroy(id, ss);
    // we checked above that it should exist.
    ceph_assert(err != -ENOENT);
  } else {
    err = prepare_command_osd_purge(id, ss);
    if (err == -ENOENT) {
      err = 0;
      ss << "osd." << id << " does not exist.";
      goto_reply = true;
    }
  }
  paxos->unplug();

  if (err < 0 || goto_reply) {
    goto reply;
  }

  if (is_destroy) {
    ss << "destroyed osd." << id;
  } else {
    ss << "purged osd." << id;
  }

  getline(ss, rs);
  wait_for_finished_proposal(op,
    new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
  force_immediate_propose();
  return true;

} else if (prefix == "osd new") {

  // Create (or idempotently re-validate) an OSD with optional parameters
  // (uuid, cephx/lockbox keys, ...) supplied as a JSON blob in the
  // message payload.

  // make sure authmon is writeable.
  if (!mon->authmon()->is_writeable()) {
    dout(10) << __func__ << " waiting for auth mon to be writeable for "
             << "osd new" << dendl;
    mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
    return false;
  }

  map<string,string> param_map;

  bufferlist bl = m->get_data();
  string param_json = bl.to_str();
  dout(20) << __func__ << " osd new json = " << param_json << dendl;

  err = get_json_str_map(param_json, ss, &param_map);
  if (err < 0)
    goto reply;

  dout(20) << __func__ << " osd new params " << param_map << dendl;

  // plug paxos so osdmap and authmon updates commit together
  paxos->plug();
  err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
  paxos->unplug();

  if (err < 0) {
    goto reply;
  }

  if (f) {
    f->flush(rdata);
  } else {
    rdata.append(ss);
  }

  // note: *positive* EEXIST (not -EEXIST) is the "already exists"
  // signal from prepare_command_osd_new; negative errors returned above
  if (err == EEXIST) {
    // idempotent operation
    err = 0;
    goto reply;
  }

  wait_for_finished_proposal(op,
    new Monitor::C_Command(mon, op, 0, rs, rdata,
                           get_last_committed() + 1));
  force_immediate_propose();
  return true;

} else if (prefix == "osd create") {
  // Legacy OSD creation: optionally honor a requested id, but only when a
  // uuid is also supplied (so retries of the same create are idempotent).

  // optional id provided?
  int64_t id = -1, cmd_id = -1;
  if (cmd_getval(cct, cmdmap, "id", cmd_id)) {
    if (cmd_id < 0) {
      ss << "invalid osd id value '" << cmd_id << "'";
      err = -EINVAL;
      goto reply;
    }
    dout(10) << " osd create got id " << cmd_id << dendl;
  }

  uuid_d uuid;
  string uuidstr;
  if (cmd_getval(cct, cmdmap, "uuid", uuidstr)) {
    if (!uuid.parse(uuidstr.c_str())) {
      ss << "invalid uuid value '" << uuidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    // we only care about the id if we also have the uuid, to
    // ensure the operation's idempotency.
    id = cmd_id;
  }

  int32_t new_id = -1;
  err = prepare_command_osd_create(id, uuid, &new_id, ss);
  if (err < 0) {
    if (err == -EAGAIN) {
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    // a check has failed; reply to the user.
    goto reply;
  }

  // emit the allocated id, formatted or plain (used on both paths below)
  auto emit_new_id = [&] {
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", new_id);
      f->close_section();
      f->flush(rdata);
    } else {
      ss << new_id;
      rdata.append(ss);
    }
  };

  if (err == EEXIST) {
    // this is an idempotent operation; we can go ahead and reply.
    emit_new_id();
    err = 0;
    goto reply;
  }

  string empty_device_class;
  do_osd_create(id, uuid, empty_device_class, &new_id);

  emit_new_id();
  wait_for_finished_proposal(op,
    new Monitor::C_Command(mon, op, 0, rs, rdata,
                           get_last_committed() + 1));
  return true;

} else if (prefix == "osd blacklist clear") {
  // Drop every committed blacklist entry and any entries queued in the
  // current pending increment.
  pending_inc.new_blacklist.clear();
  std::list<std::pair<entity_addr_t,utime_t > > current;
  osdmap.get_blacklist(&current);
  for (const auto& [blacklisted_addr, expire] : current) {
    pending_inc.old_blacklist.push_back(blacklisted_addr);
  }
  ss << " removed all blacklist entries";
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd blacklist") {
  // Add or remove a client address on the OSD blacklist.
  string addrstr;
  cmd_getval(cct, cmdmap, "addr", addrstr);
  entity_addr_t addr;
  if (!addr.parse(addrstr.c_str(), 0)) {
    ss << "unable to parse address " << addrstr;
    err = -EINVAL;
    goto reply;
  }
  else {
    if (osdmap.require_osd_release >= CEPH_RELEASE_NAUTILUS) {
      // always blacklist type ANY
      addr.set_type(entity_addr_t::TYPE_ANY);
    } else {
      addr.set_type(entity_addr_t::TYPE_LEGACY);
    }

    string blacklistop;
    cmd_getval(cct, cmdmap, "blacklistop", blacklistop);
    if (blacklistop == "add") {
      utime_t expires = ceph_clock_now();
      double d;
      // default one hour
      cmd_getval(cct, cmdmap, "expire", d,
        g_conf()->mon_osd_blacklist_default_expire);
      expires += d;

      pending_inc.new_blacklist[addr] = expires;

      {
        // cancel any pending un-blacklisting request too
        auto it = std::find(pending_inc.old_blacklist.begin(),
          pending_inc.old_blacklist.end(), addr);
        if (it != pending_inc.old_blacklist.end()) {
          pending_inc.old_blacklist.erase(it);
        }
      }

      ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else if (blacklistop == "rm") {
      // remove from the committed map (via old_blacklist) or cancel a
      // pending add, whichever applies
      if (osdmap.is_blacklisted(addr) ||
          pending_inc.new_blacklist.count(addr)) {
        if (osdmap.is_blacklisted(addr))
          pending_inc.old_blacklist.push_back(addr);
        else
          pending_inc.new_blacklist.erase(addr);
        ss << "un-blacklisting " << addr;
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
      ss << addr << " isn't blacklisted";
      err = 0;
      goto reply;
    }
    // NOTE(review): an unrecognized blacklistop falls through here without
    // setting err or replying -- presumably unreachable because the
    // command schema restricts the value; confirm against the MonCommands
    // definition.
  }
} else if (prefix == "osd pool mksnap") {
  // Create a pool (managed) snapshot; rejected for pools already using
  // unmanaged (librados self-managed) snapshots or acting as cache tiers.
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool_id < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    err = -ENOENT;
    goto reply;
  }
  string snap_name;
  cmd_getval(cct, cmdmap, "snap", snap_name);
  const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
  if (p->is_unmanaged_snaps_mode()) {
    ss << "pool " << pool_name << " is in unmanaged snaps mode";
    err = -EINVAL;
    goto reply;
  }
  if (p->snap_exists(snap_name.c_str())) {
    ss << "pool " << pool_name << " snap " << snap_name << " already exists";
    err = 0; // idempotent
    goto reply;
  }
  if (p->is_tier()) {
    ss << "pool " << pool_name << " is a cache tier";
    err = -EINVAL;
    goto reply;
  }
  // Work against the pending copy of the pool, seeding it from the
  // committed state if this proposal has not touched the pool yet.
  pg_pool_t *pending = nullptr;
  if (pending_inc.new_pools.count(pool_id)) {
    pending = &pending_inc.new_pools[pool_id];
  } else {
    pending = &pending_inc.new_pools[pool_id];
    *pending = *p;
  }
  if (pending->snap_exists(snap_name.c_str())) {
    // an identical snap is already queued in this pending increment
    ss << "pool " << pool_name << " snap " << snap_name << " already exists";
  } else {
    pending->add_snap(snap_name.c_str(), ceph_clock_now());
    pending->set_snap_epoch(pending_inc.epoch);
    ss << "created pool " << pool_name << " snap " << snap_name;
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd pool rmsnap") {
  // Remove a managed pool snapshot (idempotent if it is already gone).
  string pool_name;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool_id < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    err = -ENOENT;
    goto reply;
  }
  string snap_name;
  cmd_getval(cct, cmdmap, "snap", snap_name);
  const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
  if (p->is_unmanaged_snaps_mode()) {
    ss << "pool " << pool_name << " is in unmanaged snaps mode";
    err = -EINVAL;
    goto reply;
  }
  if (!p->snap_exists(snap_name.c_str())) {
    ss << "pool " << pool_name << " snap " << snap_name << " does not exist";
    err = 0; // idempotent
    goto reply;
  }
  // Mutate the pending copy of the pool, seeding it from the committed
  // state if this is the first change to the pool in this proposal.
  pg_pool_t *pending = nullptr;
  if (pending_inc.new_pools.count(pool_id)) {
    pending = &pending_inc.new_pools[pool_id];
  } else {
    pending = &pending_inc.new_pools[pool_id];
    *pending = *p;
  }
  snapid_t sn = pending->snap_exists(snap_name.c_str());
  if (sn) {
    pending->remove_snap(sn);
    pending->set_snap_epoch(pending_inc.epoch);
    ss << "removed pool " << pool_name << " snap " << snap_name;
  } else {
    ss << "already removed pool " << pool_name << " snap " << snap_name;
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;
} else if (prefix == "osd pool create") {
  // Create a new replicated or erasure-coded pool, resolving the crush
  // rule / EC profile / expected object count plumbing before delegating
  // to prepare_new_pool().
  int64_t pg_num, pg_num_min;
  int64_t pgp_num;
  cmd_getval(cct, cmdmap, "pg_num", pg_num, int64_t(0));
  cmd_getval(cct, cmdmap, "pgp_num", pgp_num, pg_num);
  cmd_getval(cct, cmdmap, "pg_num_min", pg_num_min, int64_t(0));

  string pool_type_str;
  cmd_getval(cct, cmdmap, "pool_type", pool_type_str);
  if (pool_type_str.empty())
    pool_type_str = g_conf().get_val<string>("osd_pool_default_type");

  string poolstr;
  cmd_getval(cct, cmdmap, "pool", poolstr);
  int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
  if (pool_id >= 0) {
    // pool already exists: succeed idempotently if the type matches,
    // reject otherwise (a pool cannot change type)
    const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
    if (pool_type_str != p->get_type_name()) {
      ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
      err = -EINVAL;
    } else {
      ss << "pool '" << poolstr << "' already exists";
      err = 0;
    }
    goto reply;
  }

  int pool_type;
  if (pool_type_str == "replicated") {
    pool_type = pg_pool_t::TYPE_REPLICATED;
  } else if (pool_type_str == "erasure") {
    pool_type = pg_pool_t::TYPE_ERASURE;
  } else {
    ss << "unknown pool type '" << pool_type_str << "'";
    err = -EINVAL;
    goto reply;
  }

  bool implicit_rule_creation = false;
  int64_t expected_num_objects = 0;
  string rule_name;
  cmd_getval(cct, cmdmap, "rule", rule_name);
  string erasure_code_profile;
  cmd_getval(cct, cmdmap, "erasure_code_profile", erasure_code_profile);

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    if (erasure_code_profile == "")
      erasure_code_profile = "default";
    //handle the erasure code profile
    if (erasure_code_profile == "default") {
      if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
        if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
          dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
          goto wait;
        }

        // queue creation of the default profile and retry this command
        // once the proposal carrying it has committed
        map<string,string> profile_map;
        err = osdmap.get_erasure_code_profile_default(cct,
                                                      profile_map,
                                                      &ss);
        if (err)
          goto reply;
        dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
        pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
        goto wait;
      }
    }
    if (rule_name == "") {
      implicit_rule_creation = true;
      if (erasure_code_profile == "default") {
        rule_name = "erasure-code";
      } else {
        dout(1) << "implicitly use rule named after the pool: "
                << poolstr << dendl;
        rule_name = poolstr;
      }
    }
    cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
               expected_num_objects, int64_t(0));
  } else {
    //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
    //     and put expected_num_objects to rule field
    if (erasure_code_profile != "") { // cmd is from CLI
      if (rule_name != "") {
        string interr;
        expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
        if (interr.length()) {
          ss << "error parsing integer value '" << rule_name << "': " << interr;
          err = -EINVAL;
          goto reply;
        }
      }
      rule_name = erasure_code_profile;
    } else { // cmd is well-formed
      cmd_getval(g_ceph_context, cmdmap, "expected_num_objects",
                 expected_num_objects, int64_t(0));
    }
  }

  // an explicitly named rule must already exist (or be about to, -EAGAIN)
  if (!implicit_rule_creation && rule_name != "") {
    int rule;
    err = get_crush_rule(rule_name, &rule, &ss);
    if (err == -EAGAIN) {
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }
    if (err)
      goto reply;
  }

  if (expected_num_objects < 0) {
    ss << "'expected_num_objects' must be non-negative";
    err = -EINVAL;
    goto reply;
  }

  // filestore-specific sanity checks around directory merge/split tuning
  if (expected_num_objects > 0 &&
      cct->_conf->osd_objectstore == "filestore" &&
      cct->_conf->filestore_merge_threshold > 0) {
    ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
    err = -EINVAL;
    goto reply;
  }

  if (expected_num_objects == 0 &&
      cct->_conf->osd_objectstore == "filestore" &&
      cct->_conf->filestore_merge_threshold < 0) {
    // advisory only; does not block creation
    int osds = osdmap.get_num_osds();
    if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
      ss << "For better initial performance on pools expected to store a "
         << "large number of objects, consider supplying the "
         << "expected_num_objects parameter when creating the pool.\n";
    }
  }

  // fast_read: -1 = pool default, 0 = off, >0 = on
  int64_t fast_read_param;
  cmd_getval(cct, cmdmap, "fast_read", fast_read_param, int64_t(-1));
  FastReadType fast_read = FAST_READ_DEFAULT;
  if (fast_read_param == 0)
    fast_read = FAST_READ_OFF;
  else if (fast_read_param > 0)
    fast_read = FAST_READ_ON;

  int64_t repl_size = 0;
  cmd_getval(cct, cmdmap, "size", repl_size);
  int64_t target_size_bytes = 0;
  double target_size_ratio = 0.0;
  cmd_getval(cct, cmdmap, "target_size_bytes", target_size_bytes);
  cmd_getval(cct, cmdmap, "target_size_ratio", target_size_ratio);

  err = prepare_new_pool(poolstr,
                         -1, // default crush rule
                         rule_name,
                         pg_num, pgp_num, pg_num_min,
                         repl_size, target_size_bytes, target_size_ratio,
                         erasure_code_profile, pool_type,
                         (uint64_t)expected_num_objects,
                         fast_read,
                         &ss);
  if (err < 0) {
    switch(err) {
    case -EEXIST:
      // falls out of the switch and replies via the proposal below with
      // retcode 0 -- pool creation is idempotent
      ss << "pool '" << poolstr << "' already exists";
      break;
    case -EAGAIN:
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    case -ERANGE:
      goto reply;
    default:
      goto reply;
      break;
    }
  } else {
    ss << "pool '" << poolstr << "' created";
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
  return true;

} else if (prefix == "osd pool delete" ||
           prefix == "osd pool rm") {
  // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
  string pool_name, pool_name_again;
  cmd_getval(cct, cmdmap, "pool", pool_name);
  cmd_getval(cct, cmdmap, "pool2", pool_name_again);
  int64_t pool_id = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool_id < 0) {
    ss << "pool '" << pool_name << "' does not exist";
    err = 0; // idempotent
    goto reply;
  }

  bool really_mean_it = false;
  cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it", really_mean_it);
  bool not_faking = false;
  cmd_getval(cct, cmdmap, "yes_i_really_really_mean_it_not_faking", not_faking);
  // deletion requires the pool name twice plus one of the confirmation flags
  const bool confirmed =
    (pool_name_again == pool_name) && (really_mean_it || not_faking);
  if (!confirmed) {
    ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << pool_name
       << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
       << "followed by --yes-i-really-really-mean-it.";
    err = -EPERM;
    goto reply;
  }
  err = _prepare_remove_pool(pool_id, &ss, really_mean_it);
  if (err == -EAGAIN) {
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }
  if (err < 0)
    goto reply;
  goto update;
} else if (prefix == "osd pool rename") {
  // Rename a pool; idempotent when the source is gone but the destination
  // already exists.
  string src_name, dest_name;
  cmd_getval(cct, cmdmap, "srcpool", src_name);
  cmd_getval(cct, cmdmap, "destpool", dest_name);
  int64_t pool_src = osdmap.lookup_pg_pool_name(src_name.c_str());
  int64_t pool_dst = osdmap.lookup_pg_pool_name(dest_name.c_str());

  if (pool_src < 0 && pool_dst >= 0) {
    // src pool doesn't exist, dst pool does exist: to ensure idempotency
    // of operations, assume this rename succeeded, as it is not changing
    // the current state.  Make sure we output something understandable
    // for whoever is issuing the command, if they are paying attention,
    // in case it was not intentional; or to avoid a "wtf?" and a bug
    // report in case it was intentional, while expecting a failure.
    ss << "pool '" << src_name << "' does not exist; pool '"
       << dest_name << "' does -- assuming successful rename";
    err = 0;
    goto reply;
  }
  if (pool_src < 0) {
    ss << "unrecognized pool '" << src_name << "'";
    err = -ENOENT;
    goto reply;
  }
  if (pool_dst >= 0) {
    // source pool exists and so does the destination pool
    ss << "pool '" << dest_name << "' already exists";
    err = -EEXIST;
    goto reply;
  }

  int ret = _prepare_rename_pool(pool_src, dest_name);
  if (ret == 0) {
    ss << "pool '" << src_name << "' renamed to '" << dest_name << "'";
  } else {
    ss << "failed to rename pool '" << src_name << "' to '" << dest_name << "': "
       << cpp_strerror(ret);
  }
  getline(ss, rs);
  wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
                                                get_last_committed() + 1));
  return true;

11785 } else if (prefix == "osd pool set") {
11786 err = prepare_command_pool_set(cmdmap, ss);
11787 if (err == -EAGAIN)
11788 goto wait;
11789 if (err < 0)
11790 goto reply;
11791
11792 getline(ss, rs);
11793 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11794 get_last_committed() + 1));
11795 return true;
11796 } else if (prefix == "osd tier add") {
11797 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11798 if (err == -EAGAIN)
11799 goto wait;
11800 if (err)
11801 goto reply;
11802 string poolstr;
11803 cmd_getval(cct, cmdmap, "pool", poolstr);
11804 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11805 if (pool_id < 0) {
11806 ss << "unrecognized pool '" << poolstr << "'";
11807 err = -ENOENT;
11808 goto reply;
11809 }
11810 string tierpoolstr;
11811 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11812 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11813 if (tierpool_id < 0) {
11814 ss << "unrecognized pool '" << tierpoolstr << "'";
11815 err = -ENOENT;
11816 goto reply;
11817 }
11818 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11819 ceph_assert(p);
11820 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11821 ceph_assert(tp);
11822
11823 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
11824 goto reply;
11825 }
11826
11827 // make sure new tier is empty
11828 string force_nonempty;
11829 cmd_getval(cct, cmdmap, "force_nonempty", force_nonempty);
11830 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
11831 if (pstats && pstats->stats.sum.num_objects != 0 &&
11832 force_nonempty != "--force-nonempty") {
11833 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
11834 err = -ENOTEMPTY;
11835 goto reply;
11836 }
11837 if (tp->is_erasure()) {
11838 ss << "tier pool '" << tierpoolstr
11839 << "' is an ec pool, which cannot be a tier";
11840 err = -ENOTSUP;
11841 goto reply;
11842 }
11843 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
11844 ((force_nonempty != "--force-nonempty") ||
11845 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
11846 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
11847 err = -ENOTEMPTY;
11848 goto reply;
11849 }
11850 // go
11851 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11852 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11853 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
11854 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11855 return true;
11856 }
11857 np->tiers.insert(tierpool_id);
11858 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
11859 ntp->tier_of = pool_id;
11860 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
11861 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11862 get_last_committed() + 1));
11863 return true;
11864 } else if (prefix == "osd tier remove" ||
11865 prefix == "osd tier rm") {
11866 string poolstr;
11867 cmd_getval(cct, cmdmap, "pool", poolstr);
11868 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11869 if (pool_id < 0) {
11870 ss << "unrecognized pool '" << poolstr << "'";
11871 err = -ENOENT;
11872 goto reply;
11873 }
11874 string tierpoolstr;
11875 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
11876 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
11877 if (tierpool_id < 0) {
11878 ss << "unrecognized pool '" << tierpoolstr << "'";
11879 err = -ENOENT;
11880 goto reply;
11881 }
11882 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11883 ceph_assert(p);
11884 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
11885 ceph_assert(tp);
11886
11887 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
11888 goto reply;
11889 }
11890
11891 if (p->tiers.count(tierpool_id) == 0) {
11892 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11893 err = 0;
11894 goto reply;
11895 }
11896 if (tp->tier_of != pool_id) {
11897 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
11898 << osdmap.get_pool_name(tp->tier_of) << "': "
11899 // be scary about it; this is an inconsistency and bells must go off
11900 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
11901 err = -EINVAL;
11902 goto reply;
11903 }
11904 if (p->read_tier == tierpool_id) {
11905 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
11906 err = -EBUSY;
11907 goto reply;
11908 }
11909 // go
11910 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11911 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
11912 if (np->tiers.count(tierpool_id) == 0 ||
11913 ntp->tier_of != pool_id ||
11914 np->read_tier == tierpool_id) {
11915 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11916 return true;
11917 }
11918 np->tiers.erase(tierpool_id);
11919 ntp->clear_tier();
11920 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
11921 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11922 get_last_committed() + 1));
11923 return true;
11924 } else if (prefix == "osd tier set-overlay") {
11925 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
11926 if (err == -EAGAIN)
11927 goto wait;
11928 if (err)
11929 goto reply;
11930 string poolstr;
11931 cmd_getval(cct, cmdmap, "pool", poolstr);
11932 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11933 if (pool_id < 0) {
11934 ss << "unrecognized pool '" << poolstr << "'";
11935 err = -ENOENT;
11936 goto reply;
11937 }
11938 string overlaypoolstr;
11939 cmd_getval(cct, cmdmap, "overlaypool", overlaypoolstr);
11940 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
11941 if (overlaypool_id < 0) {
11942 ss << "unrecognized pool '" << overlaypoolstr << "'";
11943 err = -ENOENT;
11944 goto reply;
11945 }
11946 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11947 ceph_assert(p);
11948 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
11949 ceph_assert(overlay_p);
11950 if (p->tiers.count(overlaypool_id) == 0) {
11951 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
11952 err = -EINVAL;
11953 goto reply;
11954 }
11955 if (p->read_tier == overlaypool_id) {
11956 err = 0;
11957 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11958 goto reply;
11959 }
11960 if (p->has_read_tier()) {
11961 ss << "pool '" << poolstr << "' has overlay '"
11962 << osdmap.get_pool_name(p->read_tier)
11963 << "'; please remove-overlay first";
11964 err = -EINVAL;
11965 goto reply;
11966 }
11967
11968 // go
11969 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
11970 np->read_tier = overlaypool_id;
11971 np->write_tier = overlaypool_id;
11972 np->set_last_force_op_resend(pending_inc.epoch);
11973 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
11974 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
11975 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
11976 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
11977 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
11978 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
11979 get_last_committed() + 1));
11980 return true;
11981 } else if (prefix == "osd tier remove-overlay" ||
11982 prefix == "osd tier rm-overlay") {
11983 string poolstr;
11984 cmd_getval(cct, cmdmap, "pool", poolstr);
11985 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
11986 if (pool_id < 0) {
11987 ss << "unrecognized pool '" << poolstr << "'";
11988 err = -ENOENT;
11989 goto reply;
11990 }
11991 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
11992 ceph_assert(p);
11993 if (!p->has_read_tier()) {
11994 err = 0;
11995 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
11996 goto reply;
11997 }
11998
11999 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12000 goto reply;
12001 }
12002
12003 // go
12004 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12005 if (np->has_read_tier()) {
12006 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12007 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12008 nop->set_last_force_op_resend(pending_inc.epoch);
12009 }
12010 if (np->has_write_tier()) {
12011 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12012 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12013 nop->set_last_force_op_resend(pending_inc.epoch);
12014 }
12015 np->clear_read_tier();
12016 np->clear_write_tier();
12017 np->set_last_force_op_resend(pending_inc.epoch);
12018 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12019 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12020 get_last_committed() + 1));
12021 return true;
12022 } else if (prefix == "osd tier cache-mode") {
12023 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12024 if (err == -EAGAIN)
12025 goto wait;
12026 if (err)
12027 goto reply;
12028 string poolstr;
12029 cmd_getval(cct, cmdmap, "pool", poolstr);
12030 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12031 if (pool_id < 0) {
12032 ss << "unrecognized pool '" << poolstr << "'";
12033 err = -ENOENT;
12034 goto reply;
12035 }
12036 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12037 ceph_assert(p);
12038 if (!p->is_tier()) {
12039 ss << "pool '" << poolstr << "' is not a tier";
12040 err = -EINVAL;
12041 goto reply;
12042 }
12043 string modestr;
12044 cmd_getval(cct, cmdmap, "mode", modestr);
12045 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12046 if (mode < 0) {
12047 ss << "'" << modestr << "' is not a valid cache mode";
12048 err = -EINVAL;
12049 goto reply;
12050 }
12051
12052 bool sure = false;
12053 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12054
12055 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12056 mode != pg_pool_t::CACHEMODE_NONE &&
12057 mode != pg_pool_t::CACHEMODE_PROXY &&
12058 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12059 !sure) {
12060 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12061 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12062 err = -EPERM;
12063 goto reply;
12064 }
12065
12066 // pool already has this cache-mode set and there are no pending changes
12067 if (p->cache_mode == mode &&
12068 (pending_inc.new_pools.count(pool_id) == 0 ||
12069 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12070 ss << "set cache-mode for pool '" << poolstr << "'"
12071 << " to " << pg_pool_t::get_cache_mode_name(mode);
12072 err = 0;
12073 goto reply;
12074 }
12075
12076 /* Mode description:
12077 *
12078 * none: No cache-mode defined
12079 * forward: Forward all reads and writes to base pool
12080 * writeback: Cache writes, promote reads from base pool
12081 * readonly: Forward writes to base pool
12082 * readforward: Writes are in writeback mode, Reads are in forward mode
12083 * proxy: Proxy all reads and writes to base pool
12084 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12085 *
12086 * Hence, these are the allowed transitions:
12087 *
12088 * none -> any
12089 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12090 * proxy -> forward || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12091 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12092 * readproxy -> forward || proxy || readforward || writeback || any IF num_objects_dirty == 0
12093 * writeback -> readforward || readproxy || forward || proxy
12094 * readonly -> any
12095 */
12096
12097 // We check if the transition is valid against the current pool mode, as
12098 // it is the only committed state thus far. We will blantly squash
12099 // whatever mode is on the pending state.
12100
12101 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12102 (mode != pg_pool_t::CACHEMODE_FORWARD &&
12103 mode != pg_pool_t::CACHEMODE_PROXY &&
12104 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12105 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12106 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12107 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12108 << "' pool; only '"
12109 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_FORWARD)
12110 << "','"
12111 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12112 << "','"
12113 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READFORWARD)
12114 << "','"
12115 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12116 << "' allowed.";
12117 err = -EINVAL;
12118 goto reply;
12119 }
12120 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12121 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12122 mode != pg_pool_t::CACHEMODE_FORWARD &&
12123 mode != pg_pool_t::CACHEMODE_PROXY &&
12124 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12125
12126 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12127 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12128 mode != pg_pool_t::CACHEMODE_FORWARD &&
12129 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12130 mode != pg_pool_t::CACHEMODE_PROXY)) ||
12131
12132 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
12133 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12134 mode != pg_pool_t::CACHEMODE_FORWARD &&
12135 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12136 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12137
12138 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
12139 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12140 mode != pg_pool_t::CACHEMODE_READFORWARD &&
12141 mode != pg_pool_t::CACHEMODE_PROXY &&
12142 mode != pg_pool_t::CACHEMODE_READPROXY))) {
12143
12144 const pool_stat_t* pstats =
12145 mon->mgrstatmon()->get_pool_stat(pool_id);
12146
12147 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
12148 ss << "unable to set cache-mode '"
12149 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
12150 << "': dirty objects found";
12151 err = -EBUSY;
12152 goto reply;
12153 }
12154 }
12155 // go
12156 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12157 np->cache_mode = mode;
12158 // set this both when moving to and from cache_mode NONE. this is to
12159 // capture legacy pools that were set up before this flag existed.
12160 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
12161 ss << "set cache-mode for pool '" << poolstr
12162 << "' to " << pg_pool_t::get_cache_mode_name(mode);
12163 if (mode == pg_pool_t::CACHEMODE_NONE) {
12164 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
12165 ceph_assert(base_pool);
12166 if (base_pool->read_tier == pool_id ||
12167 base_pool->write_tier == pool_id)
12168 ss <<" (WARNING: pool is still configured as read or write tier)";
12169 }
12170 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12171 get_last_committed() + 1));
12172 return true;
12173 } else if (prefix == "osd tier add-cache") {
12174 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12175 if (err == -EAGAIN)
12176 goto wait;
12177 if (err)
12178 goto reply;
12179 string poolstr;
12180 cmd_getval(cct, cmdmap, "pool", poolstr);
12181 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12182 if (pool_id < 0) {
12183 ss << "unrecognized pool '" << poolstr << "'";
12184 err = -ENOENT;
12185 goto reply;
12186 }
12187 string tierpoolstr;
12188 cmd_getval(cct, cmdmap, "tierpool", tierpoolstr);
12189 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12190 if (tierpool_id < 0) {
12191 ss << "unrecognized pool '" << tierpoolstr << "'";
12192 err = -ENOENT;
12193 goto reply;
12194 }
12195 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12196 ceph_assert(p);
12197 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12198 ceph_assert(tp);
12199
12200 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12201 goto reply;
12202 }
12203
12204 int64_t size = 0;
12205 if (!cmd_getval(cct, cmdmap, "size", size)) {
12206 ss << "unable to parse 'size' value '"
12207 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
12208 err = -EINVAL;
12209 goto reply;
12210 }
12211 // make sure new tier is empty
12212 const pool_stat_t *pstats =
12213 mon->mgrstatmon()->get_pool_stat(tierpool_id);
12214 if (pstats && pstats->stats.sum.num_objects != 0) {
12215 ss << "tier pool '" << tierpoolstr << "' is not empty";
12216 err = -ENOTEMPTY;
12217 goto reply;
12218 }
12219 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
12220 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12221 if (mode < 0) {
12222 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
12223 err = -EINVAL;
12224 goto reply;
12225 }
12226 HitSet::Params hsp;
12227 auto& cache_hit_set_type =
12228 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
12229 if (cache_hit_set_type == "bloom") {
12230 BloomHitSet::Params *bsp = new BloomHitSet::Params;
12231 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
12232 hsp = HitSet::Params(bsp);
12233 } else if (cache_hit_set_type == "explicit_hash") {
12234 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
12235 } else if (cache_hit_set_type == "explicit_object") {
12236 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
12237 } else {
12238 ss << "osd tier cache default hit set type '"
12239 << cache_hit_set_type << "' is not a known type";
12240 err = -EINVAL;
12241 goto reply;
12242 }
12243 // go
12244 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12245 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12246 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12247 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12248 return true;
12249 }
12250 np->tiers.insert(tierpool_id);
12251 np->read_tier = np->write_tier = tierpool_id;
12252 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12253 np->set_last_force_op_resend(pending_inc.epoch);
12254 ntp->set_last_force_op_resend(pending_inc.epoch);
12255 ntp->tier_of = pool_id;
12256 ntp->cache_mode = mode;
12257 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
12258 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
12259 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
12260 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
12261 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
12262 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
12263 ntp->hit_set_params = hsp;
12264 ntp->target_max_bytes = size;
12265 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
12266 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12267 get_last_committed() + 1));
12268 return true;
12269 } else if (prefix == "osd pool set-quota") {
12270 string poolstr;
12271 cmd_getval(cct, cmdmap, "pool", poolstr);
12272 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12273 if (pool_id < 0) {
12274 ss << "unrecognized pool '" << poolstr << "'";
12275 err = -ENOENT;
12276 goto reply;
12277 }
12278
12279 string field;
12280 cmd_getval(cct, cmdmap, "field", field);
12281 if (field != "max_objects" && field != "max_bytes") {
12282 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
12283 err = -EINVAL;
12284 goto reply;
12285 }
12286
12287 // val could contain unit designations, so we treat as a string
12288 string val;
12289 cmd_getval(cct, cmdmap, "val", val);
12290 string tss;
12291 int64_t value;
12292 if (field == "max_objects") {
12293 value = strict_sistrtoll(val.c_str(), &tss);
12294 } else if (field == "max_bytes") {
12295 value = strict_iecstrtoll(val.c_str(), &tss);
12296 } else {
12297 ceph_abort_msg("unrecognized option");
12298 }
12299 if (!tss.empty()) {
12300 ss << "error parsing value '" << val << "': " << tss;
12301 err = -EINVAL;
12302 goto reply;
12303 }
12304
12305 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
12306 if (field == "max_objects") {
12307 pi->quota_max_objects = value;
12308 } else if (field == "max_bytes") {
12309 pi->quota_max_bytes = value;
12310 } else {
12311 ceph_abort_msg("unrecognized option");
12312 }
12313 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
12314 rs = ss.str();
12315 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12316 get_last_committed() + 1));
12317 return true;
12318 } else if (prefix == "osd pool application enable" ||
12319 prefix == "osd pool application disable" ||
12320 prefix == "osd pool application set" ||
12321 prefix == "osd pool application rm") {
12322 err = prepare_command_pool_application(prefix, cmdmap, ss);
12323 if (err == -EAGAIN) {
12324 goto wait;
12325 } else if (err < 0) {
12326 goto reply;
12327 } else {
12328 goto update;
12329 }
12330 } else if (prefix == "osd force-create-pg") {
12331 pg_t pgid;
12332 string pgidstr;
12333 cmd_getval(cct, cmdmap, "pgid", pgidstr);
12334 if (!pgid.parse(pgidstr.c_str())) {
12335 ss << "invalid pgid '" << pgidstr << "'";
12336 err = -EINVAL;
12337 goto reply;
12338 }
12339 if (!osdmap.pg_exists(pgid)) {
12340 ss << "pg " << pgid << " should not exist";
12341 err = -ENOENT;
12342 goto reply;
12343 }
12344 bool sure = false;
12345 cmd_getval(cct, cmdmap, "yes_i_really_mean_it", sure);
12346 if (!sure) {
12347 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
12348 << "that the cluster will give up ever trying to recover the lost data. Do this "
12349 << "only if you are certain that all copies of the PG are in fact lost and you are "
12350 << "willing to accept that the data is permanently destroyed. Pass "
12351 << "--yes-i-really-mean-it to proceed.";
12352 err = -EPERM;
12353 goto reply;
12354 }
12355 bool creating_now;
12356 {
12357 std::lock_guard<std::mutex> l(creating_pgs_lock);
12358 auto emplaced = creating_pgs.pgs.emplace(pgid,
12359 make_pair(osdmap.get_epoch(),
12360 ceph_clock_now()));
12361 creating_now = emplaced.second;
12362 }
12363 if (creating_now) {
12364 ss << "pg " << pgidstr << " now creating, ok";
12365 // set the pool's CREATING flag so that (1) the osd won't ignore our
12366 // create message and (2) we won't propose any future pg_num changes
12367 // until after the PG has been instantiated.
12368 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
12369 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
12370 }
12371 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
12372 err = 0;
12373 goto update;
12374 } else {
12375 ss << "pg " << pgid << " already creating";
12376 err = 0;
12377 goto reply;
12378 }
12379 } else {
12380 err = -EINVAL;
12381 }
12382
12383 reply:
12384 getline(ss, rs);
12385 if (err < 0 && rs.length() == 0)
12386 rs = cpp_strerror(err);
12387 mon->reply_command(op, err, rs, rdata, get_last_committed());
12388 return ret;
12389
12390 update:
12391 getline(ss, rs);
12392 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12393 get_last_committed() + 1));
12394 return true;
12395
12396 wait:
12397 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12398 return true;
12399 }
12400
12401 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
12402 {
12403 op->mark_osdmon_event(__func__);
12404
12405 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12406 MonSession *session = op->get_session();
12407 if (!session) {
12408 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12409 return true;
12410 }
12411
12412 switch (m->op) {
12413 case POOL_OP_CREATE_UNMANAGED_SNAP:
12414 case POOL_OP_DELETE_UNMANAGED_SNAP:
12415 {
12416 const std::string* pool_name = nullptr;
12417 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
12418 if (pg_pool != nullptr) {
12419 pool_name = &osdmap.get_pool_name(m->pool);
12420 }
12421
12422 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
12423 session->entity_name, session->caps,
12424 session->get_peer_socket_addr(),
12425 pool_name)) {
12426 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
12427 << "privileges. message: " << *m << std::endl
12428 << "caps: " << session->caps << dendl;
12429 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12430 return true;
12431 }
12432 }
12433 break;
12434 default:
12435 if (!session->is_capable("osd", MON_CAP_W)) {
12436 dout(0) << "got pool op from entity with insufficient privileges. "
12437 << "message: " << *m << std::endl
12438 << "caps: " << session->caps << dendl;
12439 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
12440 return true;
12441 }
12442 break;
12443 }
12444
12445 return false;
12446 }
12447
12448 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
12449 {
12450 op->mark_osdmon_event(__func__);
12451 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12452
12453 if (enforce_pool_op_caps(op)) {
12454 return true;
12455 }
12456
12457 if (m->fsid != mon->monmap->fsid) {
12458 dout(0) << __func__ << " drop message on fsid " << m->fsid
12459 << " != " << mon->monmap->fsid << " for " << *m << dendl;
12460 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12461 return true;
12462 }
12463
12464 if (m->op == POOL_OP_CREATE)
12465 return preprocess_pool_op_create(op);
12466
12467 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
12468 if (p == nullptr) {
12469 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
12470 if (m->op == POOL_OP_DELETE) {
12471 _pool_op_reply(op, 0, osdmap.get_epoch());
12472 } else {
12473 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
12474 }
12475 return true;
12476 }
12477
12478 // check if the snap and snapname exist
12479 bool snap_exists = false;
12480 if (p->snap_exists(m->name.c_str()))
12481 snap_exists = true;
12482
12483 switch (m->op) {
12484 case POOL_OP_CREATE_SNAP:
12485 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
12486 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12487 return true;
12488 }
12489 if (snap_exists) {
12490 _pool_op_reply(op, 0, osdmap.get_epoch());
12491 return true;
12492 }
12493 return false;
12494 case POOL_OP_CREATE_UNMANAGED_SNAP:
12495 if (p->is_pool_snaps_mode()) {
12496 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12497 return true;
12498 }
12499 return false;
12500 case POOL_OP_DELETE_SNAP:
12501 if (p->is_unmanaged_snaps_mode()) {
12502 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12503 return true;
12504 }
12505 if (!snap_exists) {
12506 _pool_op_reply(op, 0, osdmap.get_epoch());
12507 return true;
12508 }
12509 return false;
12510 case POOL_OP_DELETE_UNMANAGED_SNAP:
12511 if (p->is_pool_snaps_mode()) {
12512 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
12513 return true;
12514 }
12515 if (p->is_removed_snap(m->snapid)) {
12516 _pool_op_reply(op, 0, osdmap.get_epoch());
12517 return true;
12518 }
12519 return false;
12520 case POOL_OP_DELETE:
12521 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
12522 _pool_op_reply(op, 0, osdmap.get_epoch());
12523 return true;
12524 }
12525 return false;
12526 case POOL_OP_AUID_CHANGE:
12527 return false;
12528 default:
12529 ceph_abort();
12530 break;
12531 }
12532
12533 return false;
12534 }
12535
12536 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
12537 {
12538 op->mark_osdmon_event(__func__);
12539 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12540 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
12541 if (pool >= 0) {
12542 _pool_op_reply(op, 0, osdmap.get_epoch());
12543 return true;
12544 }
12545
12546 return false;
12547 }
12548
// Prepare phase for pool ops: validate the request against the committed
// pool state, then apply snapshot changes to a projected copy of the pool
// and stage it in pending_inc.  Returns true when a proposal has been
// queued (reply sent after commit), false when an immediate reply was
// already sent.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have their own dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  // committed (not pending) view of the pool, used for validation below
  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    // pool-managed snaps: answer immediately when the op is an idempotent
    // no-op (snap already exists / already gone) or the mode is wrong
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;  // real work to do; fall out to the projected-state pass
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending copy if one is already
  // staged this epoch, otherwise from the committed map
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the *projected* state, which may differ
  // from the committed state validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // apply the actual change to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
               << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
        pp.remove_snap(s);
        // record the removal so OSDs can trim the snap's objects
        pending_inc.new_removed_snaps[m->pool].insert(s);
        changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      uint64_t snapid;  // out-param, filled in by add_unmanaged_snap()
      pp.add_unmanaged_snap(snapid);
      // send the newly allocated snapid back to the client
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!pp.is_removed_snap(m->snapid)) {
      // a snapid beyond the pool's sequence was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(m->snapid);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; reject explicitly
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool for the next map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any reply_data) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
12696
12697 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
12698 {
12699 op->mark_osdmon_event(__func__);
12700 int err = prepare_new_pool(op);
12701 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
12702 return true;
12703 }
12704
12705 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
12706 ostream *ss)
12707 {
12708 const string& poolstr = osdmap.get_pool_name(pool_id);
12709
12710 // If the Pool is in use by CephFS, refuse to delete it
12711 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12712 if (pending_fsmap.pool_in_use(pool_id)) {
12713 *ss << "pool '" << poolstr << "' is in use by CephFS";
12714 return -EBUSY;
12715 }
12716
12717 if (pool.tier_of >= 0) {
12718 *ss << "pool '" << poolstr << "' is a tier of '"
12719 << osdmap.get_pool_name(pool.tier_of) << "'";
12720 return -EBUSY;
12721 }
12722 if (!pool.tiers.empty()) {
12723 *ss << "pool '" << poolstr << "' has tiers";
12724 for(auto tier : pool.tiers) {
12725 *ss << " " << osdmap.get_pool_name(tier);
12726 }
12727 return -EBUSY;
12728 }
12729
12730 if (!g_conf()->mon_allow_pool_delete) {
12731 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
12732 return -EPERM;
12733 }
12734
12735 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
12736 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
12737 return -EPERM;
12738 }
12739
12740 *ss << "pool '" << poolstr << "' removed";
12741 return 0;
12742 }
12743
12744 /**
12745 * Check if it is safe to add a tier to a base pool
12746 *
12747 * @return
12748 * True if the operation should proceed, false if we should abort here
12749 * (abort doesn't necessarily mean error, could be idempotency)
12750 */
12751 bool OSDMonitor::_check_become_tier(
12752 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
12753 const int64_t base_pool_id, const pg_pool_t *base_pool,
12754 int *err,
12755 ostream *ss) const
12756 {
12757 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
12758 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12759
12760 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12761 if (pending_fsmap.pool_in_use(tier_pool_id)) {
12762 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
12763 *err = -EBUSY;
12764 return false;
12765 }
12766
12767 if (base_pool->tiers.count(tier_pool_id)) {
12768 ceph_assert(tier_pool->tier_of == base_pool_id);
12769 *err = 0;
12770 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
12771 << base_pool_name << "'";
12772 return false;
12773 }
12774
12775 if (base_pool->is_tier()) {
12776 *ss << "pool '" << base_pool_name << "' is already a tier of '"
12777 << osdmap.get_pool_name(base_pool->tier_of) << "', "
12778 << "multiple tiers are not yet supported.";
12779 *err = -EINVAL;
12780 return false;
12781 }
12782
12783 if (tier_pool->has_tiers()) {
12784 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
12785 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
12786 it != tier_pool->tiers.end(); ++it)
12787 *ss << "'" << osdmap.get_pool_name(*it) << "',";
12788 *ss << " multiple tiers are not yet supported.";
12789 *err = -EINVAL;
12790 return false;
12791 }
12792
12793 if (tier_pool->is_tier()) {
12794 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
12795 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
12796 *err = -EINVAL;
12797 return false;
12798 }
12799
12800 *err = 0;
12801 return true;
12802 }
12803
12804
12805 /**
12806 * Check if it is safe to remove a tier from this base pool
12807 *
12808 * @return
12809 * True if the operation should proceed, false if we should abort here
12810 * (abort doesn't necessarily mean error, could be idempotency)
12811 */
12812 bool OSDMonitor::_check_remove_tier(
12813 const int64_t base_pool_id, const pg_pool_t *base_pool,
12814 const pg_pool_t *tier_pool,
12815 int *err, ostream *ss) const
12816 {
12817 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
12818
12819 // Apply CephFS-specific checks
12820 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
12821 if (pending_fsmap.pool_in_use(base_pool_id)) {
12822 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
12823 // If the underlying pool is erasure coded and does not allow EC
12824 // overwrites, we can't permit the removal of the replicated tier that
12825 // CephFS relies on to access it
12826 *ss << "pool '" << base_pool_name <<
12827 "' does not allow EC overwrites and is in use by CephFS"
12828 " via its tier";
12829 *err = -EBUSY;
12830 return false;
12831 }
12832
12833 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
12834 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
12835 "tier is still in use as a writeback cache. Change the cache "
12836 "mode and flush the cache before removing it";
12837 *err = -EBUSY;
12838 return false;
12839 }
12840 }
12841
12842 *err = 0;
12843 return true;
12844 }
12845
/**
 * Queue deletion of a pool in pending_inc, along with cleanup of all
 * auxiliary OSDMap state that references it (pg_temp, primary_temp,
 * pg_upmap, pg_upmap_items and crush choose_args).
 *
 * @param pool     id of the pool to remove
 * @param ss       receives a human-readable success/failure message
 * @param no_fake  if true, really delete even when mon_fake_pool_delete
 *                 is set (that option otherwise turns deletion into a
 *                 rename to "<name>.<id>.DELETED")
 * @return 0 on success (including "already pending removal"), -EAGAIN if
 *         the caller should retry after the pending map commits, or a
 *         negative errno from _check_remove_pool
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  // check the committed pool state first
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal may already be queued in this increment
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // mon_fake_pool_delete turns deletion into a rename (a soft delete),
  // unless the caller explicitly asked for a real one via no_fake
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
12961
12962 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
12963 {
12964 dout(10) << "_prepare_rename_pool " << pool << dendl;
12965 if (pending_inc.old_pools.count(pool)) {
12966 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
12967 return -ENOENT;
12968 }
12969 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
12970 p != pending_inc.new_pool_names.end();
12971 ++p) {
12972 if (p->second == newname && p->first != pool) {
12973 return -EEXIST;
12974 }
12975 }
12976
12977 pending_inc.new_pool_names[pool] = newname;
12978 return 0;
12979 }
12980
12981 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
12982 {
12983 op->mark_osdmon_event(__func__);
12984 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
12985 ostringstream ss;
12986 int ret = _prepare_remove_pool(m->pool, &ss, false);
12987 if (ret == -EAGAIN) {
12988 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12989 return true;
12990 }
12991 if (ret < 0)
12992 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
12993 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
12994 pending_inc.epoch));
12995 return true;
12996 }
12997
12998 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
12999 int ret, epoch_t epoch, bufferlist *blp)
13000 {
13001 op->mark_osdmon_event(__func__);
13002 MPoolOp *m = static_cast<MPoolOp*>(op->get_req());
13003 dout(20) << "_pool_op_reply " << ret << dendl;
13004 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13005 ret, epoch, get_last_committed(), blp);
13006 mon->send_reply(op, reply);
13007 }
13008
13009 void OSDMonitor::convert_pool_priorities(void)
13010 {
13011 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
13012 int64_t max_prio = 0;
13013 int64_t min_prio = 0;
13014 for (const auto &i : osdmap.get_pools()) {
13015 const auto &pool = i.second;
13016
13017 if (pool.opts.is_set(key)) {
13018 int64_t prio;
13019 pool.opts.get(key, &prio);
13020 if (prio > max_prio)
13021 max_prio = prio;
13022 if (prio < min_prio)
13023 min_prio = prio;
13024 }
13025 }
13026 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
13027 dout(20) << __func__ << " nothing to fix" << dendl;
13028 return;
13029 }
13030 // Current pool priorities exceeds new maximum
13031 for (const auto &i : osdmap.get_pools()) {
13032 const auto pool_id = i.first;
13033 pg_pool_t pool = i.second;
13034
13035 int64_t prio = 0;
13036 pool.opts.get(key, &prio);
13037 int64_t n;
13038
13039 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
13040 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13041 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
13042 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
13043 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13044 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
13045 } else {
13046 continue;
13047 }
13048 if (n == 0) {
13049 pool.opts.unset(key);
13050 } else {
13051 pool.opts.set(key, static_cast<int64_t>(n));
13052 }
13053 dout(10) << __func__ << " pool " << pool_id
13054 << " recovery_priority adjusted "
13055 << prio << " to " << n << dendl;
13056 pool.last_change = pending_inc.epoch;
13057 pending_inc.new_pools[pool_id] = pool;
13058 }
13059 }