// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrubber/scrub_machine.h"
#include "osd/scrubber/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"
#include "include/scope_guard.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#include "osd_tracer.h"


#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;
using TOPNSPC::common::cmd_getval_or;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}


// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  m_scrub_queue{cct, *this},
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

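// Scan the recorded pg_num history for this PG's pool across
// (old_map, new_map] and collect every split child and merge
// participant we may need to instantiate. Illustrative example (not
// taken from the code): with pg_num 4 -> 8, pg x.1 gains split child
// x.5; with pg_num 8 -> 4, x.5 is a merge source and x.1 its target.
// The BFS over 'queue' re-scans parents/children discovered along the
// way so that intervening split/merge sequences are not missed.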
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge). note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

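// Tiering agent main loop: repeatedly pick the highest "evict effort"
// tier from agent_queue and let one of its PGs do flush/evict work,
// sleeping on agent_cond whenever the queue is empty, the op budget is
// exhausted, or the agent has been deactivated (e.g. via the
// CEPH_OSDMAP_NOTIERAGENT flag; see activate_map() above).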
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

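// Recompute promote_probability_millis (promotion probability in
// thousandths) from the configured object/byte rate targets. A rough
// worked example with illustrative numbers: for
// osd_tier_promote_max_objects_sec = 100, dur = 1s and 1000 promote
// attempts in the interval, po = 100 * 1 * 1000 / 1000 = 100, i.e. a
// 10% promotion probability before skew correction and smoothing.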
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
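  // Illustrative example of the clamping below: with a failsafe of .97
  // and OSDMap ratios full=.95, backfillfull=.90, nearfull=.85, we end
  // up with nearfull <= backfillfull <= full <= failsafe; raising any
  // OSDMap ratio above the failsafe drags the failsafe up with it.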
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return
    // failsafe full, or -1 to always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing, fake statfs values so it doesn't matter whether all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing we don't want 'used' to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

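// Return used/total as it would look after consuming adjust_used
// additional bytes and after accounting for data still expected from
// pending backfills (each PG may bump the stats via pg_stat_adjust).
// *pratio reports the unadjusted, physical usage ratio.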
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

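// Send a message to a peer OSD using the reserved "next" map, dropping
// it instead if that map shows the peer as down or as having restarted
// after from_epoch.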
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

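// Move all wanted pg_temp entries into the pending set. Note that
// std::map::merge only transfers keys not already present in
// pg_temp_pending; any duplicates left behind in pg_temp_wanted are
// then discarded by clear(), keeping the previously pending value.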
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true,  // request ack
        true   // mark as down and dead
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
                              [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

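// Build an MOSDMap covering (since, to]: incremental maps where we
// have them, falling back to a full map when an incremental is
// missing, bounded by osd_map_message_max / osd_map_message_max_bytes.
// If nothing at all can be loaded, we still try to send the newest map
// so the peer can make progress.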
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

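// Insert a decoded OSDMap into the map cache, optionally deduplicating
// shared state against a cached map at a nearby epoch to save memory.
// If another caller raced us, the already-cached map wins and our copy
// is deleted.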
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

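// Look an epoch up in the map cache, loading and decoding it from the
// store on a miss; returns a null ref if the full map cannot be read.
// Misses below the cache's lower bound are counted separately so the
// perf counters show how far back callers are reaching.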
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv,
                                vector<pg_log_op_return_item_t> op_returns)
{
  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
  reply->set_reply_versions(v, uv);
  reply->set_op_returns(op_returns);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting. The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}

void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}

void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}

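// Shared helper behind the queue_scrub_*() wrappers below: build a
// scrub-FSM event message of type MSG_TYPE for this PG and enqueue it
// on the op scheduler at the priority implied by with_priority.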
1670 template <class MSG_TYPE>
1671 void OSDService::queue_scrub_event_msg(PG* pg,
1672 Scrub::scrub_prio_t with_priority,
1673 unsigned int qu_priority,
1674 Scrub::act_token_t act_token)
1675 {
1676 const auto epoch = pg->get_osdmap_epoch();
1677 auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
1678 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
1679 << ". Epoch: " << epoch << " token: " << act_token << dendl;
1680
1681 enqueue_back(OpSchedulerItem(
1682 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1683 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1684 }
1685
1686 template <class MSG_TYPE>
1687 void OSDService::queue_scrub_event_msg(PG* pg,
1688 Scrub::scrub_prio_t with_priority)
1689 {
1690 const auto epoch = pg->get_osdmap_epoch();
1691 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1692 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1693
1694 enqueue_back(OpSchedulerItem(
1695 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1696 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1697 }
1698
1699 void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1700 {
1701 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1702 }
1703
1704 void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1705 {
1706 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1707 }
1708
1709 void OSDService::queue_for_rep_scrub(PG* pg,
1710 Scrub::scrub_prio_t with_priority,
1711 unsigned int qu_priority,
1712 Scrub::act_token_t act_token)
1713 {
1714 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
1715 }
1716
1717 void OSDService::queue_for_rep_scrub_resched(PG* pg,
1718 Scrub::scrub_prio_t with_priority,
1719 unsigned int qu_priority,
1720 Scrub::act_token_t act_token)
1721 {
1722 // Resulting scrub event: 'SchedReplica'
1723 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1724 act_token);
1725 }
1726
1727 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1728 {
1729 // Resulting scrub event: 'RemotesReserved'
1730 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1731 }
1732
1733 void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1734 {
1735 // Resulting scrub event: 'ReservationFailure'
1736 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1737 }
1738
1739 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1740 {
1741 // Resulting scrub event: 'InternalSchedScrub'
1742 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1743 }
1744
1745 void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1746 {
1747 // Resulting scrub event: 'ActivePushesUpd'
1748 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1749 }
1750
1751 void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1752 {
1753 // Resulting scrub event: 'SelectedChunkFree'
1754 queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1755 }
1756
1757 void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
1758 {
1759 // Resulting scrub event: 'ChunkIsBusy'
1760 queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1761 }
1762
1763 void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1764 {
1765 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1766 }
1767
1768 void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1769 {
1770 // Resulting scrub event: 'Unblocked'
1771 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1772 }
1773
1774 void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1775 {
1776 // Resulting scrub event: 'DigestUpdate'
1777 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1778 }
1779
1780 void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
1781 {
1782 // Resulting scrub event: 'IntLocalMapDone'
1783 queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
1784 }
1785
1786 void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1787 {
1788 // Resulting scrub event: 'GotReplicas'
1789 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1790 }
1791
1792 void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
1793 {
1794 // Resulting scrub event: 'MapsCompared'
1795 queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
1796 }
1797
1798 void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1799 {
1800 // Resulting scrub event: 'ReplicaPushesUpd'
1801 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1802 }
1803
1804 void OSDService::queue_scrub_is_finished(PG *pg)
1805 {
1806 // Resulting scrub event: 'ScrubFinished'
1807 queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1808 }
1809
1810 void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
1811 {
1812 // Resulting scrub event: 'NextChunk'
1813 queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1814 }
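// All of the queue_*scrub* wrappers in this block funnel through
// queue_scrub_event_msg<T>() (defined earlier in this file), which wraps
// the typed message in an OpSchedulerItem and enqueues it on the op queue;
// the per-wrapper comments name the scrub-FSM event the dequeued item
// delivers to the PG's scrubber. A condensed sketch of that template's
// shape, for orientation only (not a verbatim copy):
//
//   template <class MSG_TYPE>
//   void OSDService::queue_scrub_event_msg(PG* pg,
//                                          Scrub::scrub_prio_t with_priority)
//   {
//     const auto epoch = pg->get_osdmap_epoch();
//     enqueue_back(OpSchedulerItem(
//       std::make_unique<MSG_TYPE>(pg->get_pgid(), epoch),
//       cct->_conf->osd_scrub_cost,
//       pg->scrub_requeue_priority(with_priority),
//       ceph_clock_now(), 0, epoch));
//   }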
1815
1816 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1817 {
1818 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1819 enqueue_back(
1820 OpSchedulerItem(
1821 unique_ptr<OpSchedulerItem::OpQueueable>(
1822 new PGDelete(pgid, e)),
1823 cct->_conf->osd_pg_delete_cost,
1824 cct->_conf->osd_pg_delete_priority,
1825 ceph_clock_now(),
1826 0,
1827 e));
1828 }
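// For orientation: the trailing OpSchedulerItem constructor arguments
// above are, in order, cost, priority, submit time, owner (client
// instance id; 0 for OSD-internal work such as PG deletion), and the map
// epoch at which the item was queued.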
1829
1830 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1831 {
1832 return osd->try_finish_pg_delete(pg, old_pg_num);
1833 }
1834
1835 // ---
1836
1837 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1838 {
1839 std::lock_guard l(merge_lock);
1840 dout(10) << __func__ << " " << pg->pg_id << dendl;
1841 ready_to_merge_source[pg->pg_id.pgid] = version;
1842 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1843 _send_ready_to_merge();
1844 }
1845
1846 void OSDService::set_ready_to_merge_target(PG *pg,
1847 eversion_t version,
1848 epoch_t last_epoch_started,
1849 epoch_t last_epoch_clean)
1850 {
1851 std::lock_guard l(merge_lock);
1852 dout(10) << __func__ << " " << pg->pg_id << dendl;
1853 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1854 make_tuple(version,
1855 last_epoch_started,
1856 last_epoch_clean)));
1857 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1858 _send_ready_to_merge();
1859 }
1860
1861 void OSDService::set_not_ready_to_merge_source(pg_t source)
1862 {
1863 std::lock_guard l(merge_lock);
1864 dout(10) << __func__ << " " << source << dendl;
1865 not_ready_to_merge_source.insert(source);
1866 assert(ready_to_merge_source.count(source) == 0);
1867 _send_ready_to_merge();
1868 }
1869
1870 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1871 {
1872 std::lock_guard l(merge_lock);
1873 dout(10) << __func__ << " " << target << " source " << source << dendl;
1874 not_ready_to_merge_target[target] = source;
1875 assert(ready_to_merge_target.count(target) == 0);
1876 _send_ready_to_merge();
1877 }
1878
1879 void OSDService::send_ready_to_merge()
1880 {
1881 std::lock_guard l(merge_lock);
1882 _send_ready_to_merge();
1883 }
1884
1885 void OSDService::_send_ready_to_merge()
1886 {
1887 dout(20) << __func__
1888 << " ready_to_merge_source " << ready_to_merge_source
1889 << " not_ready_to_merge_source " << not_ready_to_merge_source
1890 << " ready_to_merge_target " << ready_to_merge_target
1891 << " not_ready_to_merge_target " << not_ready_to_merge_target
1892 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1893 << dendl;
1894 for (auto src : not_ready_to_merge_source) {
1895 if (sent_ready_to_merge_source.count(src) == 0) {
1896 monc->send_mon_message(new MOSDPGReadyToMerge(
1897 src,
1898 {}, {}, 0, 0,
1899 false,
1900 osdmap->get_epoch()));
1901 sent_ready_to_merge_source.insert(src);
1902 }
1903 }
1904 for (auto p : not_ready_to_merge_target) {
1905 if (sent_ready_to_merge_source.count(p.second) == 0) {
1906 monc->send_mon_message(new MOSDPGReadyToMerge(
1907 p.second,
1908 {}, {}, 0, 0,
1909 false,
1910 osdmap->get_epoch()));
1911 sent_ready_to_merge_source.insert(p.second);
1912 }
1913 }
1914 for (auto src : ready_to_merge_source) {
1915 if (not_ready_to_merge_source.count(src.first) ||
1916 not_ready_to_merge_target.count(src.first.get_parent())) {
1917 continue;
1918 }
1919 auto p = ready_to_merge_target.find(src.first.get_parent());
1920 if (p != ready_to_merge_target.end() &&
1921 sent_ready_to_merge_source.count(src.first) == 0) {
1922 monc->send_mon_message(new MOSDPGReadyToMerge(
1923 src.first, // source pgid
1924 src.second, // src version
1925 std::get<0>(p->second), // target version
1926 std::get<1>(p->second), // PG's last_epoch_started
1927 std::get<2>(p->second), // PG's last_epoch_clean
1928 true,
1929 osdmap->get_epoch()));
1930 sent_ready_to_merge_source.insert(src.first);
1931 }
1932 }
1933 }
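// _send_ready_to_merge() makes three passes over the merge state, all
// under merge_lock:
//   1) NAK every merge whose source PG has reported itself not ready;
//   2) NAK every merge whose target PG is not ready, keyed by the paired
//      source pgid (the mon tracks readiness per source);
//   3) ACK a merge only once the source and its parent target have both
//      reported ready and neither appears in a not-ready set.
// sent_ready_to_merge_source de-duplicates, so each source pgid is
// reported to the mon at most once until it is cleared or pruned.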
1934
1935 void OSDService::clear_ready_to_merge(PG *pg)
1936 {
1937 std::lock_guard l(merge_lock);
1938 dout(10) << __func__ << " " << pg->pg_id << dendl;
1939 ready_to_merge_source.erase(pg->pg_id.pgid);
1940 ready_to_merge_target.erase(pg->pg_id.pgid);
1941 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1942 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1943 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1944 }
1945
1946 void OSDService::clear_sent_ready_to_merge()
1947 {
1948 std::lock_guard l(merge_lock);
1949 sent_ready_to_merge_source.clear();
1950 }
1951
1952 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1953 {
1954 std::lock_guard l(merge_lock);
1955 auto i = sent_ready_to_merge_source.begin();
1956 while (i != sent_ready_to_merge_source.end()) {
1957 if (!osdmap->pg_exists(*i)) {
1958 dout(10) << __func__ << " " << *i << dendl;
1959 i = sent_ready_to_merge_source.erase(i);
1960 } else {
1961 ++i;
1962 }
1963 }
1964 }
1965
1966 // ---
1967
1968 void OSDService::_queue_for_recovery(
1969 std::pair<epoch_t, PGRef> p,
1970 uint64_t reserved_pushes)
1971 {
1972 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
1973 enqueue_back(
1974 OpSchedulerItem(
1975 unique_ptr<OpSchedulerItem::OpQueueable>(
1976 new PGRecovery(
1977 p.second->get_pgid(), p.first, reserved_pushes)),
1978 cct->_conf->osd_recovery_cost,
1979 cct->_conf->osd_recovery_priority,
1980 ceph_clock_now(),
1981 0,
1982 p.first));
1983 }
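// The PGRecovery item above carries the epoch at which recovery was
// queued together with the number of reserved pushes; the dequeue path
// (OSD::do_recovery) is expected to release those reserved pushes once
// the recovery attempt finishes. The assert documents that callers
// queue while already holding recovery_lock.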
1984
1985 // ====================================================================
1986 // OSD
1987
1988 #undef dout_prefix
1989 #define dout_prefix *_dout
1990
1991 // Commands shared between OSD's console and admin console:
1992 namespace ceph::osd_cmds {
1993
1994 int heap(CephContext& cct,
1995 const cmdmap_t& cmdmap,
1996 std::ostream& outos,
1997 std::ostream& erros);
1998
1999 } // namespace ceph::osd_cmds
2000
2001 int OSD::mkfs(CephContext *cct,
2002 std::unique_ptr<ObjectStore> store,
2003 uuid_d fsid,
2004 int whoami,
2005 string osdspec_affinity)
2006 {
2007 int ret;
2008
2009 OSDSuperblock sb;
2010 bufferlist sbbl;
2011 // if we are fed a uuid for this osd, use it.
2012 store->set_fsid(cct->_conf->osd_uuid);
2013
2014 ret = store->mkfs();
2015 if (ret) {
2016 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2017 << cpp_strerror(ret) << dendl;
2018 return ret;
2019 }
2020
2021 store->set_cache_shards(1); // doesn't matter for mkfs!
2022
2023 ret = store->mount();
2024 if (ret) {
2025 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2026 << cpp_strerror(ret) << dendl;
2027 return ret;
2028 }
2029
2030 auto umount_store = make_scope_guard([&] {
2031 store->umount();
2032 });
2033
2034 ObjectStore::CollectionHandle ch =
2035 store->open_collection(coll_t::meta());
2036 if (ch) {
2037 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2038 if (ret < 0) {
2039 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2040 return ret;
2041 }
2042 /* if we already have a superblock, validate its contents */
2043 dout(0) << " have superblock" << dendl;
2044 auto p = sbbl.cbegin();
2045 decode(sb, p);
2046 if (whoami != sb.whoami) {
2047 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2048 << dendl;
2049 return -EINVAL;
2050 }
2051 if (fsid != sb.cluster_fsid) {
2052 derr << "provided cluster fsid " << fsid
2053 << " != superblock's " << sb.cluster_fsid << dendl;
2054 return -EINVAL;
2055 }
2056 } else {
2057 // create superblock
2058 sb.cluster_fsid = fsid;
2059 sb.osd_fsid = store->get_fsid();
2060 sb.whoami = whoami;
2061 sb.compat_features = get_osd_initial_compat_set();
2062
2063 bufferlist bl;
2064 encode(sb, bl);
2065
2066 ObjectStore::CollectionHandle ch = store->create_new_collection(
2067 coll_t::meta());
2068 ObjectStore::Transaction t;
2069 t.create_collection(coll_t::meta(), 0);
2070 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2071 ret = store->queue_transaction(ch, std::move(t));
2072 if (ret) {
2073 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2074 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2075 return ret;
2076 }
2077 ch->flush();
2078 }
2079
2080 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2081 if (ret) {
2082 derr << "OSD::mkfs: failed to write fsid file: error "
2083 << cpp_strerror(ret) << dendl;
2084 }
2085 return ret;
2086 }
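// mkfs() in brief: apply any provided fsid, ObjectStore::mkfs(), mount,
// then either validate an existing superblock (osd id and cluster fsid
// must match what we were given) or create and persist a fresh one, and
// finally stamp the metadata keys via write_meta(). The scope guard
// ensures the store is unmounted on every exit path.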
2087
2088 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2089 {
2090 char val[80];
2091 int r;
2092
2093 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2094 r = store->write_meta("magic", val);
2095 if (r < 0)
2096 return r;
2097
2098 snprintf(val, sizeof(val), "%d", whoami);
2099 r = store->write_meta("whoami", val);
2100 if (r < 0)
2101 return r;
2102
2103 cluster_fsid.print(val);
2104 r = store->write_meta("ceph_fsid", val);
2105 if (r < 0)
2106 return r;
2107
2108 string key = cct->_conf.get_val<string>("key");
2109 if (key.size()) {
2110 r = store->write_meta("osd_key", key);
2111 if (r < 0)
2112 return r;
2113 } else {
2114 string keyfile = cct->_conf.get_val<string>("keyfile");
2115 if (!keyfile.empty()) {
2116 bufferlist keybl;
2117 string err;
2118 r = keybl.read_file(keyfile.c_str(), &err);
2119 if (r < 0) {
2120 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2121 << err << ": " << cpp_strerror(r) << dendl;
2122 return r;
2123 }
2124 r = store->write_meta("osd_key", keybl.to_str());
2125 if (r < 0)
2126 return r;
2127 }
2128 }
2129 if (!osdspec_affinity.empty()) {
2130 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2131 if (r < 0)
2132 return r;
2133 }
2134
2135 r = store->write_meta("ready", "ready");
2136 if (r < 0)
2137 return r;
2138
2139 return 0;
2140 }
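// write_meta() leaves the keys "magic", "whoami", "ceph_fsid",
// optionally "osd_key" / "osdspec_affinity", and finally "ready" in the
// store's metadata; peek_meta() below reads several of them back (plus
// "fsid" and "require_osd_release", which are written elsewhere during
// mkfs and upgrade), so the two functions must stay in sync.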
2141
2142 int OSD::peek_meta(ObjectStore *store,
2143 std::string *magic,
2144 uuid_d *cluster_fsid,
2145 uuid_d *osd_fsid,
2146 int *whoami,
2147 ceph_release_t *require_osd_release)
2148 {
2149 string val;
2150
2151 int r = store->read_meta("magic", &val);
2152 if (r < 0)
2153 return r;
2154 *magic = val;
2155
2156 r = store->read_meta("whoami", &val);
2157 if (r < 0)
2158 return r;
2159 *whoami = atoi(val.c_str());
2160
2161 r = store->read_meta("ceph_fsid", &val);
2162 if (r < 0)
2163 return r;
2164 r = cluster_fsid->parse(val.c_str());  // parse() returns true on success
2165 if (!r)
2166 return -EINVAL;
2167
2168 r = store->read_meta("fsid", &val);
2169 if (r < 0) {
2170 *osd_fsid = uuid_d();
2171 } else {
2172 r = osd_fsid->parse(val.c_str());
2173 if (!r)
2174 return -EINVAL;
2175 }
2176
2177 r = store->read_meta("require_osd_release", &val);
2178 if (r >= 0) {
2179 *require_osd_release = ceph_release_from_name(val);
2180 }
2181
2182 return 0;
2183 }
2184
2185
2186 #undef dout_prefix
2187 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2188
2189 // cons/des
2190
2191 OSD::OSD(CephContext *cct_,
2192 std::unique_ptr<ObjectStore> store_,
2193 int id,
2194 Messenger *internal_messenger,
2195 Messenger *external_messenger,
2196 Messenger *hb_client_front,
2197 Messenger *hb_client_back,
2198 Messenger *hb_front_serverm,
2199 Messenger *hb_back_serverm,
2200 Messenger *osdc_messenger,
2201 MonClient *mc,
2202 const std::string &dev, const std::string &jdev,
2203 ceph::async::io_context_pool& poolctx) :
2204 Dispatcher(cct_),
2205 tick_timer(cct, osd_lock),
2206 tick_timer_without_osd_lock(cct, tick_timer_lock),
2207 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
2208 cluster_messenger(internal_messenger),
2209 client_messenger(external_messenger),
2210 objecter_messenger(osdc_messenger),
2211 monc(mc),
2212 mgrc(cct_, client_messenger, &mc->monmap),
2213 logger(create_logger()),
2214 recoverystate_perf(create_recoverystate_perf()),
2215 store(std::move(store_)),
2216 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2217 clog(log_client.create_channel()),
2218 whoami(id),
2219 dev_path(dev), journal_path(jdev),
2220 store_is_rotational(store->is_rotational()),
2221 trace_endpoint("0.0.0.0", 0, "osd"),
2222 asok_hook(NULL),
2223 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2224 "osd_pg_epoch_max_lag_factor")),
2225 osd_compat(get_osd_compat_set()),
2226 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
2227 get_num_op_threads()),
2228 heartbeat_stop(false),
2229 heartbeat_need_update(true),
2230 hb_front_client_messenger(hb_client_front),
2231 hb_back_client_messenger(hb_client_back),
2232 hb_front_server_messenger(hb_front_serverm),
2233 hb_back_server_messenger(hb_back_serverm),
2234 daily_loadavg(0.0),
2235 heartbeat_thread(this),
2236 heartbeat_dispatcher(this),
2237 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2238 cct->_conf->osd_num_op_tracker_shard),
2239 test_ops_hook(NULL),
2240 op_shardedwq(
2241 this,
2242 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2243 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
2244 &osd_op_tp),
2245 last_pg_create_epoch(0),
2246 boot_finisher(cct),
2247 up_thru_wanted(0),
2248 requested_full_first(0),
2249 requested_full_last(0),
2250 service(this, poolctx)
2251 {
2252
2253 if (!gss_ktfile_client.empty()) {
2254 // Assert we can export environment variable
2255 /*
2256 The default client keytab is used, if it is present and readable,
2257 to automatically obtain initial credentials for GSSAPI client
2258 applications. The principal name of the first entry in the client
2259 keytab is used by default when obtaining initial credentials. The keytab location is resolved in this order:
2260 1. The KRB5_CLIENT_KTNAME environment variable.
2261 2. The default_client_keytab_name profile variable in [libdefaults].
2262 3. The hardcoded default, DEFCKTNAME.
2263 */
2264 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2265 gss_ktfile_client.c_str(), 1));
2266 ceph_assert(set_result == 0);
2267 }
2268
2269 monc->set_messenger(client_messenger);
2270 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2271 cct->_conf->osd_op_log_threshold);
2272 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2273 cct->_conf->osd_op_history_duration);
2274 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2275 cct->_conf->osd_op_history_slow_op_threshold);
2276 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
2277 #ifdef WITH_BLKIN
2278 std::stringstream ss;
2279 ss << "osd." << whoami;
2280 trace_endpoint.copy_name(ss.str());
2281 #endif
2282
2283 // initialize shards
2284 num_shards = get_num_op_shards();
2285 for (uint32_t i = 0; i < num_shards; i++) {
2286 OSDShard *one_shard = new OSDShard(
2287 i,
2288 cct,
2289 this);
2290 shards.push_back(one_shard);
2291 }
2292 }
2293
2294 OSD::~OSD()
2295 {
2296 while (!shards.empty()) {
2297 delete shards.back();
2298 shards.pop_back();
2299 }
2300 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2301 cct->get_perfcounters_collection()->remove(logger);
2302 delete recoverystate_perf;
2303 delete logger;
2304 }
2305
2306 double OSD::get_tick_interval() const
2307 {
2308 // vary +/- 5% to avoid scrub scheduling livelocks
2309 constexpr auto delta = 0.05;
2310 return (OSD_TICK_INTERVAL *
2311 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2312 }
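// Worked example: with the usual OSD_TICK_INTERVAL of 1 second, the
// returned interval is uniformly distributed in [0.95, 1.05] s, so ticks
// on different OSDs (and successive ticks on one OSD) drift apart rather
// than locking into step with scrub scheduling.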
2313
2314 void OSD::handle_signal(int signum)
2315 {
2316 ceph_assert(signum == SIGINT || signum == SIGTERM);
2317 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2318 shutdown();
2319 }
2320
2321 int OSD::pre_init()
2322 {
2323 std::lock_guard lock(osd_lock);
2324 if (is_stopping())
2325 return 0;
2326
2327 if (store->test_mount_in_use()) {
2328 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2329 << "currently in use. (Is ceph-osd already running?)" << dendl;
2330 return -EBUSY;
2331 }
2332
2333 cct->_conf.add_observer(this);
2334 return 0;
2335 }
2336
2337 int OSD::set_numa_affinity()
2338 {
2339 // storage numa node
2340 int store_node = -1;
2341 store->get_numa_node(&store_node, nullptr, nullptr);
2342 if (store_node >= 0) {
2343 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2344 }
2345
2346 // check network numa node(s)
2347 int front_node = -1, back_node = -1;
2348 string front_iface = pick_iface(
2349 cct,
2350 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2351 string back_iface = pick_iface(
2352 cct,
2353 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2354 int r = get_iface_numa_node(front_iface, &front_node);
2355 if (r >= 0 && front_node >= 0) {
2356 dout(1) << __func__ << " public network " << front_iface << " numa node "
2357 << front_node << dendl;
2358 r = get_iface_numa_node(back_iface, &back_node);
2359 if (r >= 0 && back_node >= 0) {
2360 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2361 << back_node << dendl;
2362 if (front_node == back_node &&
2363 front_node == store_node) {
2364 dout(1) << " objectstore and network numa nodes all match" << dendl;
2365 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2366 numa_node = front_node;
2367 }
2368 } else if (front_node != back_node) {
2369 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2370 << dendl;
2371 } else {
2372 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2373 << dendl;
2374 }
2375 } else if (back_node == -2) {
2376 dout(1) << __func__ << " cluster network " << back_iface
2377 << " ports numa nodes do not match" << dendl;
2378 } else {
2379 derr << __func__ << " unable to identify cluster interface '" << back_iface
2380 << "' numa node: " << cpp_strerror(r) << dendl;
2381 }
2382 } else if (front_node == -2) {
2383 dout(1) << __func__ << " public network " << front_iface
2384 << " ports numa nodes do not match" << dendl;
2385 } else {
2386 derr << __func__ << " unable to identify public interface '" << front_iface
2387 << "' numa node: " << cpp_strerror(r) << dendl;
2388 }
2389 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2390 // this takes precedence over the automagic logic above
2391 numa_node = node;
2392 }
2393 if (numa_node >= 0) {
2394 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2395 if (r < 0) {
2396 dout(1) << __func__ << " unable to determine numa node " << numa_node
2397 << " CPUs" << dendl;
2398 numa_node = -1;
2399 } else {
2400 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2401 << " cpus "
2402 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2403 << dendl;
2404 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
2405 if (r < 0) {
2406 r = -errno;
2407 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2408 << dendl;
2409 numa_node = -1;
2410 }
2411 }
2412 } else {
2413 dout(1) << __func__ << " not setting numa affinity" << dendl;
2414 }
2415 return 0;
2416 }
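// Outcome summary: numa_node is only chosen automatically when the
// objectstore, public, and cluster interfaces all resolve to the same
// node *and* osd_numa_auto_affinity is true; an explicit osd_numa_node
// always takes precedence. A failure to pin threads demotes numa_node
// back to -1 but is deliberately non-fatal: the function returns 0
// regardless.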
2417
2418 // asok
2419
2420 class OSDSocketHook : public AdminSocketHook {
2421 OSD *osd;
2422 public:
2423 explicit OSDSocketHook(OSD *o) : osd(o) {}
2424 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2425 Formatter *f,
2426 std::ostream& ss,
2427 bufferlist& out) override {
2428 ceph_abort("should use async hook");
2429 }
2430 void call_async(
2431 std::string_view prefix,
2432 const cmdmap_t& cmdmap,
2433 Formatter *f,
2434 const bufferlist& inbl,
2435 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2436 try {
2437 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2438 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2439 bufferlist empty;
2440 on_finish(-EINVAL, e.what(), empty);
2441 }
2442 }
2443 };
2444
2445 std::set<int64_t> OSD::get_mapped_pools()
2446 {
2447 std::set<int64_t> pools;
2448 std::vector<spg_t> pgids;
2449 _get_pgids(&pgids);
2450 for (const auto &pgid : pgids) {
2451 pools.insert(pgid.pool());
2452 }
2453 return pools;
2454 }
2455
2456 OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2457 stringstream& ss,
2458 bool only_primary)
2459 {
2460 string pgidstr;
2461 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2462 ss << "no pgid specified";
2463 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2464 }
2465
2466 pg_t pgid;
2467 if (!pgid.parse(pgidstr.c_str())) {
2468 ss << "couldn't parse pgid '" << pgidstr << "'";
2469 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2470 }
2471
2472 spg_t pcand;
2473 PGRef pg;
2474 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2475 if (pg->is_primary() || !only_primary) {
2476 return OSD::PGRefOrError{pg, 0};
2477 }
2478
2479 ss << "not primary for pgid " << pgid;
2480 pg->unlock();
2481 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2482 } else {
2483 ss << "i don't have pgid " << pgid;
2484 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2485 }
2486 }
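// Locking contract: on success, locate_asok_target() returns the PG
// *locked*; the caller (asok_route_to_pg() below) must unlock it on
// every path, including when do_command() throws.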
2487
2488 // note that the cmdmap is explicitly copied into asok_route_to_pg()
2489 int OSD::asok_route_to_pg(
2490 bool only_primary,
2491 std::string_view prefix,
2492 cmdmap_t cmdmap,
2493 Formatter* f,
2494 stringstream& ss,
2495 const bufferlist& inbl,
2496 bufferlist& outbl,
2497 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2498 {
2499 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2500
2501 if (!target_pg.has_value()) {
2502 // 'ss' and 'ret' already contain the error information
2503 on_finish(ret, ss.str(), outbl);
2504 return ret;
2505 }
2506
2507 // the PG was locked by locate_asok_target()
2508 try {
2509 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2510 (*target_pg)->unlock();
2511 return 0; // the pg handler calls on_finish directly
2512 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2513 (*target_pg)->unlock();
2514 ss << e.what();
2515 on_finish(-EINVAL, ss.str(), outbl);  // report the same error we return
2516 return -EINVAL;
2517 }
2518 }
2519
2520 void OSD::asok_command(
2521 std::string_view prefix, const cmdmap_t& cmdmap,
2522 Formatter *f,
2523 const bufferlist& inbl,
2524 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2525 {
2526 int ret = 0;
2527 stringstream ss; // stderr error message stream
2528 bufferlist outbl; // if empty at end, we'll dump formatter as output
2529
2530 // --- PG commands are routed here to PG::do_command ---
2531 if (prefix == "pg" ||
2532 prefix == "query" ||
2533 prefix == "mark_unfound_lost" ||
2534 prefix == "list_unfound" ||
2535 prefix == "scrub" ||
2536 prefix == "deep_scrub"
2537 ) {
2538 string pgidstr;
2539 pg_t pgid;
2540 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2541 ss << "no pgid specified";
2542 ret = -EINVAL;
2543 goto out;
2544 }
2545 if (!pgid.parse(pgidstr.c_str())) {
2546 ss << "couldn't parse pgid '" << pgidstr << "'";
2547 ret = -EINVAL;
2548 goto out;
2549 }
2550 spg_t pcand;
2551 PGRef pg;
2552 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2553 (pg = _lookup_lock_pg(pcand))) {
2554 if (pg->is_primary()) {
2555 cmdmap_t new_cmdmap = cmdmap;
2556 try {
2557 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2558 pg->unlock();
2559 return; // the pg handler calls on_finish directly
2560 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2561 pg->unlock();
2562 ss << e.what();
2563 ret = -EINVAL;
2564 goto out;
2565 }
2566 } else {
2567 ss << "not primary for pgid " << pgid;
2568 // do not reply; they will get newer maps and realize they
2569 // need to resend.
2570 pg->unlock();
2571 ret = -EAGAIN;
2572 goto out;
2573 }
2574 } else {
2575 ss << "i don't have pgid " << pgid;
2576 ret = -ENOENT;
2577 }
2578 }
2579
2580 // --- PG commands that will be answered even if !primary ---
2581
2582 else if (prefix == "scrubdebug") {
2583 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2584 return;
2585 }
2586
2587 // --- OSD commands follow ---
2588
2589 else if (prefix == "status") {
2590 lock_guard l(osd_lock);
2591 f->open_object_section("status");
2592 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2593 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2594 f->dump_unsigned("whoami", superblock.whoami);
2595 f->dump_string("state", get_state_name(get_state()));
2596 f->dump_unsigned("oldest_map", superblock.oldest_map);
2597 f->dump_unsigned("newest_map", superblock.newest_map);
2598 f->dump_unsigned("num_pgs", num_pgs);
2599 f->close_section();
2600 } else if (prefix == "flush_journal") {
2601 store->flush_journal();
2602 } else if (prefix == "dump_ops_in_flight" ||
2603 prefix == "ops" ||
2604 prefix == "dump_blocked_ops" ||
2605 prefix == "dump_historic_ops" ||
2606 prefix == "dump_historic_ops_by_duration" ||
2607 prefix == "dump_historic_slow_ops") {
2608
2609 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2610 not even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2611 will start to track new ops received afterwards.";
2612
2613 set<string> filters;
2614 vector<string> filter_str;
2615 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2616 copy(filter_str.begin(), filter_str.end(),
2617 inserter(filters, filters.end()));
2618 }
2619
2620 if (prefix == "dump_ops_in_flight" ||
2621 prefix == "ops") {
2622 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2623 ss << error_str;
2624 ret = -EINVAL;
2625 goto out;
2626 }
2627 }
2628 if (prefix == "dump_blocked_ops") {
2629 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2630 ss << error_str;
2631 ret = -EINVAL;
2632 goto out;
2633 }
2634 }
2635 if (prefix == "dump_historic_ops") {
2636 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2637 ss << error_str;
2638 ret = -EINVAL;
2639 goto out;
2640 }
2641 }
2642 if (prefix == "dump_historic_ops_by_duration") {
2643 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2644 ss << error_str;
2645 ret = -EINVAL;
2646 goto out;
2647 }
2648 }
2649 if (prefix == "dump_historic_slow_ops") {
2650 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2651 ss << error_str;
2652 ret = -EINVAL;
2653 goto out;
2654 }
2655 }
2656 } else if (prefix == "dump_op_pq_state") {
2657 f->open_object_section("pq");
2658 op_shardedwq.dump(f);
2659 f->close_section();
2660 } else if (prefix == "dump_blocklist") {
2661 list<pair<entity_addr_t,utime_t> > bl;
2662 list<pair<entity_addr_t,utime_t> > rbl;
2663 OSDMapRef curmap = service.get_osdmap();
2664 curmap->get_blocklist(&bl, &rbl);
2665
2666 f->open_array_section("blocklist");
2667 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2668 it != bl.end(); ++it) {
2669 f->open_object_section("entry");
2670 f->open_object_section("entity_addr_t");
2671 it->first.dump(f);
2672 f->close_section(); //entity_addr_t
2673 it->second.localtime(f->dump_stream("expire_time"));
2674 f->close_section(); //entry
2675 }
2676 f->close_section(); //blocklist
2677 f->open_array_section("range_blocklist");
2678 for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
2679 it != rbl.end(); ++it) {
2680 f->open_object_section("entry");
2681 f->open_object_section("entity_addr_t");
2682 it->first.dump(f);
2683 f->close_section(); //entity_addr_t
2684 it->second.localtime(f->dump_stream("expire_time"));
2685 f->close_section(); //entry
2686 }
2687 f->close_section(); //range_blocklist
2688 } else if (prefix == "dump_watchers") {
2689 list<obj_watch_item_t> watchers;
2690 // scan pg's
2691 vector<PGRef> pgs;
2692 _get_pgs(&pgs);
2693 for (auto& pg : pgs) {
2694 list<obj_watch_item_t> pg_watchers;
2695 pg->get_watchers(&pg_watchers);
2696 watchers.splice(watchers.end(), pg_watchers);
2697 }
2698
2699 f->open_array_section("watchers");
2700 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2701 it != watchers.end(); ++it) {
2702
2703 f->open_object_section("watch");
2704
2705 f->dump_string("namespace", it->obj.nspace);
2706 f->dump_string("object", it->obj.oid.name);
2707
2708 f->open_object_section("entity_name");
2709 it->wi.name.dump(f);
2710 f->close_section(); //entity_name_t
2711
2712 f->dump_unsigned("cookie", it->wi.cookie);
2713 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2714
2715 f->open_object_section("entity_addr_t");
2716 it->wi.addr.dump(f);
2717 f->close_section(); //entity_addr_t
2718
2719 f->close_section(); //watch
2720 }
2721
2722 f->close_section(); //watchers
2723 } else if (prefix == "dump_recovery_reservations") {
2724 f->open_object_section("reservations");
2725 f->open_object_section("local_reservations");
2726 service.local_reserver.dump(f);
2727 f->close_section();
2728 f->open_object_section("remote_reservations");
2729 service.remote_reserver.dump(f);
2730 f->close_section();
2731 f->close_section();
2732 } else if (prefix == "dump_scrub_reservations") {
2733 f->open_object_section("scrub_reservations");
2734 service.get_scrub_services().dump_scrub_reservations(f);
2735 f->close_section();
2736 } else if (prefix == "get_latest_osdmap") {
2737 get_latest_osdmap();
2738 } else if (prefix == "set_heap_property") {
2739 string property;
2740 int64_t value = 0;
2741 string error;
2742 bool success = false;
2743 if (!cmd_getval(cmdmap, "property", property)) {
2744 error = "unable to get property";
2745 success = false;
2746 } else if (!cmd_getval(cmdmap, "value", value)) {
2747 error = "unable to get value";
2748 success = false;
2749 } else if (value < 0) {
2750 error = "negative value not allowed";
2751 success = false;
2752 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2753 error = "invalid property";
2754 success = false;
2755 } else {
2756 success = true;
2757 }
2758 f->open_object_section("result");
2759 f->dump_string("error", error);
2760 f->dump_bool("success", success);
2761 f->close_section();
2762 } else if (prefix == "get_heap_property") {
2763 string property;
2764 size_t value = 0;
2765 string error;
2766 bool success = false;
2767 if (!cmd_getval(cmdmap, "property", property)) {
2768 error = "unable to get property";
2769 success = false;
2770 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2771 error = "invalid property";
2772 success = false;
2773 } else {
2774 success = true;
2775 }
2776 f->open_object_section("result");
2777 f->dump_string("error", error);
2778 f->dump_bool("success", success);
2779 f->dump_int("value", value);
2780 f->close_section();
2781 } else if (prefix == "dump_objectstore_kv_stats") {
2782 store->get_db_statistics(f);
2783 } else if (prefix == "dump_scrubs") {
2784 service.get_scrub_services().dump_scrubs(f);
2785 } else if (prefix == "calc_objectstore_db_histogram") {
2786 store->generate_db_histogram(f);
2787 } else if (prefix == "flush_store_cache") {
2788 store->flush_cache(&ss);
2789 } else if (prefix == "dump_pgstate_history") {
2790 f->open_object_section("pgstate_history");
2791 f->open_array_section("pgs");
2792 vector<PGRef> pgs;
2793 _get_pgs(&pgs);
2794 for (auto& pg : pgs) {
2795 f->open_object_section("pg");
2796 f->dump_stream("pg") << pg->pg_id;
2797 f->dump_string("currently", pg->get_current_state());
2798 pg->dump_pgstate_history(f);
2799 f->close_section();
2800 }
2801 f->close_section();
2802 f->close_section();
2803 } else if (prefix == "compact") {
2804 dout(1) << "triggering manual compaction" << dendl;
2805 auto start = ceph::coarse_mono_clock::now();
2806 store->compact();
2807 auto end = ceph::coarse_mono_clock::now();
2808 double duration = std::chrono::duration<double>(end-start).count();
2809 dout(1) << "finished manual compaction in "
2810 << duration
2811 << " seconds" << dendl;
2812 f->open_object_section("compact_result");
2813 f->dump_float("elapsed_time", duration);
2814 f->close_section();
2815 } else if (prefix == "get_mapped_pools") {
2816 f->open_array_section("mapped_pools");
2817 set<int64_t> poollist = get_mapped_pools();
2818 for (auto pool : poollist) {
2819 f->dump_int("pool_id", pool);
2820 }
2821 f->close_section();
2822 } else if (prefix == "smart") {
2823 string devid;
2824 cmd_getval(cmdmap, "devid", devid);
2825 ostringstream out;
2826 probe_smart(devid, out);
2827 outbl.append(out.str());
2828 } else if (prefix == "list_devices") {
2829 set<string> devnames;
2830 store->get_devices(&devnames);
2831 f->open_array_section("list_devices");
2832 for (auto dev : devnames) {
2833 if (dev.find("dm-") == 0) {
2834 continue;
2835 }
2836 string err;
2837 f->open_object_section("device");
2838 f->dump_string("device", "/dev/" + dev);
2839 f->dump_string("device_id", get_device_id(dev, &err));
2840 f->close_section();
2841 }
2842 f->close_section();
2843 } else if (prefix == "send_beacon") {
2844 lock_guard l(osd_lock);
2845 if (is_active()) {
2846 send_beacon(ceph::coarse_mono_clock::now());
2847 }
2848 }
2849
2850 else if (prefix == "cluster_log") {
2851 vector<string> msg;
2852 cmd_getval(cmdmap, "message", msg);
2853 if (msg.empty()) {
2854 ret = -EINVAL;
2855 ss << "ignoring empty log message";
2856 goto out;
2857 }
2858 string message = msg.front();
2859 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2860 message += " " + *a;
2861 string lvl;
2862 cmd_getval(cmdmap, "level", lvl);
2863 clog_type level = string_to_clog_type(lvl);
2864 if (level < 0) {
2865 ret = -EINVAL;
2866 ss << "unknown level '" << lvl << "'";
2867 goto out;
2868 }
2869 clog->do_log(level, message);
2870 }
2871
2872 else if (prefix == "bench") {
2873 // default count 1G, size 4MB
2874 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2875 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2876 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2877 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
2878 double elapsed = 0.0;
2879
2880 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
2881 if (ret != 0) {
2882 goto out;
2883 }
2884
2885 double rate = count / elapsed;
2886 double iops = rate / bsize;
2887 f->open_object_section("osd_bench_results");
2888 f->dump_int("bytes_written", count);
2889 f->dump_int("blocksize", bsize);
2890 f->dump_float("elapsed_sec", elapsed);
2891 f->dump_float("bytes_per_sec", rate);
2892 f->dump_float("iops", iops);
2893 f->close_section();
2894 }
2895
2896 else if (prefix == "flush_pg_stats") {
2897 mgrc.send_pgstats();
2898 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2899 }
2900
2901 else if (prefix == "heap") {
2902 std::stringstream outss;
2903 ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
2904 outbl.append(outss);
2905 }
2906
2907 else if (prefix == "debug dump_missing") {
2908 f->open_array_section("pgs");
2909 vector<PGRef> pgs;
2910 _get_pgs(&pgs);
2911 for (auto& pg : pgs) {
2912 string s = stringify(pg->pg_id);
2913 f->open_array_section(s.c_str());
2914 pg->lock();
2915 pg->dump_missing(f);
2916 pg->unlock();
2917 f->close_section();
2918 }
2919 f->close_section();
2920 }
2921
2922 else if (prefix == "debug kick_recovery_wq") {
2923 int64_t delay;
2924 cmd_getval(cmdmap, "delay", delay);
2925 ostringstream oss;
2926 oss << delay;
2927 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2928 if (ret != 0) {
2929 ss << "kick_recovery_wq: error setting "
2930 << "osd_recovery_delay_start to '" << delay << "': error "
2931 << ret;
2932 goto out;
2933 }
2934 cct->_conf.apply_changes(nullptr);
2935 ss << "kicking recovery queue. set osd_recovery_delay_start "
2936 << "to " << cct->_conf->osd_recovery_delay_start;
2937 }
2938
2939 else if (prefix == "cpu_profiler") {
2940 ostringstream ds;
2941 string arg;
2942 cmd_getval(cmdmap, "arg", arg);
2943 vector<string> argvec;
2944 get_str_vec(arg, argvec);
2945 cpu_profiler_handle_command(argvec, ds);
2946 outbl.append(ds.str());
2947 }
2948
2949 else if (prefix == "dump_pg_recovery_stats") {
2950 lock_guard l(osd_lock);
2951 pg_recovery_stats.dump_formatted(f);
2952 }
2953
2954 else if (prefix == "reset_pg_recovery_stats") {
2955 lock_guard l(osd_lock);
2956 pg_recovery_stats.reset();
2957 }
2958
2959 else if (prefix == "perf histogram dump") {
2960 std::string logger;
2961 std::string counter;
2962 cmd_getval(cmdmap, "logger", logger);
2963 cmd_getval(cmdmap, "counter", counter);
2964 cct->get_perfcounters_collection()->dump_formatted_histograms(
2965 f, false, logger, counter);
2966 }
2967
2968 else if (prefix == "cache drop") {
2969 lock_guard l(osd_lock);
2970 dout(20) << "clearing all caches" << dendl;
2971 // Clear the objectstore's cache - onode and buffer for Bluestore,
2972 // system's pagecache for Filestore
2973 ret = store->flush_cache(&ss);
2974 if (ret < 0) {
2975 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2976 goto out;
2977 }
2978 // Clear the objectcontext cache (per PG)
2979 vector<PGRef> pgs;
2980 _get_pgs(&pgs);
2981 for (auto& pg: pgs) {
2982 pg->clear_cache();
2983 }
2984 }
2985
2986 else if (prefix == "cache status") {
2987 lock_guard l(osd_lock);
2988 int obj_ctx_count = 0;
2989 vector<PGRef> pgs;
2990 _get_pgs(&pgs);
2991 for (auto& pg: pgs) {
2992 obj_ctx_count += pg->get_cache_obj_count();
2993 }
2994 f->open_object_section("cache_status");
2995 f->dump_int("object_ctx", obj_ctx_count);
2996 store->dump_cache_stats(f);
2997 f->close_section();
2998 }
2999
3000 else if (prefix == "scrub_purged_snaps") {
3001 lock_guard l(osd_lock);
3002 scrub_purged_snaps();
3003 }
3004
3005 else if (prefix == "dump_osd_network") {
3006 lock_guard l(osd_lock);
3007 int64_t value = 0;
3008 if (!(cmd_getval(cmdmap, "value", value))) {
3009 // Convert milliseconds to microseconds
3010 value = static_cast<double>(g_conf().get_val<double>(
3011 "mon_warn_on_slow_ping_time")) * 1000;
3012 if (value == 0) {
3013 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3014 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3015 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3016 }
3017 } else {
3018 // Convert user input to microseconds
3019 value *= 1000;
3020 }
3021 if (value < 0) value = 0;
3022
3023 struct osd_ping_time_t {
3024 uint32_t pingtime;
3025 int to;
3026 bool back;
3027 std::array<uint32_t,3> times;
3028 std::array<uint32_t,3> min;
3029 std::array<uint32_t,3> max;
3030 uint32_t last;
3031 uint32_t last_update;
3032
3033 bool operator<(const osd_ping_time_t& rhs) const {
3034 if (pingtime < rhs.pingtime)
3035 return true;
3036 if (pingtime > rhs.pingtime)
3037 return false;
3038 if (to < rhs.to)
3039 return true;
3040 if (to > rhs.to)
3041 return false;
3042 return back;
3043 }
3044 };
3045
3046 set<osd_ping_time_t> sorted;
3047 // Get pingtimes under lock and not on the stack
3048 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3049 service.get_hb_pingtime(pingtimes);
3050 for (auto j : *pingtimes) {
3051 if (j.second.last_update == 0)
3052 continue;
3053 osd_ping_time_t item;
3054 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3055 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3056 if (item.pingtime >= value) {
3057 item.to = j.first;
3058 item.times[0] = j.second.back_pingtime[0];
3059 item.times[1] = j.second.back_pingtime[1];
3060 item.times[2] = j.second.back_pingtime[2];
3061 item.min[0] = j.second.back_min[0];
3062 item.min[1] = j.second.back_min[1];
3063 item.min[2] = j.second.back_min[2];
3064 item.max[0] = j.second.back_max[0];
3065 item.max[1] = j.second.back_max[1];
3066 item.max[2] = j.second.back_max[2];
3067 item.last = j.second.back_last;
3068 item.back = true;
3069 item.last_update = j.second.last_update;
3070 sorted.emplace(item);
3071 }
3072 if (j.second.front_last == 0)
3073 continue;
3074 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3075 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3076 if (item.pingtime >= value) {
3077 item.to = j.first;
3078 item.times[0] = j.second.front_pingtime[0];
3079 item.times[1] = j.second.front_pingtime[1];
3080 item.times[2] = j.second.front_pingtime[2];
3081 item.min[0] = j.second.front_min[0];
3082 item.min[1] = j.second.front_min[1];
3083 item.min[2] = j.second.front_min[2];
3084 item.max[0] = j.second.front_max[0];
3085 item.max[1] = j.second.front_max[1];
3086 item.max[2] = j.second.front_max[2];
3087 item.last = j.second.front_last;
3088 item.last_update = j.second.last_update;
3089 item.back = false;
3090 sorted.emplace(item);
3091 }
3092 }
3093 delete pingtimes;
3095 // Network ping times (1min 5min 15min)
3096 f->open_object_section("network_ping_times");
3097 f->dump_int("threshold", value / 1000);
3098 f->open_array_section("entries");
3099 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3100 ceph_assert(sitem.pingtime >= value);
3101 f->open_object_section("entry");
3102
3103 const time_t lu(sitem.last_update);
3104 char buffer[26];
3105 string lustr(ctime_r(&lu, buffer));
3106 lustr.pop_back(); // Remove trailing \n
3107 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3108 f->dump_string("last update", lustr);
3109 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3110 f->dump_int("from osd", whoami);
3111 f->dump_int("to osd", sitem.to);
3112 f->dump_string("interface", (sitem.back ? "back" : "front"));
3113 f->open_object_section("average");
3114 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3115 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3116 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3117 f->close_section(); // average
3118 f->open_object_section("min");
3119 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
3120 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
3121 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
3122 f->close_section(); // min
3123 f->open_object_section("max");
3124 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3125 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3126 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3127 f->close_section(); // max
3128 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3129 f->close_section(); // entry
3130 }
3131 f->close_section(); // entries
3132 f->close_section(); // network_ping_times
3133 } else if (prefix == "dump_pool_statfs") {
3134 lock_guard l(osd_lock);
3135
3136 int64_t p = 0;
3137 if (!(cmd_getval(cmdmap, "poolid", p))) {
3138 ss << "Error dumping pool statfs: no poolid provided";
3139 ret = -EINVAL;
3140 goto out;
3141 }
3142
3143 store_statfs_t st;
3144 bool per_pool_omap_stats = false;
3145
3146 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3147 if (ret < 0) {
3148 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3149 goto out;
3150 } else {
3151 ss << "dumping pool statfs...";
3152 f->open_object_section("pool_statfs");
3153 f->dump_int("poolid", p);
3154 st.dump(f);
3155 f->close_section();
3156 }
3157 } else {
3158 ceph_abort_msg("broken asok registration");
3159 }
3160
3161 out:
3162 on_finish(ret, ss.str(), outbl);
3163 }
3164
3165 int OSD::run_osd_bench_test(
3166 int64_t count,
3167 int64_t bsize,
3168 int64_t osize,
3169 int64_t onum,
3170 double *elapsed,
3171 ostream &ss)
3172 {
3173 int ret = 0;
3174 uint32_t duration = cct->_conf->osd_bench_duration;
3175
3176 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
3177 // let us limit the block size because the next checks rely on it
3178 // having a sane value. If we allow any block size to be set things
3179 // can still go sideways.
3180 ss << "block 'size' values are capped at "
3181 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
3182 << " a higher value, please adjust 'osd_bench_max_block_size'";
3183 ret = -EINVAL;
3184 return ret;
3185 } else if (bsize < (int64_t) (1 << 20)) {
3186 // entering the realm of small block sizes.
3187 // limit the count to a sane value, assuming a configurable amount of
3188 // IOPS and duration, so that the OSD doesn't get hung up on this,
3189 // preventing timeouts from going off
3190 int64_t max_count =
3191 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
3192 if (count > max_count) {
3193 ss << "'count' values greater than " << max_count
3194 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3195 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
3196 << " for " << duration << " seconds,"
3197 << " can cause ill effects on osd. "
3198 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3199 << " value if you wish to use a higher 'count'.";
3200 ret = -EINVAL;
3201 return ret;
3202 }
3203 } else {
3204 // 1MB block sizes are big enough so that we get more stuff done.
3205 // However, to avoid the osd from getting hung on this and having
3206 // timers being triggered, we are going to limit the count assuming
3207 // a configurable throughput and duration.
3208 // NOTE: max_count is the total amount of bytes that we believe we
3209 // will be able to write during 'duration' for the given
3210 // throughput. The block size hardly impacts this unless it's
3211 // way too big. Given we already check how big the block size
3212 // is, it's safe to assume everything will check out.
3213 int64_t max_count =
3214 cct->_conf->osd_bench_large_size_max_throughput * duration;
3215 if (count > max_count) {
3216 ss << "'count' values greater than " << max_count
3217 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3218 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
3219 << " for " << duration << " seconds,"
3220 << " can cause ill effects on osd. "
3221 << " Please adjust 'osd_bench_large_size_max_throughput'"
3222 << " with a higher value if you wish to use a higher 'count'.";
3223 ret = -EINVAL;
3224 return ret;
3225 }
3226 }
3227
3228 if (osize && bsize > osize) {
3229 bsize = osize;
3230 }
3231
3232 dout(1) << " bench count " << count
3233 << " bsize " << byte_u_t(bsize) << dendl;
3234
3235 ObjectStore::Transaction cleanupt;
3236
3237 if (osize && onum) {
3238 bufferlist bl;
3239 bufferptr bp(osize);
3240 memset(bp.c_str(), 'a', bp.length());
3241 bl.push_back(std::move(bp));
3242 bl.rebuild_page_aligned();
3243 for (int i=0; i<onum; ++i) {
3244 char nm[30];
3245 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
3246 object_t oid(nm);
3247 hobject_t soid(sobject_t(oid, 0));
3248 ObjectStore::Transaction t;
3249 t.write(coll_t::meta(), ghobject_t(soid), 0, osize, bl);
3250 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3251 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3252 }
3253 }
3254
3255 bufferlist bl;
3256 bufferptr bp(bsize);
3257 memset(bp.c_str(), 'a', bp.length());
3258 bl.push_back(std::move(bp));
3259 bl.rebuild_page_aligned();
3260
3261 {
3262 C_SaferCond waiter;
3263 if (!service.meta_ch->flush_commit(&waiter)) {
3264 waiter.wait();
3265 }
3266 }
3267
3268 utime_t start = ceph_clock_now();
3269 for (int64_t pos = 0; pos < count; pos += bsize) {
3270 char nm[30];
3271 unsigned offset = 0;
3272 if (onum && osize) {
3273 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
3274 offset = rand() % (osize / bsize) * bsize;
3275 } else {
3276 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
3277 }
3278 object_t oid(nm);
3279 hobject_t soid(sobject_t(oid, 0));
3280 ObjectStore::Transaction t;
3281 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
3282 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3283 if (!onum || !osize) {
3284 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3285 }
3286 }
3287
3288 {
3289 C_SaferCond waiter;
3290 if (!service.meta_ch->flush_commit(&waiter)) {
3291 waiter.wait();
3292 }
3293 }
3294 utime_t end = ceph_clock_now();
3295 *elapsed = end - start;
3296
3297 // clean up
3298 store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
3299 {
3300 C_SaferCond waiter;
3301 if (!service.meta_ch->flush_commit(&waiter)) {
3302 waiter.wait();
3303 }
3304 }
3305
3306 return ret;
3307 }
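// From the elapsed time returned here, the asok "bench" handler above
// reports:
//   bytes_per_sec = count / elapsed;   iops = bytes_per_sec / bsize.
// E.g. writing count = 1 GiB in bsize = 4 MiB blocks over 8 s yields
// ~128 MiB/s and ~32 IOPS. The test is typically driven via
//   ceph tell osd.<id> bench [count] [size] [object_size] [object_num]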
3308
3309 class TestOpsSocketHook : public AdminSocketHook {
3310 OSDService *service;
3311 ObjectStore *store;
3312 public:
3313 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3314 int call(std::string_view command, const cmdmap_t& cmdmap,
3315 Formatter *f,
3316 std::ostream& errss,
3317 bufferlist& out) override {
3318 int r = 0;
3319 stringstream outss;
3320 try {
3321 test_ops(service, store, command, cmdmap, outss);
3322 out.append(outss);
3323 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3324 errss << e.what();
3325 r = -EINVAL;
3326 }
3327 return r;
3328 }
3329 void test_ops(OSDService *service, ObjectStore *store,
3330 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3331
3332 };
3333
3334 class OSD::C_Tick : public Context {
3335 OSD *osd;
3336 public:
3337 explicit C_Tick(OSD *o) : osd(o) {}
3338 void finish(int r) override {
3339 osd->tick();
3340 }
3341 };
3342
3343 class OSD::C_Tick_WithoutOSDLock : public Context {
3344 OSD *osd;
3345 public:
3346 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3347 void finish(int r) override {
3348 osd->tick_without_osd_lock();
3349 }
3350 };
3351
3352 int OSD::enable_disable_fuse(bool stop)
3353 {
3354 #ifdef HAVE_LIBFUSE
3355 int r;
3356 string mntpath = cct->_conf->osd_data + "/fuse";
3357 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3358 dout(1) << __func__ << " disabling" << dendl;
3359 fuse_store->stop();
3360 delete fuse_store;
3361 fuse_store = NULL;
3362 r = ::rmdir(mntpath.c_str());
3363 if (r < 0) {
3364 r = -errno;
3365 derr << __func__ << " failed to rmdir " << mntpath << ": "
3366 << cpp_strerror(r) << dendl;
3367 return r;
3368 }
3369 return 0;
3370 }
3371 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3372 dout(1) << __func__ << " enabling" << dendl;
3373 r = ::mkdir(mntpath.c_str(), 0700);
3374 if (r < 0)
3375 r = -errno;
3376 if (r < 0 && r != -EEXIST) {
3377 derr << __func__ << " unable to create " << mntpath << ": "
3378 << cpp_strerror(r) << dendl;
3379 return r;
3380 }
3381 fuse_store = new FuseStore(store.get(), mntpath);
3382 r = fuse_store->start();
3383 if (r < 0) {
3384 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3385 delete fuse_store;
3386 fuse_store = NULL;
3387 return r;
3388 }
3389 }
3390 #endif // HAVE_LIBFUSE
3391 return 0;
3392 }
3393
3394 size_t OSD::get_num_cache_shards()
3395 {
3396 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3397 }
3398
3399 int OSD::get_num_op_shards()
3400 {
3401 if (cct->_conf->osd_op_num_shards)
3402 return cct->_conf->osd_op_num_shards;
3403 if (store_is_rotational)
3404 return cct->_conf->osd_op_num_shards_hdd;
3405 else
3406 return cct->_conf->osd_op_num_shards_ssd;
3407 }
3408
3409 int OSD::get_num_op_threads()
3410 {
3411 if (cct->_conf->osd_op_num_threads_per_shard)
3412 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3413 if (store_is_rotational)
3414 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3415 else
3416 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3417 }
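// Worked example (using the shipped defaults, which may differ between
// releases): a rotational store with osd_op_num_shards_hdd = 5 and
// osd_op_num_threads_per_shard_hdd = 1 yields a 5-thread tp_osd_tp pool,
// while an SSD store with 8 shards x 2 threads per shard yields 16.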
3418
3419 float OSD::get_osd_recovery_sleep()
3420 {
3421 if (cct->_conf->osd_recovery_sleep)
3422 return cct->_conf->osd_recovery_sleep;
3423 if (!store_is_rotational && !journal_is_rotational)
3424 return cct->_conf->osd_recovery_sleep_ssd;
3425 else if (store_is_rotational && !journal_is_rotational)
3426 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3427 else
3428 return cct->_conf->osd_recovery_sleep_hdd;
3429 }
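// This getter, get_osd_delete_sleep(), and get_osd_snap_trim_sleep()
// below share one selection rule: an explicitly set non-zero base option
// wins; otherwise the "_ssd" variant applies when both the data store
// and the journal/DB are non-rotational, "_hybrid" when a rotational
// store is fronted by a non-rotational journal, and "_hdd" otherwise.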
3430
3431 float OSD::get_osd_delete_sleep()
3432 {
3433 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3434 if (osd_delete_sleep > 0)
3435 return osd_delete_sleep;
3436 if (!store_is_rotational && !journal_is_rotational)
3437 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3438 if (store_is_rotational && !journal_is_rotational)
3439 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3440 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3441 }
3442
3443 int OSD::get_recovery_max_active()
3444 {
3445 if (cct->_conf->osd_recovery_max_active)
3446 return cct->_conf->osd_recovery_max_active;
3447 if (store_is_rotational)
3448 return cct->_conf->osd_recovery_max_active_hdd;
3449 else
3450 return cct->_conf->osd_recovery_max_active_ssd;
3451 }
3452
3453 float OSD::get_osd_snap_trim_sleep()
3454 {
3455 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3456 if (osd_snap_trim_sleep > 0)
3457 return osd_snap_trim_sleep;
3458 if (!store_is_rotational && !journal_is_rotational)
3459 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3460 if (store_is_rotational && !journal_is_rotational)
3461 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3462 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3463 }
3464
3465 int OSD::init()
3466 {
3467 OSDMapRef osdmap;
3468 CompatSet initial, diff;
3469 std::lock_guard lock(osd_lock);
3470 if (is_stopping())
3471 return 0;
3472 tracing::osd::tracer.init("osd");
3473 tick_timer.init();
3474 tick_timer_without_osd_lock.init();
3475 service.recovery_request_timer.init();
3476 service.sleep_timer.init();
3477
3478 boot_finisher.start();
3479
3480 {
3481 string val;
3482 store->read_meta("require_osd_release", &val);
3483 last_require_osd_release = ceph_release_from_name(val);
3484 }
3485
3486 // mount.
3487 dout(2) << "init " << dev_path
3488 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3489 << dendl;
3490 dout(2) << "journal " << journal_path << dendl;
3491 ceph_assert(store); // call pre_init() first!
3492
3493 store->set_cache_shards(get_num_cache_shards());
3494
3495 int rotating_auth_attempts = 0;
3496 auto rotating_auth_timeout =
3497 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3498
3499 int r = store->mount();
3500 if (r < 0) {
3501 derr << "OSD:init: unable to mount object store" << dendl;
3502 return r;
3503 }
3504 journal_is_rotational = store->is_journal_rotational();
3505 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3506 << dendl;
3507
3508 enable_disable_fuse(false);
3509
3510 dout(2) << "boot" << dendl;
3511
3512 service.meta_ch = store->open_collection(coll_t::meta());
3513 if (!service.meta_ch) {
3514 derr << "OSD:init: unable to open meta collection"
3515 << dendl;
3516 r = -ENOENT;
3517 goto out;
3518 }
3519 // initialize the daily loadavg with current 15min loadavg
3520 double loadavgs[3];
3521 if (getloadavg(loadavgs, 3) == 3) {
3522 daily_loadavg = loadavgs[2];
3523 } else {
3524 derr << "OSD::init() : couldn't read loadavgs" << dendl;
3525 daily_loadavg = 1.0;
3526 }
3527
3528 // sanity check long object name handling
3529 {
3530 hobject_t l;
3531 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3532 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3533 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3534 r = store->validate_hobject_key(l);
3535 if (r < 0) {
3536 derr << "backend (" << store->get_type() << ") is unable to support max "
3537 << "object name[space] len" << dendl;
3538 derr << " osd max object name len = "
3539 << cct->_conf->osd_max_object_name_len << dendl;
3540 derr << " osd max object namespace len = "
3541 << cct->_conf->osd_max_object_namespace_len << dendl;
3542 derr << cpp_strerror(r) << dendl;
3543 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3544 goto out;
3545 }
3546 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3547 << dendl;
3548 } else {
3549 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3550 }
3551 }
3552
3553 // read superblock
3554 r = read_superblock();
3555 if (r < 0) {
3556 derr << "OSD::init() : unable to read osd superblock" << dendl;
3557 r = -EINVAL;
3558 goto out;
3559 }
3560
3561 if (osd_compat.compare(superblock.compat_features) < 0) {
3562 derr << "The disk uses features unsupported by the executable." << dendl;
3563 derr << " ondisk features " << superblock.compat_features << dendl;
3564 derr << " daemon features " << osd_compat << dendl;
3565
3566 if (osd_compat.writeable(superblock.compat_features)) {
3567 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3568 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3569 r = -EOPNOTSUPP;
3570 goto out;
3571 }
3572 else {
3573 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3574 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3575 r = -EOPNOTSUPP;
3576 goto out;
3577 }
3578 }
3579
3580 assert_warn(whoami == superblock.whoami);
3581 if (whoami != superblock.whoami) {
3582 derr << "OSD::init: superblock says osd."
3583 << superblock.whoami << " but I am osd." << whoami << dendl;
3584 r = -EINVAL;
3585 goto out;
3586 }
3587
3588 startup_time = ceph::mono_clock::now();
3589
3590 // load up "current" osdmap
3591 assert_warn(!get_osdmap());
3592 if (get_osdmap()) {
3593 derr << "OSD::init: unable to read current osdmap" << dendl;
3594 r = -EINVAL;
3595 goto out;
3596 }
3597 osdmap = get_map(superblock.current_epoch);
3598 set_osdmap(osdmap);
3599
3600 // make sure we don't have legacy pgs whose deletion is still in progress
3601 {
3602 vector<coll_t> ls;
3603 int r = store->list_collections(ls);
3604 ceph_assert(r >= 0);
3605 for (auto c : ls) {
3606 spg_t pgid;
3607 if (c.is_pg(&pgid) &&
3608 !osdmap->have_pg_pool(pgid.pool())) {
3609 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3610 if (!store->exists(service.meta_ch, oid)) {
3611 derr << __func__ << " missing pg_pool_t for deleted pool "
3612 << pgid.pool() << " for pg " << pgid
3613 << "; please downgrade to luminous and allow "
3614 << "pg deletion to complete before upgrading" << dendl;
3615 ceph_abort();
3616 }
3617 }
3618 }
3619 }
3620
3621 initial = get_osd_initial_compat_set();
3622 diff = superblock.compat_features.unsupported(initial);
3623 if (superblock.compat_features.merge(initial)) {
3624 // Are we adding SNAPMAPPER2?
3625 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3626 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3627 << dendl;
3628 auto ch = service.meta_ch;
3629 auto hoid = make_snapmapper_oid();
3630 unsigned max = cct->_conf->osd_target_transaction_size;
3631 r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
3632 if (r < 0)
3633 goto out;
3634 }
3635 // We need to persist the new compat_set before we
3636 // do anything else
3637 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3638 ObjectStore::Transaction t;
3639 write_superblock(t);
3640 r = store->queue_transaction(service.meta_ch, std::move(t));
3641 if (r < 0)
3642 goto out;
3643 }
3644
3645 // make sure snap mapper object exists
3646 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3647 dout(10) << "init creating/touching snapmapper object" << dendl;
3648 ObjectStore::Transaction t;
3649 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3650 r = store->queue_transaction(service.meta_ch, std::move(t));
3651 if (r < 0)
3652 goto out;
3653 }
3654 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3655 dout(10) << "init creating/touching purged_snaps object" << dendl;
3656 ObjectStore::Transaction t;
3657 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3658 r = store->queue_transaction(service.meta_ch, std::move(t));
3659 if (r < 0)
3660 goto out;
3661 }
3662
3663 if (cct->_conf->osd_open_classes_on_start) {
3664 int r = ClassHandler::get_instance().open_all_classes();
3665 if (r)
3666 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3667 }
3668
3669 check_osdmap_features();
3670
3671 {
3672 epoch_t bind_epoch = osdmap->get_epoch();
3673 service.set_epochs(NULL, NULL, &bind_epoch);
3674 }
3675
3676 clear_temp_objects();
3677
3678 // initialize osdmap references in sharded wq
3679 for (auto& shard : shards) {
3680 std::lock_guard l(shard->osdmap_lock);
3681 shard->shard_osdmap = osdmap;
3682 }
3683
3684 // load up pgs (as they previously existed)
3685 load_pgs();
3686
3687 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3688
3689 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3690 dout(2) << "compacting object store's omap" << dendl;
3691 store->compact();
3692 }
3693
3694 // prime osd stats
3695 {
3696 struct store_statfs_t stbuf;
3697 osd_alert_list_t alerts;
3698 int r = store->statfs(&stbuf, &alerts);
3699 ceph_assert(r == 0);
3700 service.set_statfs(stbuf, alerts);
3701 }
3702
3703 // client_messenger's auth_client will be set up by monc->init() later.
3704 for (auto m : { cluster_messenger,
3705 objecter_messenger,
3706 hb_front_client_messenger,
3707 hb_back_client_messenger,
3708 hb_front_server_messenger,
3709 hb_back_server_messenger } ) {
3710 m->set_auth_client(monc);
3711 }
3712 for (auto m : { client_messenger,
3713 cluster_messenger,
3714 hb_front_server_messenger,
3715 hb_back_server_messenger }) {
3716 m->set_auth_server(monc);
3717 }
3718 monc->set_handle_authentication_dispatcher(this);
3719
3720 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3721 | CEPH_ENTITY_TYPE_MGR);
3722 r = monc->init();
3723 if (r < 0)
3724 goto out;
3725
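// Wire up the mgr client: the mgr pulls pg stats through collect_pg_stats()
// and drives dynamic perf queries through the two callbacks below (one to
// install a query configuration, one to collect the resulting reports).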
3726 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
3727 mgrc.set_perf_metric_query_cb(
3728 [this](const ConfigPayload &config_payload) {
3729 set_perf_queries(config_payload);
3730 },
3731 [this] {
3732 return get_perf_reports();
3733 });
3734 mgrc.init();
3735
3736 // tell monc about log_client so it will know about mon session resets
3737 monc->set_log_client(&log_client);
3738 update_log_config();
3739
3740 // i'm ready!
3741 client_messenger->add_dispatcher_tail(&mgrc);
3742 client_messenger->add_dispatcher_tail(this);
3743 cluster_messenger->add_dispatcher_head(this);
3744
3745 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3746 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3747 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3748 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3749
3750 objecter_messenger->add_dispatcher_head(service.objecter.get());
3751
3752 service.init();
3753 service.publish_map(osdmap);
3754 service.publish_superblock(superblock);
3755 service.max_oldest_map = superblock.oldest_map;
3756
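// Each PG may have been persisted against an older osdmap; compare that map
// with the current one and prime the shards for any pending splits/merges so
// new work lands in the right pg_slot.  prime_splits()/prime_merges() consume
// the entries they accept, which is why the sets are asserted empty below.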
3757 for (auto& shard : shards) {
3758 // put PGs in a temporary set because we may modify pg_slots
3759 // unordered_map below.
3760 set<PGRef> pgs;
3761 for (auto& i : shard->pg_slots) {
3762 PGRef pg = i.second->pg;
3763 if (!pg) {
3764 continue;
3765 }
3766 pgs.insert(pg);
3767 }
3768 for (auto pg : pgs) {
3769 std::scoped_lock l{*pg};
3770 set<pair<spg_t,epoch_t>> new_children;
3771 set<pair<spg_t,epoch_t>> merge_pgs;
3772 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3773 &new_children, &merge_pgs);
3774 if (!new_children.empty()) {
3775 for (auto shard : shards) {
3776 shard->prime_splits(osdmap, &new_children);
3777 }
3778 ceph_assert(new_children.empty());
3779 }
3780 if (!merge_pgs.empty()) {
3781 for (auto shard : shards) {
3782 shard->prime_merges(osdmap, &merge_pgs);
3783 }
3784 ceph_assert(merge_pgs.empty());
3785 }
3786 }
3787 }
3788
3789 osd_op_tp.start();
3790
3791 // start the heartbeat
3792 heartbeat_thread.create("osd_srv_heartbt");
3793
3794 // tick
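// Two timers are armed: tick_timer runs C_Tick under osd_lock, while
// tick_timer_without_osd_lock (guarded by tick_timer_lock) runs
// C_Tick_WithoutOSDLock for periodic work that must not wait on osd_lock.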
3795 tick_timer.add_event_after(get_tick_interval(),
3796 new C_Tick(this));
3797 {
3798 std::lock_guard l(tick_timer_lock);
3799 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3800 new C_Tick_WithoutOSDLock(this));
3801 }
3802
3803 osd_lock.unlock();
3804
3805 r = monc->authenticate();
3806 if (r < 0) {
3807 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3808 << dendl;
3809 exit(1);
3810 }
3811
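// Block until we hold usable rotating service keys.  Each wait is bounded by
// rotating_keys_bootstrap_timeout (read above); after more than
// max_rotating_auth_attempts failed waits we give up and exit.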
3812 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3813 derr << "unable to obtain rotating service keys; retrying" << dendl;
3814 ++rotating_auth_attempts;
3815 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3816 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3817 exit(1);
3818 }
3819 }
3820
3821 r = update_crush_device_class();
3822 if (r < 0) {
3823 derr << __func__ << " unable to update_crush_device_class: "
3824 << cpp_strerror(r) << dendl;
3825 exit(1);
3826 }
3827
3828 r = update_crush_location();
3829 if (r < 0) {
3830 derr << __func__ << " unable to update_crush_location: "
3831 << cpp_strerror(r) << dendl;
3832 exit(1);
3833 }
3834
3835 osd_lock.lock();
3836 if (is_stopping())
3837 return 0;
3838
3839 // start objecter *after* we have authenticated, so that we don't ignore
3840 // the OSDMaps it requests.
3841 service.final_init();
3842
3843 check_config();
3844
3845 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3846 consume_map();
3847
3848 dout(0) << "done with init, starting boot process" << dendl;
3849
3850 // subscribe to any pg creations
3851 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3852
3853 // MgrClient needs this (it doesn't have MonClient reference itself)
3854 monc->sub_want("mgrmap", 0, 0);
3855
3856 // we don't need to ask for an osdmap here; the objecter will subscribe for us
3857 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3858
3859 monc->renew_subs();
3860
3861 start_boot();
3862
3863 // Override a few options if mclock scheduler is enabled.
3864 maybe_override_max_osd_capacity_for_qos();
3865 maybe_override_options_for_qos();
3866
3867 return 0;
3868
3869 out:
3870 enable_disable_fuse(true);
3871 store->umount();
3872 store.reset();
3873 return r;
3874 }
3875
3876 void OSD::final_init()
3877 {
3878 AdminSocket *admin_socket = cct->get_admin_socket();
3879 asok_hook = new OSDSocketHook(this);
3880 int r = admin_socket->register_command("status", asok_hook,
3881 "high-level status of OSD");
3882 ceph_assert(r == 0);
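// Every command registered here becomes available on the admin socket and can
// be invoked with, e.g., `ceph daemon osd.<id> status` (id illustrative).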
3883 r = admin_socket->register_command("flush_journal",
3884 asok_hook,
3885 "flush the journal to permanent store");
3886 ceph_assert(r == 0);
3887 r = admin_socket->register_command("dump_ops_in_flight " \
3888 "name=filterstr,type=CephString,n=N,req=false",
3889 asok_hook,
3890 "show the ops currently in flight");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command("ops " \
3893 "name=filterstr,type=CephString,n=N,req=false",
3894 asok_hook,
3895 "show the ops currently in flight");
3896 ceph_assert(r == 0);
3897 r = admin_socket->register_command("dump_blocked_ops " \
3898 "name=filterstr,type=CephString,n=N,req=false",
3899 asok_hook,
3900 "show the blocked ops currently in flight");
3901 ceph_assert(r == 0);
3902 r = admin_socket->register_command("dump_historic_ops " \
3903 "name=filterstr,type=CephString,n=N,req=false",
3904 asok_hook,
3905 "show recent ops");
3906 ceph_assert(r == 0);
3907 r = admin_socket->register_command("dump_historic_slow_ops " \
3908 "name=filterstr,type=CephString,n=N,req=false",
3909 asok_hook,
3910 "show slowest recent ops");
3911 ceph_assert(r == 0);
3912 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3913 "name=filterstr,type=CephString,n=N,req=false",
3914 asok_hook,
3915 "show slowest recent ops, sorted by duration");
3916 ceph_assert(r == 0);
3917 r = admin_socket->register_command("dump_op_pq_state",
3918 asok_hook,
3919 "dump op queue state");
3920 ceph_assert(r == 0);
3921 r = admin_socket->register_command("dump_blocklist",
3922 asok_hook,
3923 "dump blocklisted clients and times");
3924 ceph_assert(r == 0);
3925 r = admin_socket->register_command("dump_watchers",
3926 asok_hook,
3927 "show clients which have active watches,"
3928 " and on which objects");
3929 ceph_assert(r == 0);
3930 r = admin_socket->register_command("dump_recovery_reservations",
3931 asok_hook,
3932 "show recovery reservations");
3933 ceph_assert(r == 0);
3934 r = admin_socket->register_command("dump_scrub_reservations",
3935 asok_hook,
3936 "show scrub reservations");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command("get_latest_osdmap",
3939 asok_hook,
3940 "force osd to update the latest map from "
3941 "the mon");
3942 ceph_assert(r == 0);
3943
3944 r = admin_socket->register_command("set_heap_property " \
3945 "name=property,type=CephString " \
3946 "name=value,type=CephInt",
3947 asok_hook,
3948 "update malloc extension heap property");
3949 ceph_assert(r == 0);
3950
3951 r = admin_socket->register_command("get_heap_property " \
3952 "name=property,type=CephString",
3953 asok_hook,
3954 "get malloc extension heap property");
3955 ceph_assert(r == 0);
3956
3957 r = admin_socket->register_command("dump_objectstore_kv_stats",
3958 asok_hook,
3959 "print statistics of kvdb which used by bluestore");
3960 ceph_assert(r == 0);
3961
3962 r = admin_socket->register_command("dump_scrubs",
3963 asok_hook,
3964 "print scheduled scrubs");
3965 ceph_assert(r == 0);
3966
3967 r = admin_socket->register_command("calc_objectstore_db_histogram",
3968 asok_hook,
3969 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3970 ceph_assert(r == 0);
3971
3972 r = admin_socket->register_command("flush_store_cache",
3973 asok_hook,
3974 "Flush bluestore internal cache");
3975 ceph_assert(r == 0);
3976 r = admin_socket->register_command("dump_pgstate_history",
3977 asok_hook,
3978 "show recent state history");
3979 ceph_assert(r == 0);
3980
3981 r = admin_socket->register_command("compact",
3982 asok_hook,
3983 "Commpact object store's omap."
3984 " WARNING: Compaction probably slows your requests");
3985 ceph_assert(r == 0);
3986
3987 r = admin_socket->register_command("get_mapped_pools",
3988 asok_hook,
3989 "dump pools whose PG(s) are mapped to this OSD.");
3990
3991 ceph_assert(r == 0);
3992
3993 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3994 asok_hook,
3995 "probe OSD devices for SMART data.");
3996
3997 ceph_assert(r == 0);
3998
3999 r = admin_socket->register_command("list_devices",
4000 asok_hook,
4001 "list OSD devices.");
ceph_assert(r == 0);
4002 r = admin_socket->register_command("send_beacon",
4003 asok_hook,
4004 "send OSD beacon to mon immediately");
ceph_assert(r == 0);
4005
4006 r = admin_socket->register_command(
4007 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
4008 "Dump osd heartbeat network ping times");
4009 ceph_assert(r == 0);
4010
4011 r = admin_socket->register_command(
4012 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
4013 "Dump store's statistics for the given pool");
4014 ceph_assert(r == 0);
4015
4016 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
4017 // Note: pools are CephString instead of CephPoolname because
4018 // these commands traditionally support both pool names and numbers
4019 r = admin_socket->register_command(
4020 "setomapval " \
4021 "name=pool,type=CephString " \
4022 "name=objname,type=CephObjectname " \
4023 "name=key,type=CephString "\
4024 "name=val,type=CephString",
4025 test_ops_hook,
4026 "set omap key");
4027 ceph_assert(r == 0);
4028 r = admin_socket->register_command(
4029 "rmomapkey " \
4030 "name=pool,type=CephString " \
4031 "name=objname,type=CephObjectname " \
4032 "name=key,type=CephString",
4033 test_ops_hook,
4034 "remove omap key");
4035 ceph_assert(r == 0);
4036 r = admin_socket->register_command(
4037 "setomapheader " \
4038 "name=pool,type=CephString " \
4039 "name=objname,type=CephObjectname " \
4040 "name=header,type=CephString",
4041 test_ops_hook,
4042 "set omap header");
4043 ceph_assert(r == 0);
4044
4045 r = admin_socket->register_command(
4046 "getomap " \
4047 "name=pool,type=CephString " \
4048 "name=objname,type=CephObjectname",
4049 test_ops_hook,
4050 "output entire object map");
4051 ceph_assert(r == 0);
4052
4053 r = admin_socket->register_command(
4054 "truncobj " \
4055 "name=pool,type=CephString " \
4056 "name=objname,type=CephObjectname " \
4057 "name=len,type=CephInt",
4058 test_ops_hook,
4059 "truncate object to length");
4060 ceph_assert(r == 0);
4061
4062 r = admin_socket->register_command(
4063 "injectdataerr " \
4064 "name=pool,type=CephString " \
4065 "name=objname,type=CephObjectname " \
4066 "name=shardid,type=CephInt,req=false,range=0|255",
4067 test_ops_hook,
4068 "inject data error to an object");
4069 ceph_assert(r == 0);
4070
4071 r = admin_socket->register_command(
4072 "injectmdataerr " \
4073 "name=pool,type=CephString " \
4074 "name=objname,type=CephObjectname " \
4075 "name=shardid,type=CephInt,req=false,range=0|255",
4076 test_ops_hook,
4077 "inject metadata error to an object");
4078 ceph_assert(r == 0);
4079 r = admin_socket->register_command(
4080 "set_recovery_delay " \
4081 "name=utime,type=CephInt,req=false",
4082 test_ops_hook,
4083 "Delay osd recovery by specified seconds");
4084 ceph_assert(r == 0);
4085 r = admin_socket->register_command(
4086 "injectfull " \
4087 "name=type,type=CephString,req=false " \
4088 "name=count,type=CephInt,req=false ",
4089 test_ops_hook,
4090 "Inject a full disk (optional count times)");
4091 ceph_assert(r == 0);
4092 r = admin_socket->register_command(
4093 "bench " \
4094 "name=count,type=CephInt,req=false " \
4095 "name=size,type=CephInt,req=false " \
4096 "name=object_size,type=CephInt,req=false " \
4097 "name=object_num,type=CephInt,req=false ",
4098 asok_hook,
4099 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4100 "(default count=1G default size=4MB). Results in log.");
4101 ceph_assert(r == 0);
4102 r = admin_socket->register_command(
4103 "cluster_log " \
4104 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4105 "name=message,type=CephString,n=N",
4106 asok_hook,
4107 "log a message to the cluster log");
4108 ceph_assert(r == 0);
4109 r = admin_socket->register_command(
4110 "flush_pg_stats",
4111 asok_hook,
4112 "flush pg stats");
4113 ceph_assert(r == 0);
4114 r = admin_socket->register_command(
4115 "heap " \
4116 "name=heapcmd,type=CephChoices,strings=" \
4117 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4118 "name=value,type=CephString,req=false",
4119 asok_hook,
4120 "show heap usage info (available only if compiled with tcmalloc)");
4121 ceph_assert(r == 0);
4122 r = admin_socket->register_command(
4123 "debug dump_missing " \
4124 "name=filename,type=CephFilepath",
4125 asok_hook,
4126 "dump missing objects to a named file");
4127 ceph_assert(r == 0);
4128 r = admin_socket->register_command(
4129 "debug kick_recovery_wq " \
4130 "name=delay,type=CephInt,range=0",
4131 asok_hook,
4132 "set osd_recovery_delay_start to <val>");
4133 ceph_assert(r == 0);
4134 r = admin_socket->register_command(
4135 "cpu_profiler " \
4136 "name=arg,type=CephChoices,strings=status|flush",
4137 asok_hook,
4138 "run cpu profiling on daemon");
4139 ceph_assert(r == 0);
4140 r = admin_socket->register_command(
4141 "dump_pg_recovery_stats",
4142 asok_hook,
4143 "dump pg recovery statistics");
4144 ceph_assert(r == 0);
4145 r = admin_socket->register_command(
4146 "reset_pg_recovery_stats",
4147 asok_hook,
4148 "reset pg recovery statistics");
4149 ceph_assert(r == 0);
4150 r = admin_socket->register_command(
4151 "cache drop",
4152 asok_hook,
4153 "Drop all OSD caches");
4154 ceph_assert(r == 0);
4155 r = admin_socket->register_command(
4156 "cache status",
4157 asok_hook,
4158 "Get OSD caches statistics");
4159 ceph_assert(r == 0);
4160 r = admin_socket->register_command(
4161 "scrub_purged_snaps",
4162 asok_hook,
4163 "Scrub purged_snaps vs snapmapper index");
4164 ceph_assert(r == 0);
4165 r = admin_socket->register_command(
4166 "scrubdebug " \
4167 "name=pgid,type=CephPgid " \
4168 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4169 "name=value,type=CephString,req=false",
4170 asok_hook,
4171 "debug the scrubber");
4172 ceph_assert(r == 0);
4173
4174 // -- pg commands --
4175 // old form: ceph pg <pgid> command ...
4176 r = admin_socket->register_command(
4177 "pg " \
4178 "name=pgid,type=CephPgid " \
4179 "name=cmd,type=CephChoices,strings=query",
4180 asok_hook,
4181 "");
4182 ceph_assert(r == 0);
4183 r = admin_socket->register_command(
4184 "pg " \
4185 "name=pgid,type=CephPgid " \
4186 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4187 "name=mulcmd,type=CephChoices,strings=revert|delete",
4188 asok_hook,
4189 "");
4190 ceph_assert(r == 0);
4191 r = admin_socket->register_command(
4192 "pg " \
4193 "name=pgid,type=CephPgid " \
4194 "name=cmd,type=CephChoices,strings=list_unfound " \
4195 "name=offset,type=CephString,req=false",
4196 asok_hook,
4197 "");
4198 ceph_assert(r == 0);
4199 r = admin_socket->register_command(
4200 "pg " \
4201 "name=pgid,type=CephPgid " \
4202 "name=cmd,type=CephChoices,strings=scrub " \
4203 "name=time,type=CephInt,req=false",
4204 asok_hook,
4205 "");
4206 ceph_assert(r == 0);
4207 r = admin_socket->register_command(
4208 "pg " \
4209 "name=pgid,type=CephPgid " \
4210 "name=cmd,type=CephChoices,strings=deep_scrub " \
4211 "name=time,type=CephInt,req=false",
4212 asok_hook,
4213 "");
4214 ceph_assert(r == 0);
4215 // new form: tell <pgid> <cmd> for both cli and rest
4216 r = admin_socket->register_command(
4217 "query",
4218 asok_hook,
4219 "show details of a specific pg");
4220 ceph_assert(r == 0);
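// e.g. `ceph tell <pgid> query` reaches the primary OSD for that pg
// (pgid illustrative, such as 2.1f).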
4221 r = admin_socket->register_command(
4222 "mark_unfound_lost " \
4223 "name=pgid,type=CephPgid,req=false " \
4224 "name=mulcmd,type=CephChoices,strings=revert|delete",
4225 asok_hook,
4226 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4227 ceph_assert(r == 0);
4228 r = admin_socket->register_command(
4229 "list_unfound " \
4230 "name=pgid,type=CephPgid,req=false " \
4231 "name=offset,type=CephString,req=false",
4232 asok_hook,
4233 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4234 ceph_assert(r == 0);
4235 r = admin_socket->register_command(
4236 "scrub " \
4237 "name=pgid,type=CephPgid,req=false " \
4238 "name=time,type=CephInt,req=false",
4239 asok_hook,
4240 "Trigger a scheduled scrub ");
4241 ceph_assert(r == 0);
4242 r = admin_socket->register_command(
4243 "deep_scrub " \
4244 "name=pgid,type=CephPgid,req=false " \
4245 "name=time,type=CephInt,req=false",
4246 asok_hook,
4247 "Trigger a scheduled deep scrub ");
4248 ceph_assert(r == 0);
4249 }
4250
4251 PerfCounters* OSD::create_logger()
4252 {
4253 PerfCounters* logger = build_osd_logger(cct);
4254 cct->get_perfcounters_collection()->add(logger);
4255 return logger;
4256 }
4257
4258 PerfCounters* OSD::create_recoverystate_perf()
4259 {
4260 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4261 cct->get_perfcounters_collection()->add(recoverystate_perf);
4262 return recoverystate_perf;
4263 }
4264
4265 int OSD::shutdown()
4266 {
4267 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4268 //cct->_conf->osd_fast_shutdown = true;
4269
4270 dout(0) << "Fast Shutdown: cct->_conf->osd_fast_shutdown = "
4271 << cct->_conf->osd_fast_shutdown
4272 << ", null-fm = " << store->has_null_manager() << dendl;
4273
4274 utime_t start_time_func = ceph_clock_now();
4275
4276 if (cct->_conf->osd_fast_shutdown) {
4277 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4278 if (cct->_conf->osd_fast_shutdown_notify_mon)
4279 service.prepare_to_stop();
4280
4281 // Unless we are running with the NULL freelist manager there is no state
// we need to keep, so we can exit immediately; with NULL-FM we fall through
// so the umount below can persist the allocator state.
4282 if (!store->has_null_manager()) {
4283 cct->_log->flush();
4284 _exit(0);
4285 }
4286 } else if (!service.prepare_to_stop()) {
4287 return 0; // already shutting down
4288 }
4289
4290 osd_lock.lock();
4291 if (is_stopping()) {
4292 osd_lock.unlock();
4293 return 0;
4294 }
4295
4296 if (!cct->_conf->osd_fast_shutdown) {
4297 dout(0) << "shutdown" << dendl;
4298 }
4299
4300 // don't accept new tasks for this OSD
4301 set_state(STATE_STOPPING);
4302
4303 // Debug logging is skipped during fast shutdown
4304 if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4305 cct->_conf.set_val("debug_osd", "100");
4306 cct->_conf.set_val("debug_journal", "100");
4307 cct->_conf.set_val("debug_filestore", "100");
4308 cct->_conf.set_val("debug_bluestore", "100");
4309 cct->_conf.set_val("debug_ms", "100");
4310 cct->_conf.apply_changes(nullptr);
4311 }
4312
4313 if (cct->_conf->osd_fast_shutdown) {
4314 // first, stop new tasks from being taken from op_shardedwq
4315 // and clear all pending tasks
4316 op_shardedwq.stop_for_fast_shutdown();
4317
4318 utime_t start_time_timer = ceph_clock_now();
4319 tick_timer.shutdown();
4320 {
4321 std::lock_guard l(tick_timer_lock);
4322 tick_timer_without_osd_lock.shutdown();
4323 }
4324
4325 osd_lock.unlock();
4326 utime_t start_time_osd_drain = ceph_clock_now();
4327
4328 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4329 osd_op_tp.drain();
4330 osd_op_tp.stop();
4331
4332 utime_t start_time_umount = ceph_clock_now();
4333 store->prepare_for_fast_shutdown();
4334 std::lock_guard lock(osd_lock);
4335 // TBD: assert in allocator that nothing is being added
4336 store->umount();
4337
4338 utime_t end_time = ceph_clock_now();
4339 if (cct->_conf->osd_fast_shutdown_timeout) {
4340 ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4341 }
4342 dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
4343 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
4344 dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
4345 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
4346 cct->_log->flush();
4347
4348 // now it is safe to exit
4349 _exit(0);
4350 }
4351
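// From here on: the orderly (non-fast) shutdown path.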
4352 // stop MgrClient earlier as it's more like an internal consumer of OSD
4353 mgrc.shutdown();
4354
4355 service.start_shutdown();
4356
4357 // stop sending work to pgs. this just prevents any new work in _process
4358 // from racing with on_shutdown and potentially entering the pg after.
4359 op_shardedwq.drain();
4360
4361 // Shutdown PGs
4362 {
4363 vector<PGRef> pgs;
4364 _get_pgs(&pgs);
4365 for (auto pg : pgs) {
4366 pg->shutdown();
4367 }
4368 }
4369
4370 // drain op queue again (in case PGs requeued something)
4371 op_shardedwq.drain();
4372 {
4373 finished.clear(); // zap waiters (bleh, this is messy)
4374 waiting_for_osdmap.clear();
4375 }
4376
4377 // unregister commands
4378 cct->get_admin_socket()->unregister_commands(asok_hook);
4379 delete asok_hook;
4380 asok_hook = NULL;
4381
4382 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4383 delete test_ops_hook;
4384 test_ops_hook = NULL;
4385
4386 osd_lock.unlock();
4387
4388 {
4389 std::lock_guard l{heartbeat_lock};
4390 heartbeat_stop = true;
4391 heartbeat_cond.notify_all();
4392 heartbeat_peers.clear();
4393 }
4394 heartbeat_thread.join();
4395
4396 hb_back_server_messenger->mark_down_all();
4397 hb_front_server_messenger->mark_down_all();
4398 hb_front_client_messenger->mark_down_all();
4399 hb_back_client_messenger->mark_down_all();
4400
4401 osd_op_tp.drain();
4402 osd_op_tp.stop();
4403 dout(10) << "op sharded tp stopped" << dendl;
4404
4405 dout(10) << "stopping agent" << dendl;
4406 service.agent_stop();
4407
4408 boot_finisher.wait_for_empty();
4409
4410 osd_lock.lock();
4411
4412 boot_finisher.stop();
4413 reset_heartbeat_peers(true);
4414
4415 tick_timer.shutdown();
4416
4417 {
4418 std::lock_guard l(tick_timer_lock);
4419 tick_timer_without_osd_lock.shutdown();
4420 }
4421
4422 // note unmount epoch
4423 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
4424 superblock.mounted = service.get_boot_epoch();
4425 superblock.clean_thru = get_osdmap_epoch();
4426 ObjectStore::Transaction t;
4427 write_superblock(t);
4428 int r = store->queue_transaction(service.meta_ch, std::move(t));
4429 if (r) {
4430 derr << "OSD::shutdown: error writing superblock: "
4431 << cpp_strerror(r) << dendl;
4432 }
4433
4434
4435 service.shutdown_reserver();
4436
4437 // Remove PGs
4438 #ifdef PG_DEBUG_REFS
4439 service.dump_live_pgids();
4440 #endif
4441 while (true) {
4442 vector<PGRef> pgs;
4443 _get_pgs(&pgs, true);
4444 if (pgs.empty()) {
4445 break;
4446 }
4447 for (auto& pg : pgs) {
4448 if (pg->is_deleted()) {
4449 continue;
4450 }
4451 dout(20) << " kicking pg " << pg << dendl;
4452 pg->lock();
4453 if (pg->get_num_ref() != 1) {
4454 derr << "pgid " << pg->get_pgid() << " has ref count of "
4455 << pg->get_num_ref() << dendl;
4456 #ifdef PG_DEBUG_REFS
4457 pg->dump_live_ids();
4458 #endif
4459 if (cct->_conf->osd_shutdown_pgref_assert) {
4460 ceph_abort();
4461 }
4462 }
4463 pg->ch.reset();
4464 pg->unlock();
4465 }
4466 }
4467 #ifdef PG_DEBUG_REFS
4468 service.dump_live_pgids();
4469 #endif
4470
4471 osd_lock.unlock();
4472 cct->_conf.remove_observer(this);
4473 osd_lock.lock();
4474
4475 service.meta_ch.reset();
4476
4477 dout(10) << "syncing store" << dendl;
4478 enable_disable_fuse(true);
4479
4480 if (cct->_conf->osd_journal_flush_on_shutdown) {
4481 dout(10) << "flushing journal" << dendl;
4482 store->flush_journal();
4483 }
4484
4485 monc->shutdown();
4486 osd_lock.unlock();
4487 {
4488 std::unique_lock l{map_lock};
4489 set_osdmap(OSDMapRef());
4490 }
4491 for (auto s : shards) {
4492 std::lock_guard l(s->osdmap_lock);
4493 s->shard_osdmap = OSDMapRef();
4494 }
4495 service.shutdown();
4496
4497 std::lock_guard lock(osd_lock);
4498 store->umount();
4499 store.reset();
4500 dout(10) << "Store synced" << dendl;
4501
4502 op_tracker.on_shutdown();
4503
4504 ClassHandler::get_instance().shutdown();
4505 client_messenger->shutdown();
4506 cluster_messenger->shutdown();
4507 hb_front_client_messenger->shutdown();
4508 hb_back_client_messenger->shutdown();
4509 objecter_messenger->shutdown();
4510 hb_front_server_messenger->shutdown();
4511 hb_back_server_messenger->shutdown();
4512
4513 utime_t duration = ceph_clock_now() - start_time_func;
4514 dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
4515
4516 tracing::osd::tracer.shutdown();
4517
4518 return r;
4519 }
4520
4521 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4522 {
4523 bool created = false;
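// Send the command as-is; if the mon answers -ENOENT the osd id is not in
// the osdmap yet, so issue a one-time "osd create" with our id and fsid and
// then retry the original command.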
4524 while (true) {
4525 dout(10) << __func__ << " cmd: " << cmd << dendl;
4526 vector<string> vcmd{cmd};
4527 bufferlist inbl;
4528 C_SaferCond w;
4529 string outs;
4530 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4531 int r = w.wait();
4532 if (r < 0) {
4533 if (r == -ENOENT && !created) {
4534 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4535 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4536 vector<string> vnewcmd{newcmd};
4537 bufferlist inbl;
4538 C_SaferCond w;
4539 string outs;
4540 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4541 int r = w.wait();
4542 if (r < 0) {
4543 derr << __func__ << " fail: osd does not exist and create failed: "
4544 << cpp_strerror(r) << dendl;
4545 return r;
4546 }
4547 created = true;
4548 continue;
4549 }
4550 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4551 return r;
4552 }
4553 break;
4554 }
4555
4556 return 0;
4557 }
4558
4559 int OSD::update_crush_location()
4560 {
4561 if (!cct->_conf->osd_crush_update_on_start) {
4562 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4563 return 0;
4564 }
4565
4566 char weight[32];
4567 if (cct->_conf->osd_crush_initial_weight >= 0) {
4568 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4569 } else {
4570 struct store_statfs_t st;
4571 osd_alert_list_t alerts;
4572 int r = store->statfs(&st, &alerts);
4573 if (r < 0) {
4574 derr << "statfs: " << cpp_strerror(r) << dendl;
4575 return r;
4576 }
4577 snprintf(weight, sizeof(weight), "%.4lf",
4578 std::max(.00001,
4579 double(st.total) /
4580 double(1ull << 40 /* TB */)));
4581 }
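// Worked example: a 4 TB device reports st.total = 4e12 bytes, giving
// 4e12 / 2^40 ~= 3.6380 as the initial crush weight.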
4582
4583 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4584
4585 string cmd =
4586 string("{\"prefix\": \"osd crush create-or-move\", ") +
4587 string("\"id\": ") + stringify(whoami) + ", " +
4588 string("\"weight\":") + weight + ", " +
4589 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4590 return mon_cmd_maybe_osd_create(cmd);
4591 }
4592
4593 int OSD::update_crush_device_class()
4594 {
4595 if (!cct->_conf->osd_class_update_on_start) {
4596 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4597 return 0;
4598 }
4599
4600 string device_class;
4601 int r = store->read_meta("crush_device_class", &device_class);
4602 if (r < 0 || device_class.empty()) {
4603 device_class = store->get_default_device_class();
4604 }
4605
4606 if (device_class.empty()) {
4607 dout(20) << __func__ << " no device class stored locally" << dendl;
4608 return 0;
4609 }
4610
4611 string cmd =
4612 string("{\"prefix\": \"osd crush set-device-class\", ") +
4613 string("\"class\": \"") + device_class + string("\", ") +
4614 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4615
4616 r = mon_cmd_maybe_osd_create(cmd);
4617 if (r == -EBUSY) {
4618 // good, already bound to a device-class
4619 return 0;
4620 } else {
4621 return r;
4622 }
4623 }
4624
4625 void OSD::write_superblock(ObjectStore::Transaction& t)
4626 {
4627 dout(10) << "write_superblock " << superblock << dendl;
4628
4629 // hack: at minimum it's using the baseline feature set
4630 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4631 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4632
4633 bufferlist bl;
4634 encode(superblock, bl);
4635 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4636 }
4637
4638 int OSD::read_superblock()
4639 {
4640 bufferlist bl;
4641 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4642 if (r < 0)
4643 return r;
4644
4645 auto p = bl.cbegin();
4646 decode(superblock, p);
4647
4648 dout(10) << "read_superblock " << superblock << dendl;
4649
4650 return 0;
4651 }
4652
4653 void OSD::clear_temp_objects()
4654 {
4655 dout(10) << __func__ << dendl;
4656 vector<coll_t> ls;
4657 store->list_collections(ls);
4658 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4659 spg_t pgid;
4660 if (!p->is_pg(&pgid))
4661 continue;
4662
4663 // list temp objects
4664 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4665
4666 vector<ghobject_t> temps;
4667 ghobject_t next;
4668 while (1) {
4669 vector<ghobject_t> objects;
4670 auto ch = store->open_collection(*p);
4671 ceph_assert(ch);
4672 store->collection_list(ch, next, ghobject_t::get_max(),
4673 store->get_ideal_list_max(),
4674 &objects, &next);
4675 if (objects.empty())
4676 break;
4677 vector<ghobject_t>::iterator q;
4678 for (q = objects.begin(); q != objects.end(); ++q) {
4679 // Hammer set pool for temps to -1, so check for clean-up
4680 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4681 temps.push_back(*q);
4682 } else {
4683 break;
4684 }
4685 }
4686 // If we saw a non-temp object and hit the break above we can
4687 // break out of the while loop too.
4688 if (q != objects.end())
4689 break;
4690 }
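// Delete what we found, splitting the removals into transactions of at most
// osd_target_transaction_size operations each.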
4691 if (!temps.empty()) {
4692 ObjectStore::Transaction t;
4693 int removed = 0;
4694 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4695 dout(20) << " removing " << *p << " object " << *q << dendl;
4696 t.remove(*p, *q);
4697 if (++removed > cct->_conf->osd_target_transaction_size) {
4698 store->queue_transaction(service.meta_ch, std::move(t));
4699 t = ObjectStore::Transaction();
4700 removed = 0;
4701 }
4702 }
4703 if (removed) {
4704 store->queue_transaction(service.meta_ch, std::move(t));
4705 }
4706 }
4707 }
4708 }
4709
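// Remove every object in the collection -- clearing each object's snap
// mappings via the SnapMapper -- in transactions bounded by
// osd_target_transaction_size, then remove the collection itself and wait
// for the removal to commit.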
4710 void OSD::recursive_remove_collection(CephContext* cct,
4711 ObjectStore *store, spg_t pgid,
4712 coll_t tmp)
4713 {
4714 OSDriver driver(
4715 store,
4716 coll_t(),
4717 make_snapmapper_oid());
4718
4719 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4720 ObjectStore::Transaction t;
4721 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4722
4723 ghobject_t next;
4724 int max = cct->_conf->osd_target_transaction_size;
4725 vector<ghobject_t> objects;
4726 objects.reserve(max);
4727 while (true) {
4728 objects.clear();
4729 store->collection_list(ch, next, ghobject_t::get_max(),
4730 max, &objects, &next);
4731 generic_dout(10) << __func__ << " " << objects << dendl;
4732 if (objects.empty())
4733 break;
4734 for (auto& p: objects) {
4735 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4736 int r = mapper.remove_oid(p.hobj, &_t);
4737 if (r != 0 && r != -ENOENT)
4738 ceph_abort();
4739 t.remove(tmp, p);
4740 }
4741 int r = store->queue_transaction(ch, std::move(t));
4742 ceph_assert(r == 0);
4743 t = ObjectStore::Transaction();
4744 }
4745 t.remove_collection(tmp);
4746 int r = store->queue_transaction(ch, std::move(t));
4747 ceph_assert(r == 0);
4748
4749 C_SaferCond waiter;
4750 if (!ch->flush_commit(&waiter)) {
4751 waiter.wait();
4752 }
4753 }
4754
4755
4756 // ======================================================
4757 // PG's
4758
4759 PG* OSD::_make_pg(
4760 OSDMapRef createmap,
4761 spg_t pgid)
4762 {
4763 dout(10) << __func__ << " " << pgid << dendl;
4764 pg_pool_t pi;
4765 map<string,string> ec_profile;
4766 string name;
4767 if (createmap->have_pg_pool(pgid.pool())) {
4768 pi = *createmap->get_pg_pool(pgid.pool());
4769 name = createmap->get_pool_name(pgid.pool());
4770 if (pi.is_erasure()) {
4771 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4772 }
4773 } else {
4774 // pool was deleted; grab final pg_pool_t off disk.
4775 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4776 bufferlist bl;
4777 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4778 if (r < 0) {
4779 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4780 << dendl;
4781 return nullptr;
4782 }
4783 ceph_assert(r >= 0);
4784 auto p = bl.cbegin();
4785 decode(pi, p);
4786 decode(name, p);
4787 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4788 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4789 << " tombstone" << dendl;
4790 return nullptr;
4791 }
4792 decode(ec_profile, p);
4793 }
4794 PGPool pool(createmap, pgid.pool(), pi, name);
4795 PG *pg;
4796 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4797 pi.type == pg_pool_t::TYPE_ERASURE)
4798 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4799 else
4800 ceph_abort();
4801 return pg;
4802 }
4803
4804 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4805 {
4806 v->clear();
4807 v->reserve(get_num_pgs());
4808 for (auto& s : shards) {
4809 std::lock_guard l(s->shard_lock);
4810 for (auto& j : s->pg_slots) {
4811 if (j.second->pg &&
4812 !j.second->pg->is_deleted()) {
4813 v->push_back(j.second->pg);
4814 if (clear_too) {
4815 s->_detach_pg(j.second.get());
4816 }
4817 }
4818 }
4819 }
4820 }
4821
4822 void OSD::_get_pgids(vector<spg_t> *v)
4823 {
4824 v->clear();
4825 v->reserve(get_num_pgs());
4826 for (auto& s : shards) {
4827 std::lock_guard l(s->shard_lock);
4828 for (auto& j : s->pg_slots) {
4829 if (j.second->pg &&
4830 !j.second->pg->is_deleted()) {
4831 v->push_back(j.first);
4832 }
4833 }
4834 }
4835 }
4836
4837 void OSD::register_pg(PGRef pg)
4838 {
4839 spg_t pgid = pg->get_pgid();
4840 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4841 auto sdata = shards[shard_index];
4842 std::lock_guard l(sdata->shard_lock);
4843 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4844 ceph_assert(r.second);
4845 auto *slot = r.first->second.get();
4846 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4847 sdata->_attach_pg(slot, pg.get());
4848 }
4849
4850 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4851 {
4852 auto sdata = pg->osd_shard;
4853 ceph_assert(sdata);
4854 {
4855 std::lock_guard l(sdata->shard_lock);
4856 auto p = sdata->pg_slots.find(pg->pg_id);
4857 if (p == sdata->pg_slots.end() ||
4858 !p->second->pg) {
4859 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4860 return false;
4861 }
4862 if (p->second->waiting_for_merge_epoch) {
4863 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4864 return false;
4865 }
4866 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4867 sdata->_detach_pg(p->second.get());
4868 }
4869
4870 for (auto shard : shards) {
4871 shard->unprime_split_children(pg->pg_id, old_pg_num);
4872 }
4873
4874 // update pg count now since we might not get an osdmap any time soon.
4875 if (pg->is_primary())
4876 service.logger->dec(l_osd_pg_primary);
4877 else if (pg->is_nonprimary())
4878 service.logger->dec(l_osd_pg_replica); // misnomer
4879 else
4880 service.logger->dec(l_osd_pg_stray);
4881
4882 return true;
4883 }
4884
4885 PGRef OSD::_lookup_pg(spg_t pgid)
4886 {
4887 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4888 auto sdata = shards[shard_index];
4889 std::lock_guard l(sdata->shard_lock);
4890 auto p = sdata->pg_slots.find(pgid);
4891 if (p == sdata->pg_slots.end()) {
4892 return nullptr;
4893 }
4894 return p->second->pg;
4895 }
4896
4897 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4898 {
4899 PGRef pg = _lookup_pg(pgid);
4900 if (!pg) {
4901 return nullptr;
4902 }
4903 pg->lock();
4904 if (!pg->is_deleted()) {
4905 return pg;
4906 }
4907 pg->unlock();
4908 return nullptr;
4909 }
4910
4911 PGRef OSD::lookup_lock_pg(spg_t pgid)
4912 {
4913 return _lookup_lock_pg(pgid);
4914 }
4915
4916 void OSD::load_pgs()
4917 {
4918 ceph_assert(ceph_mutex_is_locked(osd_lock));
4919 dout(0) << "load_pgs" << dendl;
4920
4921 {
4922 auto pghist = make_pg_num_history_oid();
4923 bufferlist bl;
4924 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4925 if (r >= 0 && bl.length() > 0) {
4926 auto p = bl.cbegin();
4927 decode(pg_num_history, p);
4928 }
4929 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4930 }
4931
4932 vector<coll_t> ls;
4933 int r = store->list_collections(ls);
4934 if (r < 0) {
4935 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4936 }
4937
4938 int num = 0;
4939 for (vector<coll_t>::iterator it = ls.begin();
4940 it != ls.end();
4941 ++it) {
4942 spg_t pgid;
4943 if (it->is_temp(&pgid) ||
4944 (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
4945 dout(10) << "load_pgs " << *it
4946 << " removing, legacy or flagged for removal pg" << dendl;
4947 recursive_remove_collection(cct, store.get(), pgid, *it);
4948 continue;
4949 }
4950
4951 if (!it->is_pg(&pgid)) {
4952 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4953 continue;
4954 }
4955
4956 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4957 epoch_t map_epoch = 0;
4958 int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
4959 if (r < 0) {
4960 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4961 << dendl;
4962 continue;
4963 }
4964
4965 PGRef pg;
4966 if (map_epoch > 0) {
4967 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4968 if (!pgosdmap) {
4969 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4970 derr << __func__ << ": could not find map for epoch " << map_epoch
4971 << " on pg " << pgid << ", but the pool is not present in the "
4972 << "current map, so this is probably a result of bug 10617. "
4973 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4974 << "to clean it up later." << dendl;
4975 continue;
4976 } else {
4977 derr << __func__ << ": have pgid " << pgid << " at epoch "
4978 << map_epoch << ", but missing map. Crashing."
4979 << dendl;
4980 ceph_abort_msg("Missing map in load_pgs");
4981 }
4982 }
4983 pg = _make_pg(pgosdmap, pgid);
4984 } else {
4985 pg = _make_pg(get_osdmap(), pgid);
4986 }
4987 if (!pg) {
4988 recursive_remove_collection(cct, store.get(), pgid, *it);
4989 continue;
4990 }
4991
4992 // there can be no waiters here, so we don't call _wake_pg_slot
4993
4994 pg->lock();
4995 pg->ch = store->open_collection(pg->coll);
4996
4997 // read pg state, log
4998 pg->read_state(store.get());
4999
5000 if (pg->dne()) {
5001 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
5002 pg->ch = nullptr;
5003 pg->unlock();
5004 recursive_remove_collection(cct, store.get(), pgid, *it);
5005 continue;
5006 }
5007 {
5008 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5009 ceph_assert(NULL != shards[shard_index]);
5010 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5011 }
5012
5013 dout(10) << __func__ << " loaded " << *pg << dendl;
5014 pg->unlock();
5015
5016 register_pg(pg);
5017 ++num;
5018 }
5019 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
5020 }
5021
5022
5023 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5024 const PGCreateInfo *info)
5025 {
5026 spg_t pgid = info->pgid;
5027
5028 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5029 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5030 return nullptr;
5031 }
5032
5033 OSDMapRef startmap = get_map(info->epoch);
5034
5035 if (info->by_mon) {
5036 int64_t pool_id = pgid.pgid.pool();
5037 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5038 if (!pool) {
5039 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5040 return nullptr;
5041 }
5042 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
5043 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5044 // this ensures we do not process old creating messages after the
5045 // pool's initial pgs have been created (and pgs are subsequently
5046 // allowed to split or merge).
5047 dout(20) << __func__ << " dropping " << pgid
5048 << "create, pool does not have CREATING flag set" << dendl;
5049 return nullptr;
5050 }
5051 }
5052
5053 int up_primary, acting_primary;
5054 vector<int> up, acting;
5055 startmap->pg_to_up_acting_osds(
5056 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5057
5058 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5059 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5060 store->get_type() != "bluestore") {
5061 clog->warn() << "pg " << pgid
5062 << " is at risk of silent data corruption: "
5063 << "the pool allows ec overwrites but is not stored in "
5064 << "bluestore, so deep scrubbing will not detect bitrot";
5065 }
5066 PeeringCtx rctx;
5067 create_pg_collection(
5068 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5069 init_pg_ondisk(rctx.transaction, pgid, pp);
5070
5071 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
5072
5073 PGRef pg = _make_pg(startmap, pgid);
5074 pg->ch = store->create_new_collection(pg->coll);
5075
5076 {
5077 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5078 ceph_assert(NULL != shards[shard_index]);
5079 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5080 }
5081
5082 pg->lock(true);
5083
5084 // we are holding the shard lock
5085 ceph_assert(!pg->is_deleted());
5086
5087 pg->init(
5088 role,
5089 up,
5090 up_primary,
5091 acting,
5092 acting_primary,
5093 info->history,
5094 info->past_intervals,
5095 rctx.transaction);
5096
5097 pg->init_collection_pool_opts();
5098
5099 if (pg->is_primary()) {
5100 std::lock_guard locker{m_perf_queries_lock};
5101 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5102 }
5103
5104 pg->handle_initialize(rctx);
5105 pg->handle_activate_map(rctx);
5106
5107 dispatch_context(rctx, pg.get(), osdmap, nullptr);
5108
5109 dout(10) << __func__ << " new pg " << *pg << dendl;
5110 return pg;
5111 }
5112
5113 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
5114 spg_t pgid,
5115 bool is_mon_create)
5116 {
5117 const auto max_pgs_per_osd =
5118 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5119 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5120
5121 if (num_pgs < max_pgs_per_osd) {
5122 return false;
5123 }
5124
5125 std::lock_guard l(pending_creates_lock);
5126 if (is_mon_create) {
5127 pending_creates_from_mon++;
5128 } else {
5129 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
5130 pending_creates_from_osd.emplace(pgid, is_primary);
5131 }
5132 dout(1) << __func__ << " withhold creation of pg " << pgid
5133 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
5134 return true;
5135 }
5136
5137 // to re-trigger peering, we have to twiddle the pg mapping a little bit;
5138 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back
5139 // to the up set if pg_temp is empty, so an empty pg_temp won't work.
5140 static vector<int32_t> twiddle(const vector<int>& acting) {
5141 if (acting.size() > 1) {
5142 return {acting[0]};
5143 } else {
5144 vector<int32_t> twiddled(acting.begin(), acting.end());
5145 twiddled.push_back(-1);
5146 return twiddled;
5147 }
5148 }
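// e.g. twiddle({3, 7}) -> {3} and twiddle({5}) -> {5, -1}: either change is
// enough for should_restart_peering() to observe a new mapping.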
5149
5150 void OSD::resume_creating_pg()
5151 {
5152 bool do_sub_pg_creates = false;
5153 bool have_pending_creates = false;
5154 {
5155 const auto max_pgs_per_osd =
5156 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5157 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5158 if (max_pgs_per_osd <= num_pgs) {
5159 // this could happen if the admin decreases this setting before a PG is removed
5160 return;
5161 }
5162 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
5163 std::lock_guard l(pending_creates_lock);
5164 if (pending_creates_from_mon > 0) {
5165 dout(20) << __func__ << " pending_creates_from_mon "
5166 << pending_creates_from_mon << dendl;
5167 do_sub_pg_creates = true;
5168 if (pending_creates_from_mon >= spare_pgs) {
5169 spare_pgs = pending_creates_from_mon = 0;
5170 } else {
5171 spare_pgs -= pending_creates_from_mon;
5172 pending_creates_from_mon = 0;
5173 }
5174 }
5175 auto pg = pending_creates_from_osd.cbegin();
5176 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
5177 dout(20) << __func__ << " pg " << pg->first << dendl;
5178 vector<int> acting;
5179 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5180 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
5181 pg = pending_creates_from_osd.erase(pg);
5182 do_sub_pg_creates = true;
5183 spare_pgs--;
5184 }
5185 have_pending_creates = (pending_creates_from_mon > 0 ||
5186 !pending_creates_from_osd.empty());
5187 }
5188
5189 bool do_renew_subs = false;
5190 if (do_sub_pg_creates) {
5191 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5192 dout(4) << __func__ << ": resolicit pg creates from mon since "
5193 << last_pg_create_epoch << dendl;
5194 do_renew_subs = true;
5195 }
5196 }
5197 version_t start = get_osdmap_epoch() + 1;
5198 if (have_pending_creates) {
5199 // don't miss any new osdmap deleting PGs
5200 if (monc->sub_want("osdmap", start, 0)) {
5201 dout(4) << __func__ << ": resolicit osdmap from mon since "
5202 << start << dendl;
5203 do_renew_subs = true;
5204 }
5205 } else if (do_sub_pg_creates) {
5206 // no need to subscribe the osdmap continuously anymore
5207 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5208 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
5209 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
5210 << start << dendl;
5211 do_renew_subs = true;
5212 }
5213 }
5214
5215 if (do_renew_subs) {
5216 monc->renew_subs();
5217 }
5218
5219 service.send_pg_temp();
5220 }
5221
5222 void OSD::build_initial_pg_history(
5223 spg_t pgid,
5224 epoch_t created,
5225 utime_t created_stamp,
5226 pg_history_t *h,
5227 PastIntervals *pi)
5228 {
5229 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
5230 *h = pg_history_t(created, created_stamp);
5231
5232 OSDMapRef lastmap = service.get_map(created);
5233 int up_primary, acting_primary;
5234 vector<int> up, acting;
5235 lastmap->pg_to_up_acting_osds(
5236 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5237
5238 ostringstream debug;
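// Walk forward through every map from creation to the present; whenever
// check_new_interval() detects a mapping change, record a new past interval
// and update the relevant same_*_since epochs.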
5239 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
5240 OSDMapRef osdmap = service.get_map(e);
5241 int new_up_primary, new_acting_primary;
5242 vector<int> new_up, new_acting;
5243 osdmap->pg_to_up_acting_osds(
5244 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5245
5246 // this is a bit imprecise, but sufficient?
5247 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5248 const pg_pool_t *pi;
5249 bool operator()(const set<pg_shard_t> &have) const {
5250 return have.size() >= pi->min_size;
5251 }
5252 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
5253 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5254
5255 bool new_interval = PastIntervals::check_new_interval(
5256 acting_primary,
5257 new_acting_primary,
5258 acting, new_acting,
5259 up_primary,
5260 new_up_primary,
5261 up, new_up,
5262 h->same_interval_since,
5263 h->last_epoch_clean,
5264 osdmap.get(),
5265 lastmap.get(),
5266 pgid.pgid,
5267 min_size_predicate,
5268 pi,
5269 &debug);
5270 if (new_interval) {
5271 h->same_interval_since = e;
5272 if (up != new_up) {
5273 h->same_up_since = e;
5274 }
5275 if (acting_primary != new_acting_primary) {
5276 h->same_primary_since = e;
5277 }
5278 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5279 osdmap->get_pg_num(pgid.pgid.pool()),
5280 nullptr)) {
5281 h->last_epoch_split = e;
5282 }
5283 up = new_up;
5284 acting = new_acting;
5285 up_primary = new_up_primary;
5286 acting_primary = new_acting_primary;
5287 }
5288 lastmap = osdmap;
5289 }
5290 dout(20) << __func__ << " " << debug.str() << dendl;
5291 dout(10) << __func__ << " " << *h << " " << *pi
5292 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5293 pi->get_bounds()) << ")"
5294 << dendl;
5295 }
5296
5297 void OSD::_add_heartbeat_peer(int p)
5298 {
5299 if (p == whoami)
5300 return;
5301 HeartbeatInfo *hi;
5302
5303 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5304 if (i == heartbeat_peers.end()) {
5305 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5306 if (!cons.first)
5307 return;
5308 ceph_assert(cons.second);
5309
5310 hi = &heartbeat_peers[p];
5311 hi->peer = p;
5312
5313 auto stamps = service.get_hb_stamps(p);
5314
5315 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5316 sb->peer = p;
5317 sb->stamps = stamps;
5318 hi->hb_interval_start = ceph_clock_now();
5319 hi->con_back = cons.first.get();
5320 hi->con_back->set_priv(sb);
5321
5322 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5323 sf->peer = p;
5324 sf->stamps = stamps;
5325 hi->con_front = cons.second.get();
5326 hi->con_front->set_priv(sf);
5327
5328 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5329 << " " << hi->con_back->get_peer_addr()
5330 << " " << hi->con_front->get_peer_addr()
5331 << dendl;
5332 } else {
5333 hi = &i->second;
5334 }
5335 hi->epoch = get_osdmap_epoch();
5336 }
5337
5338 void OSD::_remove_heartbeat_peer(int n)
5339 {
5340 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5341 ceph_assert(q != heartbeat_peers.end());
5342 dout(20) << " removing heartbeat peer osd." << n
5343 << " " << q->second.con_back->get_peer_addr()
5344 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5345 << dendl;
5346 q->second.clear_mark_down();
5347 heartbeat_peers.erase(q);
5348 }
5349
5350 void OSD::need_heartbeat_peer_update()
5351 {
5352 if (is_stopping())
5353 return;
5354 dout(20) << "need_heartbeat_peer_update" << dendl;
5355 heartbeat_set_peers_need_update();
5356 }
5357
5358 void OSD::maybe_update_heartbeat_peers()
5359 {
5360 ceph_assert(ceph_mutex_is_locked(osd_lock));
5361
5362 if (is_waiting_for_healthy() || is_active()) {
5363 utime_t now = ceph_clock_now();
5364 if (last_heartbeat_resample == utime_t()) {
5365 last_heartbeat_resample = now;
5366 heartbeat_set_peers_need_update();
5367 } else if (!heartbeat_peers_need_update()) {
5368 utime_t dur = now - last_heartbeat_resample;
5369 if (dur > cct->_conf->osd_heartbeat_grace) {
5370 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5371 heartbeat_set_peers_need_update();
5372 last_heartbeat_resample = now;
5373 // automatically clean up any stale heartbeat peers
5374 // if we are unhealthy, then clean all
5375 reset_heartbeat_peers(is_waiting_for_healthy());
5376 }
5377 }
5378 }
5379
5380 if (!heartbeat_peers_need_update())
5381 return;
5382 heartbeat_clear_peers_need_update();
5383
5384 std::lock_guard l(heartbeat_lock);
5385
5386 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5387
5388
5389 // build heartbeat from set
5390 if (is_active()) {
5391 vector<PGRef> pgs;
5392 _get_pgs(&pgs);
5393 for (auto& pg : pgs) {
5394 pg->with_heartbeat_peers([&](int peer) {
5395 if (get_osdmap()->is_up(peer)) {
5396 _add_heartbeat_peer(peer);
5397 }
5398 });
5399 }
5400 }
5401
5402 // include next and previous up osds to ensure we have a fully-connected set
5403 set<int> want, extras;
5404 const int next = get_osdmap()->get_next_up_osd_after(whoami);
5405 if (next >= 0)
5406 want.insert(next);
5407 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
5408 if (prev >= 0 && prev != next)
5409 want.insert(prev);
5410
5411 // make sure we have at least **min_down** osds coming from different
5412 // subtree level (e.g., hosts) for fast failure detection.
5413 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5414 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5415 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5416 get_osdmap()->get_random_up_osds_by_subtree(
5417 whoami, subtree, limit, want, &want);
5418
5419 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5420 dout(10) << " adding neighbor peer osd." << *p << dendl;
5421 extras.insert(*p);
5422 _add_heartbeat_peer(*p);
5423 }
5424
5425 // remove down peers; enumerate extras
5426 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5427 while (p != heartbeat_peers.end()) {
5428 if (!get_osdmap()->is_up(p->first)) {
5429 int o = p->first;
5430 ++p;
5431 _remove_heartbeat_peer(o);
5432 continue;
5433 }
5434 if (p->second.epoch < get_osdmap_epoch()) {
5435 extras.insert(p->first);
5436 }
5437 ++p;
5438 }
5439
5440 // too few?
5441 for (int n = next; n >= 0; ) {
5442 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5443 break;
5444 if (!extras.count(n) && !want.count(n) && n != whoami) {
5445 dout(10) << " adding random peer osd." << n << dendl;
5446 extras.insert(n);
5447 _add_heartbeat_peer(n);
5448 }
5449 n = get_osdmap()->get_next_up_osd_after(n);
5450 if (n == next)
5451 break; // came full circle; stop
5452 }
5453
5454 // too many?
5455 for (set<int>::iterator p = extras.begin();
5456 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5457 ++p) {
5458 if (want.count(*p))
5459 continue;
5460 _remove_heartbeat_peer(*p);
5461 }
5462
5463 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5464
5465 // clean up stale failure pending
5466 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5467 if (heartbeat_peers.count(it->first) == 0) {
5468 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5469 failure_pending.erase(it++);
5470 } else {
5471 it++;
5472 }
5473 }
5474 }
5475
5476 void OSD::reset_heartbeat_peers(bool all)
5477 {
5478 ceph_assert(ceph_mutex_is_locked(osd_lock));
5479 dout(10) << "reset_heartbeat_peers" << dendl;
5480 utime_t stale = ceph_clock_now();
5481 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5482 std::lock_guard l(heartbeat_lock);
5483 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5484 auto& [peer, hi] = *it;
5485 if (all || hi.is_stale(stale)) {
5486 hi.clear_mark_down();
5487 // stop sending failure_report to mon too
5488 failure_queue.erase(peer);
5489 failure_pending.erase(peer);
5490 it = heartbeat_peers.erase(it);
5491 } else {
5492 ++it;
5493 }
5494 }
5495 }
5496
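// handle_osd_ping: heartbeat protocol handler, run under heartbeat_lock.
//  - PING: update the shared clock-delta stamps and answer with PING_REPLY
//    (or with YOU_DIED if our map shows the sender marked down in a newer
//    epoch than it has seen).
//  - PING_REPLY: record last_rx_back/last_rx_front, retire acked entries
//    from ping_history, maintain ping-time statistics, and cancel any
//    queued or in-flight failure report if the peer now looks healthy.
//  - YOU_DIED: a peer thinks we are dead; subscribe to newer osdmaps.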
5497 void OSD::handle_osd_ping(MOSDPing *m)
5498 {
5499 if (superblock.cluster_fsid != m->fsid) {
5500 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5501 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5502 << dendl;
5503 m->put();
5504 return;
5505 }
5506
5507 int from = m->get_source().num();
5508
5509 heartbeat_lock.lock();
5510 if (is_stopping()) {
5511 heartbeat_lock.unlock();
5512 m->put();
5513 return;
5514 }
5515
5516 utime_t now = ceph_clock_now();
5517 auto mnow = service.get_mnow();
5518 ConnectionRef con(m->get_connection());
5519 OSDMapRef curmap = service.get_osdmap();
5520 if (!curmap) {
5521 heartbeat_lock.unlock();
5522 m->put();
5523 return;
5524 }
5525
5526 auto sref = con->get_priv();
5527 Session *s = static_cast<Session*>(sref.get());
5528 if (!s) {
5529 heartbeat_lock.unlock();
5530 m->put();
5531 return;
5532 }
5533 if (!s->stamps) {
5534 s->peer = from;
5535 s->stamps = service.get_hb_stamps(from);
5536 }
5537
5538 switch (m->op) {
5539
5540 case MOSDPing::PING:
5541 {
5542 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5543 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5544 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5545 if (heartbeat_drop->second == 0) {
5546 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5547 } else {
5548 --heartbeat_drop->second;
5549 dout(5) << "Dropping heartbeat from " << from
5550 << ", " << heartbeat_drop->second
5551 << " remaining to drop" << dendl;
5552 break;
5553 }
5554 } else if (cct->_conf->osd_debug_drop_ping_probability >
5555 ((((double)(rand()%100))/100.0))) {
5556 heartbeat_drop =
5557 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5558 cct->_conf->osd_debug_drop_ping_duration)).first;
5559 dout(5) << "Dropping heartbeat from " << from
5560 << ", " << heartbeat_drop->second
5561 << " remaining to drop" << dendl;
5562 break;
5563 }
5564 }
5565
5566 ceph::signedspan sender_delta_ub{};
5567 s->stamps->got_ping(
5568 m->up_from,
5569 mnow,
5570 m->mono_send_stamp,
5571 m->delta_ub,
5572 &sender_delta_ub);
5573 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5574
5575 if (!cct->get_heartbeat_map()->is_healthy()) {
5576 dout(10) << "internal heartbeat not healthy, dropping ping request"
5577 << dendl;
5578 break;
5579 }
5580
5581 Message *r = new MOSDPing(monc->get_fsid(),
5582 curmap->get_epoch(),
5583 MOSDPing::PING_REPLY,
5584 m->ping_stamp,
5585 m->mono_ping_stamp,
5586 mnow,
5587 service.get_up_epoch(),
5588 cct->_conf->osd_heartbeat_min_size,
5589 sender_delta_ub);
5590 con->send_message(r);
5591
5592 if (curmap->is_up(from)) {
5593 if (is_active()) {
5594 ConnectionRef cluster_con = service.get_con_osd_cluster(
5595 from, curmap->get_epoch());
5596 if (cluster_con) {
5597 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5598 }
5599 }
5600 } else if (!curmap->exists(from) ||
5601 curmap->get_down_at(from) > m->map_epoch) {
5602 // tell them they have died
5603 Message *r = new MOSDPing(monc->get_fsid(),
5604 curmap->get_epoch(),
5605 MOSDPing::YOU_DIED,
5606 m->ping_stamp,
5607 m->mono_ping_stamp,
5608 mnow,
5609 service.get_up_epoch(),
5610 cct->_conf->osd_heartbeat_min_size);
5611 con->send_message(r);
5612 }
5613 }
5614 break;
5615
5616 case MOSDPing::PING_REPLY:
5617 {
5618 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5619 if (i != heartbeat_peers.end()) {
5620 auto acked = i->second.ping_history.find(m->ping_stamp);
5621 if (acked != i->second.ping_history.end()) {
5622 int &unacknowledged = acked->second.second;
5623 if (con == i->second.con_back) {
5624 dout(25) << "handle_osd_ping got reply from osd." << from
5625 << " first_tx " << i->second.first_tx
5626 << " last_tx " << i->second.last_tx
5627 << " last_rx_back " << i->second.last_rx_back
5628 << " -> " << now
5629 << " last_rx_front " << i->second.last_rx_front
5630 << dendl;
5631 i->second.last_rx_back = now;
5632 ceph_assert(unacknowledged > 0);
5633 --unacknowledged;
5634 // if there is no front con, set both stamps.
5635 if (i->second.con_front == NULL) {
5636 i->second.last_rx_front = now;
5637 ceph_assert(unacknowledged > 0);
5638 --unacknowledged;
5639 }
5640 } else if (con == i->second.con_front) {
5641 dout(25) << "handle_osd_ping got reply from osd." << from
5642 << " first_tx " << i->second.first_tx
5643 << " last_tx " << i->second.last_tx
5644 << " last_rx_back " << i->second.last_rx_back
5645 << " last_rx_front " << i->second.last_rx_front
5646 << " -> " << now
5647 << dendl;
5648 i->second.last_rx_front = now;
5649 ceph_assert(unacknowledged > 0);
5650 --unacknowledged;
5651 }
5652
5653 if (unacknowledged == 0) {
5654 // succeeded in getting all replies
5655 dout(25) << "handle_osd_ping got all replies from osd." << from
5656 << " , erase pending ping(sent at " << m->ping_stamp << ")"
5657 << " and older pending ping(s)"
5658 << dendl;
5659
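// convert a utime_t difference (seconds, as a double) to rounded
// microseconds; e.g. ROUND_S_TO_USEC(0.0123) == 12300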
5660 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5661 ++i->second.hb_average_count;
5662 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5663 i->second.hb_total_back += back_pingtime;
5664 if (back_pingtime < i->second.hb_min_back)
5665 i->second.hb_min_back = back_pingtime;
5666 if (back_pingtime > i->second.hb_max_back)
5667 i->second.hb_max_back = back_pingtime;
5668 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5669 i->second.hb_total_front += front_pingtime;
5670 if (front_pingtime < i->second.hb_min_front)
5671 i->second.hb_min_front = front_pingtime;
5672 if (front_pingtime > i->second.hb_max_front)
5673 i->second.hb_max_front = front_pingtime;
5674
5675 ceph_assert(i->second.hb_interval_start != utime_t());
5676 if (i->second.hb_interval_start == utime_t())
5677 i->second.hb_interval_start = now;
5678 int64_t hb_avg_time_period = 60;
5679 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5680 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5681 }
5682 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5683 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5684 uint32_t back_min = i->second.hb_min_back;
5685 uint32_t back_max = i->second.hb_max_back;
5686 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5687 uint32_t front_min = i->second.hb_min_front;
5688 uint32_t front_max = i->second.hb_max_front;
5689
5690 // Reset for new interval
5691 i->second.hb_average_count = 0;
5692 i->second.hb_interval_start = now;
5693 i->second.hb_total_back = i->second.hb_max_back = 0;
5694 i->second.hb_min_back = UINT_MAX;
5695 i->second.hb_total_front = i->second.hb_max_front = 0;
5696 i->second.hb_min_front = UINT_MAX;
5697
5698 // Record per-OSD interface ping times,
5699 // based on osd_heartbeat_interval (ignoring that the actual interval is randomly shorter)
5700 if (i->second.hb_back_pingtime.size() == 0) {
5701 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5702 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5703 i->second.hb_back_pingtime.push_back(back_avg);
5704 i->second.hb_back_min.push_back(back_min);
5705 i->second.hb_back_max.push_back(back_max);
5706 i->second.hb_front_pingtime.push_back(front_avg);
5707 i->second.hb_front_min.push_back(front_min);
5708 i->second.hb_front_max.push_back(front_max);
5709 ++i->second.hb_index;
5710 }
5711 } else {
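// hb_index wraps via a bitmask, which assumes hb_vector_size is a
// power of two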
5712 int index = i->second.hb_index & (hb_vector_size - 1);
5713 i->second.hb_back_pingtime[index] = back_avg;
5714 i->second.hb_back_min[index] = back_min;
5715 i->second.hb_back_max[index] = back_max;
5716 i->second.hb_front_pingtime[index] = front_avg;
5717 i->second.hb_front_min[index] = front_min;
5718 i->second.hb_front_max[index] = front_max;
5719 ++i->second.hb_index;
5720 }
5721
5722 {
5723 std::lock_guard l(service.stat_lock);
5724 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5725 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5726
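// walk the ring newest-to-oldest, publishing rolling averages over the
// last 1, 5, and 15 intervals (load-average style)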
5727 uint32_t total = 0;
5728 uint32_t min = UINT_MAX;
5729 uint32_t max = 0;
5730 uint32_t count = 0;
5731 uint32_t which = 0;
5732 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5733 for (int32_t k = size - 1 ; k >= 0; --k) {
5734 ++count;
5735 int index = (i->second.hb_index + k) % size;
5736 total += i->second.hb_back_pingtime[index];
5737 if (i->second.hb_back_min[index] < min)
5738 min = i->second.hb_back_min[index];
5739 if (i->second.hb_back_max[index] > max)
5740 max = i->second.hb_back_max[index];
5741 if (count == 1 || count == 5 || count == 15) {
5742 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5743 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5744 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5745 which++;
5746 if (count == 15)
5747 break;
5748 }
5749 }
5750
5751 if (i->second.con_front != NULL) {
5752 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5753
5754 total = 0;
5755 min = UINT_MAX;
5756 max = 0;
5757 count = 0;
5758 which = 0;
5759 for (int32_t k = size - 1 ; k >= 0; --k) {
5760 ++count;
5761 int index = (i->second.hb_index + k) % size;
5762 total += i->second.hb_front_pingtime[index];
5763 if (i->second.hb_front_min[index] < min)
5764 min = i->second.hb_front_min[index];
5765 if (i->second.hb_front_max[index] > max)
5766 max = i->second.hb_front_max[index];
5767 if (count == 1 || count == 5 || count == 15) {
5768 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5769 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5770 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5771 which++;
5772 if (count == 15)
5773 break;
5774 }
5775 }
5776 }
5777 }
5778 } else {
5779 std::lock_guard l(service.stat_lock);
5780 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5781 if (i->second.con_front != NULL)
5782 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5783 }
5784 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5785 }
5786
5787 if (i->second.is_healthy(now)) {
5788 // Cancel false reports
5789 auto failure_queue_entry = failure_queue.find(from);
5790 if (failure_queue_entry != failure_queue.end()) {
5791 dout(10) << "handle_osd_ping canceling queued "
5792 << "failure report for osd." << from << dendl;
5793 failure_queue.erase(failure_queue_entry);
5794 }
5795
5796 auto failure_pending_entry = failure_pending.find(from);
5797 if (failure_pending_entry != failure_pending.end()) {
5798 dout(10) << "handle_osd_ping canceling in-flight "
5799 << "failure report for osd." << from << dendl;
5800 send_still_alive(curmap->get_epoch(),
5801 from,
5802 failure_pending_entry->second.second);
5803 failure_pending.erase(failure_pending_entry);
5804 }
5805 }
5806 } else {
5807 // old replies, deprecated by newly sent pings.
5808 dout(10) << "handle_osd_ping no pending ping (sent at " << m->ping_stamp
5809 << ") found; treating it as covered by newly sent pings "
5810 << "and ignoring it"
5811 << dendl;
5812 }
5813 }
5814
5815 if (m->map_epoch &&
5816 curmap->is_up(from)) {
5817 if (is_active()) {
5818 ConnectionRef cluster_con = service.get_con_osd_cluster(
5819 from, curmap->get_epoch());
5820 if (cluster_con) {
5821 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5822 }
5823 }
5824 }
5825
5826 s->stamps->got_ping_reply(
5827 mnow,
5828 m->mono_send_stamp,
5829 m->delta_ub);
5830 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5831 }
5832 break;
5833
5834 case MOSDPing::YOU_DIED:
5835 dout(10) << "handle_osd_ping " << m->get_source_inst()
5836 << " says i am down in " << m->map_epoch << dendl;
5837 osdmap_subscribe(curmap->get_epoch()+1, false);
5838 break;
5839 }
5840
5841 heartbeat_lock.unlock();
5842 m->put();
5843 }
5844
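// heartbeat_entry: dedicated thread that sends a round of pings, then
// sleeps for a randomized interval.  With
//   wait = 0.5 + ((rand() % 10) / 10.0) * osd_heartbeat_interval
// the sleep falls in [0.5, 0.5 + 0.9 * interval] seconds (e.g. [0.5s, 5.9s]
// for an interval of 6), which keeps different OSDs' pings from
// synchronizing.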
5845 void OSD::heartbeat_entry()
5846 {
5847 std::unique_lock l(heartbeat_lock);
5848 if (is_stopping())
5849 return;
5850 while (!heartbeat_stop) {
5851 heartbeat();
5852
5853 double wait;
5854 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5855 wait = (float)cct->_conf->osd_heartbeat_interval;
5856 } else {
5857 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5858 }
5859 auto w = ceph::make_timespan(wait);
5860 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5861 heartbeat_cond.wait_for(l, w);
5862 if (is_stopping())
5863 return;
5864 dout(30) << "heartbeat_entry woke up" << dendl;
5865 }
5866 }
5867
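// heartbeat_check: scan every peer we have pinged and queue a failure
// report for any unhealthy one, i.e. one whose oldest outstanding ping has
// passed its deadline (send time + osd_heartbeat_grace) without replies on
// both the front and back channels.  The failure time recorded is first_tx
// if the peer never replied at all, and otherwise the older of the two
// last-reply stamps.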
5868 void OSD::heartbeat_check()
5869 {
5870 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5871 utime_t now = ceph_clock_now();
5872
5873 // check for incoming heartbeats (move me elsewhere?)
5874 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5875 p != heartbeat_peers.end();
5876 ++p) {
5877
5878 if (p->second.first_tx == utime_t()) {
5879 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5880 << " yet, skipping" << dendl;
5881 continue;
5882 }
5883
5884 dout(25) << "heartbeat_check osd." << p->first
5885 << " first_tx " << p->second.first_tx
5886 << " last_tx " << p->second.last_tx
5887 << " last_rx_back " << p->second.last_rx_back
5888 << " last_rx_front " << p->second.last_rx_front
5889 << dendl;
5890 if (p->second.is_unhealthy(now)) {
5891 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5892 if (p->second.last_rx_back == utime_t() ||
5893 p->second.last_rx_front == utime_t()) {
5894 derr << "heartbeat_check: no reply from "
5895 << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t())
5896 << " osd." << p->first
5897 << " ever on either front or back, first ping sent "
5898 << p->second.first_tx
5899 << " (oldest deadline " << oldest_deadline << ")"
5900 << dendl;
5901 // fail
5902 failure_queue[p->first] = p->second.first_tx;
5903 } else {
5904 derr << "heartbeat_check: no reply from "
5905 << (p->second.con_front ? p->second.con_front->get_peer_addr() : entity_addr_t())
5906 << " osd." << p->first << " since back " << p->second.last_rx_back
5907 << " front " << p->second.last_rx_front
5908 << " (oldest deadline " << oldest_deadline << ")"
5909 << dendl;
5910 // fail
5911 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5912 }
5913 }
5914 }
5915 }
5916
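// heartbeat: one heartbeat cycle, called with heartbeat_lock held.  It
// refreshes the scrub load average and osd_stat (including fullness), then
// sends a MOSDPing::PING on the back (and, when present, the front)
// connection of every peer, recording a deadline of now +
// osd_heartbeat_grace in ping_history for heartbeat_check() to test
// against.  With no peers at all, it periodically asks the mon for a newer
// map instead.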
5917 void OSD::heartbeat()
5918 {
5919 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5920 dout(30) << "heartbeat" << dendl;
5921
5922 auto load_for_logger = service.get_scrub_services().update_load_average();
5923 if (load_for_logger) {
5924 logger->set(l_osd_loadavg, load_for_logger.value());
5925 }
5926 dout(30) << "heartbeat checking stats" << dendl;
5927
5928 // refresh peer list and osd stats
5929 vector<int> hb_peers;
5930 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5931 p != heartbeat_peers.end();
5932 ++p)
5933 hb_peers.push_back(p->first);
5934
5935 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5936 dout(5) << __func__ << " " << new_stat << dendl;
5937 ceph_assert(new_stat.statfs.total);
5938
5939 float pratio;
5940 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5941
5942 service.check_full_status(ratio, pratio);
5943
5944 utime_t now = ceph_clock_now();
5945 auto mnow = service.get_mnow();
5946 utime_t deadline = now;
5947 deadline += cct->_conf->osd_heartbeat_grace;
5948
5949 // send heartbeats
5950 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5951 i != heartbeat_peers.end();
5952 ++i) {
5953 int peer = i->first;
5954 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5955 if (!s) {
5956 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5957 continue;
5958 }
5959 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5960
5961 i->second.last_tx = now;
5962 if (i->second.first_tx == utime_t())
5963 i->second.first_tx = now;
5964 i->second.ping_history[now] = make_pair(deadline,
5965 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5966 if (i->second.hb_interval_start == utime_t())
5967 i->second.hb_interval_start = now;
5968
5969 std::optional<ceph::signedspan> delta_ub;
5970 s->stamps->sent_ping(&delta_ub);
5971
5972 i->second.con_back->send_message(
5973 new MOSDPing(monc->get_fsid(),
5974 service.get_osdmap_epoch(),
5975 MOSDPing::PING,
5976 now,
5977 mnow,
5978 mnow,
5979 service.get_up_epoch(),
5980 cct->_conf->osd_heartbeat_min_size,
5981 delta_ub));
5982
5983 if (i->second.con_front)
5984 i->second.con_front->send_message(
5985 new MOSDPing(monc->get_fsid(),
5986 service.get_osdmap_epoch(),
5987 MOSDPing::PING,
5988 now,
5989 mnow,
5990 mnow,
5991 service.get_up_epoch(),
5992 cct->_conf->osd_heartbeat_min_size,
5993 delta_ub));
5994 }
5995
5996 logger->set(l_osd_hb_to, heartbeat_peers.size());
5997
5998 // hmm.. am i all alone?
5999 dout(30) << "heartbeat lonely?" << dendl;
6000 if (heartbeat_peers.empty()) {
6001 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
6002 last_mon_heartbeat = now;
6003 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
6004 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6005 }
6006 }
6007
6008 dout(30) << "heartbeat done" << dendl;
6009 }
6010
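// heartbeat_reset: a heartbeat connection has been reset.  If it belongs to
// a live peer, reopen both channels at the epoch recorded for that peer and
// transplant the existing Session onto the new connections; if the reset
// raced with an osdmap update and no new connection can be obtained, drop
// the peer entirely.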
6011 bool OSD::heartbeat_reset(Connection *con)
6012 {
6013 std::lock_guard l(heartbeat_lock);
6014 auto s = con->get_priv();
6015 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
6016 con->set_priv(nullptr);
6017 if (s) {
6018 if (is_stopping()) {
6019 return true;
6020 }
6021 auto session = static_cast<Session*>(s.get());
6022 auto p = heartbeat_peers.find(session->peer);
6023 if (p != heartbeat_peers.end() &&
6024 (p->second.con_back == con ||
6025 p->second.con_front == con)) {
6026 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6027 << ", reopening" << dendl;
6028 p->second.clear_mark_down(con);
6029 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
6030 if (newcon.first) {
6031 p->second.con_back = newcon.first.get();
6032 p->second.con_back->set_priv(s);
6033 if (newcon.second) {
6034 p->second.con_front = newcon.second.get();
6035 p->second.con_front->set_priv(s);
6036 }
6037 p->second.ping_history.clear();
6038 } else {
6039 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6040 << ", raced with osdmap update, closing out peer" << dendl;
6041 heartbeat_peers.erase(p);
6042 }
6043 } else {
6044 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
6045 }
6046 }
6047 return true;
6048 }
6049
6050
6051
6052 // =========================================
6053
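// tick: periodic housekeeping, run under osd_lock and rescheduled at the
// end of every call.  It prunes the markdown log, refreshes heartbeat
// peers, retries boot while waiting_for_healthy, nudges the mon for a new
// map while booting, drains waiters, and schedules the purged_snaps scrub.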
6054 void OSD::tick()
6055 {
6056 ceph_assert(ceph_mutex_is_locked(osd_lock));
6057 dout(10) << "tick" << dendl;
6058
6059 utime_t now = ceph_clock_now();
6060 // throw out any obsolete markdown log entries
6061 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6062 while (!osd_markdown_log.empty() &&
6063 osd_markdown_log.front() + grace < now)
6064 osd_markdown_log.pop_front();
6065
6066 if (is_active() || is_waiting_for_healthy()) {
6067 maybe_update_heartbeat_peers();
6068 }
6069
6070 if (is_waiting_for_healthy()) {
6071 start_boot();
6072 }
6073
6074 if (is_waiting_for_healthy() || is_booting()) {
6075 std::lock_guard l(heartbeat_lock);
6076 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
6077 last_mon_heartbeat = now;
6078 dout(1) << __func__ << " checking mon for new map" << dendl;
6079 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6080 }
6081 }
6082
6083 do_waiters();
6084
6085 // scrub purged_snaps periodically (once per osd_scrub_min_interval, randomized below)
6086 {
6087 const utime_t last = superblock.last_purged_snaps_scrub;
6088 utime_t next = last;
6089 next += cct->_conf->osd_scrub_min_interval;
6090 std::mt19937 rng;
6091 // use a seed that is stable for each scrub interval, but varies
6092 // by OSD, to avoid a thundering herd of scrubs.
6093 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
6094 double r = (rng() % 1024) / 1024.0;
6095 next +=
6096 cct->_conf->osd_scrub_min_interval *
6097 cct->_conf->osd_scrub_interval_randomize_ratio * r;
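// e.g., with osd_scrub_min_interval = 86400 (one day) and
// osd_scrub_interval_randomize_ratio = 0.5, the next scrub lands
// roughly in [last + 1.0 day, last + 1.5 days)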
6098 if (next < ceph_clock_now()) {
6099 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6100 << " next " << next << " ... now" << dendl;
6101 scrub_purged_snaps();
6102 } else {
6103 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6104 << " next " << next << dendl;
6105 }
6106 }
6107
6108 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
6109 }
6110
6111 void OSD::tick_without_osd_lock()
6112 {
6113 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
6114 dout(10) << "tick_without_osd_lock" << dendl;
6115
6116 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
6117 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
6118 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
6119
6120 // refresh osd stats
6121 struct store_statfs_t stbuf;
6122 osd_alert_list_t alerts;
6123 int r = store->statfs(&stbuf, &alerts);
6124 ceph_assert(r == 0);
6125 service.set_statfs(stbuf, alerts);
6126
6127 // osd_lock is not being held, which means the OSD state
6128 // might change when doing the monitor report
6129 if (is_active() || is_waiting_for_healthy()) {
6130 {
6131 std::lock_guard l{heartbeat_lock};
6132 heartbeat_check();
6133 }
6134 map_lock.lock_shared();
6135 std::lock_guard l(mon_report_lock);
6136
6137 // mon report?
6138 utime_t now = ceph_clock_now();
6139 if (service.need_fullness_update() ||
6140 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
6141 last_mon_report = now;
6142 send_full_update();
6143 send_failures();
6144 }
6145 map_lock.unlock_shared();
6146
6147 epoch_t max_waiting_epoch = 0;
6148 for (auto s : shards) {
6149 max_waiting_epoch = std::max(max_waiting_epoch,
6150 s->get_max_waiting_epoch());
6151 }
6152 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6153 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6154 << ", requesting new map" << dendl;
6155 osdmap_subscribe(superblock.newest_map + 1, false);
6156 }
6157 }
6158
6159 if (is_active()) {
6160 if (!scrub_random_backoff()) {
6161 sched_scrub();
6162 }
6163 service.promote_throttle_recalibrate();
6164 resume_creating_pg();
6165 bool need_send_beacon = false;
6166 const auto now = ceph::coarse_mono_clock::now();
6167 {
6168 // borrow the lec lock to protect last_sent_beacon from changing
6169 std::lock_guard l{min_last_epoch_clean_lock};
6170 const auto elapsed = now - last_sent_beacon;
6171 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
6172 cct->_conf->osd_beacon_report_interval) {
6173 need_send_beacon = true;
6174 }
6175 }
6176 if (need_send_beacon) {
6177 send_beacon(now);
6178 }
6179 }
6180
6181 mgrc.update_daemon_health(get_health_metrics());
6182 service.kick_recovery_queue();
6183 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6184 new C_Tick_WithoutOSDLock(this));
6185 }
6186
6187 // Usage:
6188 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6189 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6190 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6191 // getomap <pool> [namespace/]<obj-name>
6192 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6193 // injectmdataerr [namespace/]<obj-name> [shardid]
6194 // injectdataerr [namespace/]<obj-name> [shardid]
6195 //
6196 // set_recovery_delay [utime]
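//
// These are registered as admin socket commands; an invocation might look
// like (illustrative only, exact syntax depends on how the hook is
// registered):
// ceph daemon osd.0 setomapval 1 myobject mykey myval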
6197 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6198 std::string_view command,
6199 const cmdmap_t& cmdmap, ostream &ss)
6200 {
6201 // Test support:
6202 // support changing the omap on a single osd by using the admin socket to
6203 // directly request that the osd make a change.
6204 if (command == "setomapval" || command == "rmomapkey" ||
6205 command == "setomapheader" || command == "getomap" ||
6206 command == "truncobj" || command == "injectmdataerr" ||
6207 command == "injectdataerr"
6208 ) {
6209 pg_t rawpg;
6210 int64_t pool;
6211 OSDMapRef curmap = service->get_osdmap();
6212 int r = -1;
6213
6214 string poolstr;
6215
6216 cmd_getval(cmdmap, "pool", poolstr);
6217 pool = curmap->lookup_pg_pool_name(poolstr);
6218 // if we can't find it by name, then maybe an id was specified
6219 if (pool < 0 && isdigit(poolstr[0]))
6220 pool = atoll(poolstr.c_str());
6221 if (pool < 0) {
6222 ss << "Invalid pool '" << poolstr << "''";
6223 return;
6224 }
6225
6226 string objname, nspace;
6227 cmd_getval(cmdmap, "objname", objname);
6228 std::size_t found = objname.find_first_of('/');
6229 if (found != string::npos) {
6230 nspace = objname.substr(0, found);
6231 objname = objname.substr(found+1);
6232 }
6233 object_locator_t oloc(pool, nspace);
6234 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6235
6236 if (r < 0) {
6237 ss << "Invalid namespace/objname";
6238 return;
6239 }
6240
6241 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
6242 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6243 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6244 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6245 if (curmap->pg_is_ec(rawpg)) {
6246 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6247 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6248 return;
6249 }
6250 }
6251
6252 ObjectStore::Transaction t;
6253
6254 if (command == "setomapval") {
6255 map<string, bufferlist> newattrs;
6256 bufferlist val;
6257 string key, valstr;
6258 cmd_getval(cmdmap, "key", key);
6259 cmd_getval(cmdmap, "val", valstr);
6260
6261 val.append(valstr);
6262 newattrs[key] = val;
6263 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6264 r = store->queue_transaction(service->meta_ch, std::move(t));
6265 if (r < 0)
6266 ss << "error=" << r;
6267 else
6268 ss << "ok";
6269 } else if (command == "rmomapkey") {
6270 string key;
6271 cmd_getval(cmdmap, "key", key);
6272
6273 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6274 r = store->queue_transaction(service->meta_ch, std::move(t));
6275 if (r < 0)
6276 ss << "error=" << r;
6277 else
6278 ss << "ok";
6279 } else if (command == "setomapheader") {
6280 bufferlist newheader;
6281 string headerstr;
6282
6283 cmd_getval(cmdmap, "header", headerstr);
6284 newheader.append(headerstr);
6285 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6286 r = store->queue_transaction(service->meta_ch, std::move(t));
6287 if (r < 0)
6288 ss << "error=" << r;
6289 else
6290 ss << "ok";
6291 } else if (command == "getomap") {
6292 // Debug: output the entire omap
6293 bufferlist hdrbl;
6294 map<string, bufferlist> keyvals;
6295 auto ch = store->open_collection(coll_t(pgid));
6296 if (!ch) {
6297 ss << "unable to open collection for " << pgid;
6298 r = -ENOENT;
6299 } else {
6300 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6301 if (r >= 0) {
6302 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6303 for (map<string, bufferlist>::iterator it = keyvals.begin();
6304 it != keyvals.end(); ++it)
6305 ss << " key=" << (*it).first << " val="
6306 << string((*it).second.c_str(), (*it).second.length());
6307 } else {
6308 ss << "error=" << r;
6309 }
6310 }
6311 } else if (command == "truncobj") {
6312 int64_t trunclen;
6313 cmd_getval(cmdmap, "len", trunclen);
6314 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6315 r = store->queue_transaction(service->meta_ch, std::move(t));
6316 if (r < 0)
6317 ss << "error=" << r;
6318 else
6319 ss << "ok";
6320 } else if (command == "injectdataerr") {
6321 store->inject_data_error(gobj);
6322 ss << "ok";
6323 } else if (command == "injectmdataerr") {
6324 store->inject_mdata_error(gobj);
6325 ss << "ok";
6326 }
6327 return;
6328 }
6329 if (command == "set_recovery_delay") {
6330 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
6331 ostringstream oss;
6332 oss << delay;
6333 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6334 oss.str().c_str());
6335 if (r != 0) {
6336 ss << "set_recovery_delay: error setting "
6337 << "osd_recovery_delay_start to '" << delay << "': error "
6338 << r;
6339 return;
6340 }
6341 service->cct->_conf.apply_changes(nullptr);
6342 ss << "set_recovery_delay: set osd_recovery_delay_start "
6343 << "to " << service->cct->_conf->osd_recovery_delay_start;
6344 return;
6345 }
6346 if (command == "injectfull") {
6347 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6348 string type = cmd_getval_or<string>(cmdmap, "type", "full");
6349 OSDService::s_names state;
6350
6351 if (type == "none" || count == 0) {
6352 type = "none";
6353 count = 0;
6354 }
6355 state = service->get_full_state(type);
6356 if (state == OSDService::s_names::INVALID) {
6357 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6358 return;
6359 }
6360 service->set_injectfull(state, count);
6361 return;
6362 }
6363 ss << "Internal error - command=" << command;
6364 }
6365
6366 // =========================================
6367
6368 void OSD::ms_handle_connect(Connection *con)
6369 {
6370 dout(10) << __func__ << " con " << con << dendl;
6371 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6372 std::lock_guard l(osd_lock);
6373 if (is_stopping())
6374 return;
6375 dout(10) << __func__ << " on mon" << dendl;
6376
6377 if (is_preboot()) {
6378 start_boot();
6379 } else if (is_booting()) {
6380 _send_boot(); // resend boot message
6381 } else {
6382 map_lock.lock_shared();
6383 std::lock_guard l2(mon_report_lock);
6384
6385 utime_t now = ceph_clock_now();
6386 last_mon_report = now;
6387
6388 // resend everything, it's a new session
6389 send_full_update();
6390 send_alive();
6391 service.requeue_pg_temp();
6392 service.clear_sent_ready_to_merge();
6393 service.send_pg_temp();
6394 service.send_ready_to_merge();
6395 service.send_pg_created();
6396 requeue_failures();
6397 send_failures();
6398
6399 map_lock.unlock_shared();
6400 if (is_active()) {
6401 send_beacon(ceph::coarse_mono_clock::now());
6402 }
6403 }
6404
6405 // full map requests may happen while active or pre-boot
6406 if (requested_full_first) {
6407 rerequest_full_maps();
6408 }
6409 }
6410 }
6411
6412 void OSD::ms_handle_fast_connect(Connection *con)
6413 {
6414 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6415 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6416 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6417 s = ceph::make_ref<Session>(cct, con);
6418 con->set_priv(s);
6419 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6420 << " addr=" << s->con->get_peer_addr() << dendl;
6421 // we don't connect to clients
6422 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6423 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6424 }
6425 }
6426 }
6427
6428 void OSD::ms_handle_fast_accept(Connection *con)
6429 {
6430 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6431 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6432 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6433 s = ceph::make_ref<Session>(cct, con);
6434 con->set_priv(s);
6435 dout(10) << "new session (incoming) " << s << " con=" << con
6436 << " addr=" << con->get_peer_addr()
6437 << " must have raced with connect" << dendl;
6438 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6439 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6440 }
6441 }
6442 }
6443
6444 bool OSD::ms_handle_reset(Connection *con)
6445 {
6446 auto session = ceph::ref_cast<Session>(con->get_priv());
6447 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
6448 if (!session)
6449 return false;
6450 session->wstate.reset(con);
6451 session->con->set_priv(nullptr);
6452 session->con.reset(); // break con <-> session ref cycle
6453 // note that we break session->con *before* the session_handle_reset
6454 // cleanup below. this avoids a race between us and
6455 // PG::add_backoff, Session::check_backoff, etc.
6456 session_handle_reset(session);
6457 return true;
6458 }
6459
6460 bool OSD::ms_handle_refused(Connection *con)
6461 {
6462 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6463 return false;
6464
6465 auto session = ceph::ref_cast<Session>(con->get_priv());
6466 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6467 if (!session)
6468 return false;
6469 int type = con->get_peer_type();
6470 // handle only OSD failures here
6471 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6472 OSDMapRef osdmap = get_osdmap();
6473 if (osdmap) {
6474 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6475 if (id >= 0 && osdmap->is_up(id)) {
6476 // We bypass the mon's heartbeat grace logic here: a refused connection
6477 // means the peer is not coming back on its own. +1 so we won't hit any boundary case.
6478 monc->send_mon_message(
6479 new MOSDFailure(
6480 monc->get_fsid(),
6481 id,
6482 osdmap->get_addrs(id),
6483 cct->_conf->osd_heartbeat_grace + 1,
6484 osdmap->get_epoch(),
6485 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6486 ));
6487 }
6488 }
6489 }
6490 return true;
6491 }
6492
6493 struct CB_OSD_GetVersion {
6494 OSD *osd;
6495 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6496 void operator ()(boost::system::error_code ec, version_t newest,
6497 version_t oldest) {
6498 if (!ec)
6499 osd->_got_mon_epochs(oldest, newest);
6500 }
6501 };
6502
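// Boot sequence: start_boot() asks the mon for the range of osdmap epochs
// it holds (via CB_OSD_GetVersion); _got_mon_epochs() hands them to
// _preboot(), which performs sanity checks and catches up on maps; once the
// preconditions hold, _send_boot() publishes our addresses in an MOSDBoot
// and moves us to STATE_BOOTING.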
6503 void OSD::start_boot()
6504 {
6505 if (!_is_healthy()) {
6506 // if we are not healthy, do not mark ourselves up (yet)
6507 dout(1) << "not healthy; waiting to boot" << dendl;
6508 if (!is_waiting_for_healthy())
6509 start_waiting_for_healthy();
6510 // send pings sooner rather than later
6511 heartbeat_kick();
6512 return;
6513 }
6514 dout(1) << __func__ << dendl;
6515 set_state(STATE_PREBOOT);
6516 dout(10) << "start_boot - have maps " << superblock.oldest_map
6517 << ".." << superblock.newest_map << dendl;
6518 monc->get_version("osdmap", CB_OSD_GetVersion(this));
6519 }
6520
6521 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6522 {
6523 std::lock_guard l(osd_lock);
6524 if (is_preboot()) {
6525 _preboot(oldest, newest);
6526 }
6527 }
6528
6529 void OSD::_preboot(epoch_t oldest, epoch_t newest)
6530 {
6531 ceph_assert(is_preboot());
6532 dout(10) << __func__ << " _preboot mon has osdmaps "
6533 << oldest << ".." << newest << dendl;
6534
6535 // ensure our local fullness awareness is accurate
6536 {
6537 std::lock_guard l(heartbeat_lock);
6538 heartbeat();
6539 }
6540
6541 const auto& monmap = monc->monmap;
6542 const auto osdmap = get_osdmap();
6543 // if our map is within recent history, try to add ourselves to the osdmap.
6544 if (osdmap->get_epoch() == 0) {
6545 derr << "waiting for initial osdmap" << dendl;
6546 } else if (osdmap->is_destroyed(whoami)) {
6547 derr << "osdmap says I am destroyed" << dendl;
6548 // provide a small margin so we don't livelock seeing if we
6549 // un-destroyed ourselves.
6550 if (osdmap->get_epoch() > newest - 1) {
6551 exit(0);
6552 }
6553 } else if (osdmap->is_noup(whoami)) {
6554 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6555 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6556 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6557 << dendl;
6558 } else if (service.need_fullness_update()) {
6559 derr << "osdmap fullness state needs update" << dendl;
6560 send_full_update();
6561 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6562 superblock.purged_snaps_last < superblock.current_epoch) {
6563 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6564 << " < newest_map " << superblock.current_epoch << dendl;
6565 _get_purged_snaps();
6566 } else if (osdmap->get_epoch() >= oldest - 1 &&
6567 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
6568
6569 // wait for pgs to fully catch up in a different thread, since
6570 // this thread might be required for splitting and merging PGs to
6571 // make progress.
6572 boot_finisher.queue(
6573 new LambdaContext(
6574 [this](int r) {
6575 std::unique_lock l(osd_lock);
6576 if (is_preboot()) {
6577 dout(10) << __func__ << " waiting for peering work to drain"
6578 << dendl;
6579 l.unlock();
6580 for (auto shard : shards) {
6581 shard->wait_min_pg_epoch(get_osdmap_epoch());
6582 }
6583 l.lock();
6584 }
6585 if (is_preboot()) {
6586 _send_boot();
6587 }
6588 }));
6589 return;
6590 }
6591
6592 // get all the latest maps
6593 if (osdmap->get_epoch() + 1 >= oldest)
6594 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6595 else
6596 osdmap_subscribe(oldest - 1, true);
6597 }
6598
6599 void OSD::_get_purged_snaps()
6600 {
6601 // NOTE: this is a naive, stateless implementation. It may send multiple
6602 // overlapping requests to the mon, which will be somewhat inefficient, but
6603 // it should be reliable.
6604 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6605 << ", newest_map " << superblock.current_epoch << dendl;
6606 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6607 superblock.purged_snaps_last + 1,
6608 superblock.current_epoch + 1);
6609 monc->send_mon_message(m);
6610 }
6611
6612 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6613 {
6614 dout(10) << __func__ << " " << *m << dendl;
6615 ObjectStore::Transaction t;
6616 if (!is_preboot() ||
6617 m->last < superblock.purged_snaps_last) {
6618 goto out;
6619 }
6620 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
6621 make_purged_snaps_oid(), &t,
6622 m->purged_snaps);
6623 superblock.purged_snaps_last = m->last;
6624 write_superblock(t);
6625 store->queue_transaction(
6626 service.meta_ch,
6627 std::move(t));
6628 service.publish_superblock(superblock);
6629 if (m->last < superblock.current_epoch) {
6630 _get_purged_snaps();
6631 } else {
6632 start_boot();
6633 }
6634 out:
6635 m->put();
6636 }
6637
6638 void OSD::send_full_update()
6639 {
6640 if (!service.need_fullness_update())
6641 return;
6642 unsigned state = 0;
6643 if (service.is_full()) {
6644 state = CEPH_OSD_FULL;
6645 } else if (service.is_backfillfull()) {
6646 state = CEPH_OSD_BACKFILLFULL;
6647 } else if (service.is_nearfull()) {
6648 state = CEPH_OSD_NEARFULL;
6649 }
6650 set<string> s;
6651 OSDMap::calc_state_set(state, s);
6652 dout(10) << __func__ << " want state " << s << dendl;
6653 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6654 }
6655
6656 void OSD::start_waiting_for_healthy()
6657 {
6658 dout(1) << "start_waiting_for_healthy" << dendl;
6659 set_state(STATE_WAITING_FOR_HEALTHY);
6660 last_heartbeat_resample = utime_t();
6661
6662 // subscribe to osdmap updates, in case our peers really are known to be dead
6663 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6664 }
6665
6666 bool OSD::_is_healthy()
6667 {
6668 if (!cct->get_heartbeat_map()->is_healthy()) {
6669 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6670 return false;
6671 }
6672
6673 if (is_waiting_for_healthy()) {
6674 utime_t now = ceph_clock_now();
6675 if (osd_markdown_log.empty()) {
6676 dout(5) << __func__ << " force returning true since last markdown"
6677 << " was more than " << cct->_conf->osd_max_markdown_period
6678 << "s ago" << dendl;
6679 return true;
6680 }
6681 std::lock_guard l(heartbeat_lock);
6682 int num = 0, up = 0;
6683 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6684 p != heartbeat_peers.end();
6685 ++p) {
6686 if (p->second.is_healthy(now))
6687 ++up;
6688 ++num;
6689 }
6690 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6691 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6692 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6693 return false;
6694 }
6695 }
6696
6697 return true;
6698 }
6699
6700 void OSD::_send_boot()
6701 {
6702 dout(10) << "_send_boot" << dendl;
6703 Connection *local_connection =
6704 cluster_messenger->get_loopback_connection().get();
6705 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6706 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6707 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6708 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6709
6710 dout(20) << " initial client_addrs " << client_addrs
6711 << ", cluster_addrs " << cluster_addrs
6712 << ", hb_back_addrs " << hb_back_addrs
6713 << ", hb_front_addrs " << hb_front_addrs
6714 << dendl;
6715 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6716 dout(10) << " assuming cluster_addrs match client_addrs "
6717 << client_addrs << dendl;
6718 cluster_addrs = cluster_messenger->get_myaddrs();
6719 }
6720 if (auto session = local_connection->get_priv(); !session) {
6721 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6722 }
6723
6724 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6725 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6726 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6727 << cluster_addrs << dendl;
6728 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6729 }
6730 if (auto session = local_connection->get_priv(); !session) {
6731 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6732 }
6733
6734 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6735 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6736 dout(10) << " assuming hb_front_addrs match client_addrs "
6737 << client_addrs << dendl;
6738 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6739 }
6740 if (auto session = local_connection->get_priv(); !session) {
6741 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6742 }
6743
6744 // we now know what our front and back addrs will be, and we are
6745 // about to tell the mon what our metadata (including numa bindings)
6746 // is, so now is a good time!
6747 set_numa_affinity();
6748
6749 MOSDBoot *mboot = new MOSDBoot(
6750 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6751 hb_back_addrs, hb_front_addrs, cluster_addrs,
6752 CEPH_FEATURES_ALL);
6753 dout(10) << " final client_addrs " << client_addrs
6754 << ", cluster_addrs " << cluster_addrs
6755 << ", hb_back_addrs " << hb_back_addrs
6756 << ", hb_front_addrs " << hb_front_addrs
6757 << dendl;
6758 _collect_metadata(&mboot->metadata);
6759 monc->send_mon_message(mboot);
6760 set_state(STATE_BOOTING);
6761 }
6762
6763 void OSD::_collect_metadata(map<string,string> *pm)
6764 {
6765 // config info
6766 (*pm)["osd_data"] = dev_path;
6767 if (store->get_type() == "filestore") {
6768 // not applicable for bluestore
6769 (*pm)["osd_journal"] = journal_path;
6770 }
6771 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6772 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6773 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6774 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
6775
6776 // backend
6777 (*pm)["osd_objectstore"] = store->get_type();
6778 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6779 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6780 (*pm)["default_device_class"] = store->get_default_device_class();
6781 string osdspec_affinity;
6782 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6783 if (r < 0 || osdspec_affinity.empty()) {
6784 osdspec_affinity = "";
6785 }
6786 (*pm)["osdspec_affinity"] = osdspec_affinity;
6787 store->collect_metadata(pm);
6788
6789 collect_sys_info(pm, cct);
6790
6791 (*pm)["front_iface"] = pick_iface(
6792 cct,
6793 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6794 (*pm)["back_iface"] = pick_iface(
6795 cct,
6796 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6797
6798 // network numa
6799 {
6800 int node = -1;
6801 set<int> nodes;
6802 set<string> unknown;
6803 for (auto nm : { "front_iface", "back_iface" }) {
6804 if (!(*pm)[nm].size()) {
6805 unknown.insert(nm);
6806 continue;
6807 }
6808 int n = -1;
6809 int r = get_iface_numa_node((*pm)[nm], &n);
6810 if (r < 0) {
6811 unknown.insert((*pm)[nm]);
6812 continue;
6813 }
6814 nodes.insert(n);
6815 if (node < 0) {
6816 node = n;
6817 }
6818 }
6819 if (unknown.size()) {
6820 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6821 }
6822 if (!nodes.empty()) {
6823 (*pm)["network_numa_nodes"] = stringify(nodes);
6824 }
6825 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6826 (*pm)["network_numa_node"] = stringify(node);
6827 }
6828 }
6829
6830 if (numa_node >= 0) {
6831 (*pm)["numa_node"] = stringify(numa_node);
6832 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6833 &numa_cpu_set);
6834 }
6835
6836 set<string> devnames;
6837 store->get_devices(&devnames);
6838 map<string,string> errs;
6839 get_device_metadata(devnames, pm, &errs);
6840 for (auto& i : errs) {
6841 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
6842 }
6843 dout(10) << __func__ << " " << *pm << dendl;
6844 }
6845
6846 void OSD::queue_want_up_thru(epoch_t want)
6847 {
6848 std::shared_lock map_locker{map_lock};
6849 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6850 std::lock_guard report_locker(mon_report_lock);
6851 if (want > up_thru_wanted) {
6852 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6853 << ", currently " << cur
6854 << dendl;
6855 up_thru_wanted = want;
6856 send_alive();
6857 } else {
6858 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6859 << ", currently " << cur
6860 << dendl;
6861 }
6862 }
6863
6864 void OSD::send_alive()
6865 {
6866 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6867 const auto osdmap = get_osdmap();
6868 if (!osdmap->exists(whoami))
6869 return;
6870 epoch_t up_thru = osdmap->get_up_thru(whoami);
6871 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6872 if (up_thru_wanted > up_thru) {
6873 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6874 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6875 }
6876 }
6877
6878 void OSD::request_full_map(epoch_t first, epoch_t last)
6879 {
6880 dout(10) << __func__ << " " << first << ".." << last
6881 << ", previously requested "
6882 << requested_full_first << ".." << requested_full_last << dendl;
6883 ceph_assert(ceph_mutex_is_locked(osd_lock));
6884 ceph_assert(first > 0 && last > 0);
6885 ceph_assert(first <= last);
6886 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6887 if (requested_full_first == 0) {
6888 // first request
6889 requested_full_first = first;
6890 requested_full_last = last;
6891 } else if (last <= requested_full_last) {
6892 // dup
6893 return;
6894 } else {
6895 // additional request
6896 first = requested_full_last + 1;
6897 requested_full_last = last;
6898 }
6899 MMonGetOSDMap *req = new MMonGetOSDMap;
6900 req->request_full(first, last);
6901 monc->send_mon_message(req);
6902 }
6903
6904 void OSD::got_full_map(epoch_t e)
6905 {
6906 ceph_assert(requested_full_first <= requested_full_last);
6907 ceph_assert(ceph_mutex_is_locked(osd_lock));
6908 if (requested_full_first == 0) {
6909 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6910 return;
6911 }
6912 if (e < requested_full_first) {
6913 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6914 << ".." << requested_full_last
6915 << ", ignoring" << dendl;
6916 return;
6917 }
6918 if (e >= requested_full_last) {
6919 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6920 << ".." << requested_full_last << ", resetting" << dendl;
6921 requested_full_first = requested_full_last = 0;
6922 return;
6923 }
6924
6925 requested_full_first = e + 1;
6926
6927 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6928 << ".." << requested_full_last
6929 << ", still need more" << dendl;
6930 }
6931
6932 void OSD::requeue_failures()
6933 {
6934 std::lock_guard l(heartbeat_lock);
6935 unsigned old_queue = failure_queue.size();
6936 unsigned old_pending = failure_pending.size();
6937 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6938 failure_queue[p->first] = p->second.first;
6939 failure_pending.erase(p++);
6940 }
6941 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6942 << failure_queue.size() << dendl;
6943 }
6944
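// Failure reports move through two stages: failure_queue holds peers whose
// report has not been sent yet, and failure_pending holds reports already
// sent to the mon but not yet reflected in an osdmap.  send_failures()
// promotes entries from the first to the second; send_still_alive() is used
// to retract a pending report when a peer turns out to be healthy.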
6945 void OSD::send_failures()
6946 {
6947 ceph_assert(ceph_mutex_is_locked(map_lock));
6948 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6949 std::lock_guard l(heartbeat_lock);
6950 utime_t now = ceph_clock_now();
6951 const auto osdmap = get_osdmap();
6952 while (!failure_queue.empty()) {
6953 int osd = failure_queue.begin()->first;
6954 if (!failure_pending.count(osd)) {
6955 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6956 monc->send_mon_message(
6957 new MOSDFailure(
6958 monc->get_fsid(),
6959 osd,
6960 osdmap->get_addrs(osd),
6961 failed_for,
6962 osdmap->get_epoch()));
6963 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6964 osdmap->get_addrs(osd));
6965 }
6966 failure_queue.erase(osd);
6967 }
6968 }
6969
6970 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6971 {
6972 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6973 MOSDFailure::FLAG_ALIVE);
6974 monc->send_mon_message(m);
6975 }
6976
6977 void OSD::cancel_pending_failures()
6978 {
6979 std::lock_guard l(heartbeat_lock);
6980 auto it = failure_pending.begin();
6981 while (it != failure_pending.end()) {
6982 dout(10) << __func__ << " canceling in-flight failure report for osd."
6983 << it->first << dendl;
6984 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6985 failure_pending.erase(it++);
6986 }
6987 }
6988
6989 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6990 {
6991 const auto& monmap = monc->monmap;
6992 // send_beacon() may be called right after we connect, before the monmap
6993 // has been initialized; in that case, don't send anything yet.
6994 if (monmap.epoch > 0 &&
6995 monmap.get_required_features().contains_all(
6996 ceph::features::mon::FEATURE_LUMINOUS)) {
6997 dout(20) << __func__ << " sending" << dendl;
6998 MOSDBeacon* beacon = nullptr;
6999 {
7000 std::lock_guard l{min_last_epoch_clean_lock};
7001 beacon = new MOSDBeacon(get_osdmap_epoch(),
7002 min_last_epoch_clean,
7003 superblock.last_purged_snaps_scrub,
7004 cct->_conf->osd_beacon_report_interval);
7005 beacon->pgs = min_last_epoch_clean_pgs;
7006 last_sent_beacon = now;
7007 }
7008 monc->send_mon_message(beacon);
7009 } else {
7010 dout(20) << __func__ << " not sending" << dendl;
7011 }
7012 }
7013
7014 void OSD::handle_command(MCommand *m)
7015 {
7016 ConnectionRef con = m->get_connection();
7017 auto session = ceph::ref_cast<Session>(con->get_priv());
7018 if (!session) {
7019 con->send_message(new MCommandReply(m, -EACCES));
7020 m->put();
7021 return;
7022 }
7023 if (!session->caps.allow_all()) {
7024 con->send_message(new MCommandReply(m, -EACCES));
7025 m->put();
7026 return;
7027 }
7028 cct->get_admin_socket()->queue_tell_command(m);
7029 m->put();
7030 }
7031
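// unlock_guard is the inverse of std::lock_guard: it unlocks a mutex for
// the duration of a scope and re-acquires it on exit.  A usage sketch:
//
//   std::lock_guard l(some_lock);
//   ...
//   {
//     unlock_guard u(some_lock);   // some_lock released here
//     do_something_blocking();
//   }                              // some_lock re-acquired here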
7032 namespace {
7033 class unlock_guard {
7034 ceph::mutex& m;
7035 public:
7036 explicit unlock_guard(ceph::mutex& mutex)
7037 : m(mutex)
7038 {
7039 m.unlock();
7040 }
7041 unlock_guard(unlock_guard&) = delete;
7042 ~unlock_guard() {
7043 m.lock();
7044 }
7045 };
7046 }
7047
7048 void OSD::scrub_purged_snaps()
7049 {
7050 dout(10) << __func__ << dendl;
7051 ceph_assert(ceph_mutex_is_locked(osd_lock));
7052 SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
7053 make_snapmapper_oid(),
7054 make_purged_snaps_oid());
7055 clog->debug() << "purged_snaps scrub starts";
7056 osd_lock.unlock();
7057 s.run();
7058 if (s.stray.size()) {
7059 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
7060 } else {
7061 clog->debug() << "purged_snaps scrub ok";
7062 }
7063 set<pair<spg_t,snapid_t>> queued;
7064 for (auto& [pool, snap, hash, shard] : s.stray) {
7065 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
7066 if (!pi) {
7067 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
7068 continue;
7069 }
7070 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
7071 spg_t spgid(pgid, shard);
7072 pair<spg_t,snapid_t> p(spgid, snap);
7073 if (queued.count(p)) {
7074 dout(20) << __func__ << " pg " << spgid << " snap " << snap
7075 << " already queued" << dendl;
7076 continue;
7077 }
7078 PGRef pg = lookup_lock_pg(spgid);
7079 if (!pg) {
7080 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
7081 continue;
7082 }
7083 queued.insert(p);
7084 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
7085 << snap << dendl;
7086 pg->queue_snap_retrim(snap);
7087 pg->unlock();
7088 }
7089 osd_lock.lock();
7090 if (is_stopping()) {
7091 return;
7092 }
7093 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
7094 ObjectStore::Transaction t;
7095 superblock.last_purged_snaps_scrub = ceph_clock_now();
7096 write_superblock(t);
7097 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7098 ceph_assert(tr == 0);
7099 if (is_active()) {
7100 send_beacon(ceph::coarse_mono_clock::now());
7101 }
7102 dout(10) << __func__ << " done" << dendl;
7103 }
7104
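// probe_smart: gather SMART health metrics for every physical device
// backing this OSD (logical dm-* devices are skipped), keyed by each
// device's unique id, and emit them as pretty-printed JSON.  Each probe is
// bounded by osd_smart_report_timeout.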
7105 void OSD::probe_smart(const string& only_devid, ostream& ss)
7106 {
7107 set<string> devnames;
7108 store->get_devices(&devnames);
7109 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7110 "osd_smart_report_timeout");
7111
7112 // == typedef std::map<std::string, mValue> mObject;
7113 json_spirit::mObject json_map;
7114
7115 for (auto dev : devnames) {
7116 // smartctl works only on physical devices; filter out any logical device
7117 if (dev.find("dm-") == 0) {
7118 continue;
7119 }
7120
7121 string err;
7122 string devid = get_device_id(dev, &err);
7123 if (devid.size() == 0) {
7124 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7125 << err << "), skipping" << dendl;
7126 continue;
7127 }
7128 if (only_devid.size() && devid != only_devid) {
7129 continue;
7130 }
7131
7132 json_spirit::mValue smart_json;
7133 if (block_device_get_metrics(dev, smart_timeout,
7134 &smart_json)) {
7135 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7136 continue;
7137 }
7138 json_map[devid] = smart_json;
7139 }
7140 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7141 }
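// [editor's note] the JSON written by probe_smart() maps each device id to
// the raw metrics blob returned by block_device_get_metrics(). A
// hypothetical, heavily abbreviated example of the output shape (device id
// and values are illustrative only):
//
//   {
//     "ATA_INTEL_SSDSC2BB48_PHDV12345678": {
//       "ata_smart_attributes": { ... },
//       "temperature": { "current": 30 }
//     }
//   }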
7142
7143 bool OSD::heartbeat_dispatch(Message *m)
7144 {
7145 dout(30) << "heartbeat_dispatch " << m << dendl;
7146 switch (m->get_type()) {
7147
7148 case CEPH_MSG_PING:
7149 dout(10) << "ping from " << m->get_source_inst() << dendl;
7150 m->put();
7151 break;
7152
7153 case MSG_OSD_PING:
7154 handle_osd_ping(static_cast<MOSDPing*>(m));
7155 break;
7156
7157 default:
7158 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7159 m->put();
7160 }
7161
7162 return true;
7163 }
7164
7165 bool OSD::ms_dispatch(Message *m)
7166 {
7167 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7168 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7169 service.got_stop_ack();
7170 m->put();
7171 return true;
7172 }
7173
7174 // lock!
7175
7176 osd_lock.lock();
7177 if (is_stopping()) {
7178 osd_lock.unlock();
7179 m->put();
7180 return true;
7181 }
7182
7183 do_waiters();
7184 _dispatch(m);
7185
7186 osd_lock.unlock();
7187
7188 return true;
7189 }
7190
7191 void OSDService::maybe_share_map(
7192 Connection *con,
7193 const OSDMapRef& osdmap,
7194 epoch_t peer_epoch_lb)
7195 {
7196 // NOTE: we assume the caller holds something that keeps the Connection
7197 // itself pinned (e.g., an OpRequest's MessageRef).
7198 auto session = ceph::ref_cast<Session>(con->get_priv());
7199 if (!session) {
7200 return;
7201 }
7202
7203 // assume the peer has the newer of the op's sent_epoch and what
7204 // we think we sent them.
7205 session->sent_epoch_lock.lock();
7206 if (peer_epoch_lb > session->last_sent_epoch) {
7207 dout(10) << __func__ << " con " << con
7208 << " " << con->get_peer_addr()
7209 << " map epoch " << session->last_sent_epoch
7210 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7211 session->last_sent_epoch = peer_epoch_lb;
7212 }
7213 epoch_t last_sent_epoch = session->last_sent_epoch;
7214 session->sent_epoch_lock.unlock();
7215
7216 if (osdmap->get_epoch() <= last_sent_epoch) {
7217 return;
7218 }
7219
7220 send_incremental_map(last_sent_epoch, con, osdmap);
7221 last_sent_epoch = osdmap->get_epoch();
7222
7223 session->sent_epoch_lock.lock();
7224 if (session->last_sent_epoch < last_sent_epoch) {
7225 dout(10) << __func__ << " con " << con
7226 << " " << con->get_peer_addr()
7227 << " map epoch " << session->last_sent_epoch
7228 << " -> " << last_sent_epoch << " (shared)" << dendl;
7229 session->last_sent_epoch = last_sent_epoch;
7230 }
7231 session->sent_epoch_lock.unlock();
7232 }
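// [editor's note] a worked example of the epoch bookkeeping in
// maybe_share_map(), with illustrative numbers: if the peer claims epoch
// 100 (peer_epoch_lb == 100), we last recorded sending them epoch 95, and
// our current map is epoch 103, then last_sent_epoch is first raised
// 95 -> 100, send_incremental_map() ships the maps in (100, 103], and
// last_sent_epoch is finally raised to 103, unless a racing thread already
// recorded something newer; that is why the second compare-and-update
// under sent_epoch_lock is needed.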
7233
7234 void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7235 {
7236 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7237
7238 auto i = session->waiting_on_map.begin();
7239 while (i != session->waiting_on_map.end()) {
7240 OpRequestRef op = &(*i);
7241 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7242 auto m = op->get_req<MOSDFastDispatchOp>();
7243 if (m->get_min_epoch() > osdmap->get_epoch()) {
7244 break;
7245 }
7246 session->waiting_on_map.erase(i++);
7247 op->put();
7248
7249 spg_t pgid;
7250 if (m->get_type() == CEPH_MSG_OSD_OP) {
7251 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7252 static_cast<const MOSDOp*>(m)->get_pg());
7253 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7254 continue;
7255 }
7256 } else {
7257 pgid = m->get_spg();
7258 }
7259 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7260 }
7261
7262 if (session->waiting_on_map.empty()) {
7263 clear_session_waiting_on_map(session);
7264 } else {
7265 register_session_waiting_on_map(session);
7266 }
7267 }
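// [editor's note] ops whose min_epoch is still ahead of the map we hold
// stay on waiting_on_map (the loop above breaks at the first such op,
// preserving delivery order), and the session is re-registered so dispatch
// is retried once a newer map is consumed.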
7268
7269 void OSD::ms_fast_dispatch(Message *m)
7270 {
7271 auto dispatch_span = tracing::osd::tracer.start_trace(__func__);
7272 FUNCTRACE(cct);
7273 if (service.is_stopping()) {
7274 m->put();
7275 return;
7276 }
7277 // peering event?
7278 switch (m->get_type()) {
7279 case CEPH_MSG_PING:
7280 dout(10) << "ping from " << m->get_source() << dendl;
7281 m->put();
7282 return;
7283 case MSG_OSD_FORCE_RECOVERY:
7284 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7285 return;
7286 case MSG_OSD_SCRUB2:
7287 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7288 return;
7289 case MSG_OSD_PG_CREATE2:
7290 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7291 case MSG_OSD_PG_NOTIFY:
7292 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7293 case MSG_OSD_PG_INFO:
7294 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7295 case MSG_OSD_PG_REMOVE:
7296 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7297 // these are single-pg messages that handle themselves
7298 case MSG_OSD_PG_LOG:
7299 case MSG_OSD_PG_TRIM:
7300 case MSG_OSD_PG_NOTIFY2:
7301 case MSG_OSD_PG_QUERY2:
7302 case MSG_OSD_PG_INFO2:
7303 case MSG_OSD_BACKFILL_RESERVE:
7304 case MSG_OSD_RECOVERY_RESERVE:
7305 case MSG_OSD_PG_LEASE:
7306 case MSG_OSD_PG_LEASE_ACK:
7307 {
7308 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7309 if (require_osd_peer(pm)) {
7310 enqueue_peering_evt(
7311 pm->get_spg(),
7312 PGPeeringEventRef(pm->get_event()));
7313 }
7314 pm->put();
7315 return;
7316 }
7317 }
7318
7319 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7320 {
7321 #ifdef WITH_LTTNG
7322 osd_reqid_t reqid = op->get_reqid();
7323 #endif
7324 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7325 reqid.name._num, reqid.tid, reqid.inc);
7326 }
7327 op->osd_parent_span = tracing::osd::tracer.add_span("op-request-created", dispatch_span);
7328
7329 if (m->trace)
7330 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7331
7332 // note sender epoch, min req's epoch
7333 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7334 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7335 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7336
7337 service.maybe_inject_dispatch_delay();
7338
7339 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7340 m->get_type() != CEPH_MSG_OSD_OP) {
7341 // queue it directly
7342 enqueue_op(
7343 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7344 std::move(op),
7345 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7346 } else {
7347 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7348 // message that doesn't carry an explicit spg_t); we need to map it
7349 // to an spg_t while preserving delivery order.
7350 auto priv = m->get_connection()->get_priv();
7351 if (auto session = static_cast<Session*>(priv.get()); session) {
7352 std::lock_guard l{session->session_dispatch_lock};
7353 op->get();
7354 session->waiting_on_map.push_back(*op);
7355 OSDMapRef nextmap = service.get_nextmap_reserved();
7356 dispatch_session_waiting(session, nextmap);
7357 service.release_map(nextmap);
7358 }
7359 }
7360 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7361 }
7362
7363 int OSD::ms_handle_authentication(Connection *con)
7364 {
7365 int ret = 0;
7366 auto s = ceph::ref_cast<Session>(con->get_priv());
7367 if (!s) {
7368 s = ceph::make_ref<Session>(cct, con);
7369 con->set_priv(s);
7370 s->entity_name = con->get_peer_entity_name();
7371 dout(10) << __func__ << " new session " << s << " con " << s->con
7372 << " entity " << s->entity_name
7373 << " addr " << con->get_peer_addrs() << dendl;
7374 } else {
7375 dout(10) << __func__ << " existing session " << s << " con " << s->con
7376 << " entity " << s->entity_name
7377 << " addr " << con->get_peer_addrs() << dendl;
7378 }
7379
7380 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7381 if (caps_info.allow_all) {
7382 s->caps.set_allow_all();
7383 } else if (caps_info.caps.length() > 0) {
7384 bufferlist::const_iterator p = caps_info.caps.cbegin();
7385 string str;
7386 try {
7387 decode(str, p);
7388 }
7389 catch (ceph::buffer::error& e) {
7390 dout(10) << __func__ << " session " << s << " " << s->entity_name
7391 << " failed to decode caps string" << dendl;
7392 ret = -EACCES;
7393 }
7394 if (!ret) {
7395 bool success = s->caps.parse(str);
7396 if (success) {
7397 dout(10) << __func__ << " session " << s
7398 << " " << s->entity_name
7399 << " has caps " << s->caps << " '" << str << "'" << dendl;
7400 ret = 1;
7401 } else {
7402 dout(10) << __func__ << " session " << s << " " << s->entity_name
7403 << " failed to parse caps '" << str << "'" << dendl;
7404 ret = -EACCES;
7405 }
7406 }
7407 }
7408 return ret;
7409 }
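// [editor's note] the caps string decoded above is the entity's "osd"
// capability as recorded by the monitors; illustrative examples of the two
// paths:
//
//   caps_info.allow_all set (e.g. client.admin)   -> caps.set_allow_all()
//   caps blob "allow rwx pool=rbd" (hypothetical) -> caps.parse() -> ret 1
//
// an undecodable blob or a failed parse returns -EACCES instead.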
7410
7411 void OSD::do_waiters()
7412 {
7413 ceph_assert(ceph_mutex_is_locked(osd_lock));
7414
7415 dout(10) << "do_waiters -- start" << dendl;
7416 while (!finished.empty()) {
7417 OpRequestRef next = finished.front();
7418 finished.pop_front();
7419 dispatch_op(next);
7420 }
7421 dout(10) << "do_waiters -- finish" << dendl;
7422 }
7423
7424 void OSD::dispatch_op(OpRequestRef op)
7425 {
7426 switch (op->get_req()->get_type()) {
7427
7428 case MSG_OSD_PG_CREATE:
7429 handle_pg_create(op);
7430 break;
7431 }
7432 }
7433
7434 void OSD::_dispatch(Message *m)
7435 {
7436 ceph_assert(ceph_mutex_is_locked(osd_lock));
7437 dout(20) << "_dispatch " << m << " " << *m << dendl;
7438
7439 switch (m->get_type()) {
7440 // -- don't need OSDMap --
7441
7442 // map and replication
7443 case CEPH_MSG_OSD_MAP:
7444 handle_osd_map(static_cast<MOSDMap*>(m));
7445 break;
7446 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7447 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7448 break;
7449
7450 // osd
7451 case MSG_OSD_SCRUB:
7452 handle_scrub(static_cast<MOSDScrub*>(m));
7453 break;
7454
7455 case MSG_COMMAND:
7456 handle_command(static_cast<MCommand*>(m));
7457 return;
7458
7459 // -- need OSDMap --
7460
7461 case MSG_OSD_PG_CREATE:
7462 {
7463 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7464 if (m->trace)
7465 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7466 // no map? starting up?
7467 if (!get_osdmap()) {
7468 dout(7) << "no OSDMap, not booted" << dendl;
7469 logger->inc(l_osd_waiting_for_map);
7470 waiting_for_osdmap.push_back(op);
7471 op->mark_delayed("no osdmap");
7472 break;
7473 }
7474
7475 // need OSDMap
7476 dispatch_op(op);
7477 }
7478 }
7479 }
7480
7481 // remove me post-nautilus
7482 void OSD::handle_scrub(MOSDScrub *m)
7483 {
7484 dout(10) << "handle_scrub " << *m << dendl;
7485 if (!require_mon_or_mgr_peer(m)) {
7486 m->put();
7487 return;
7488 }
7489 if (m->fsid != monc->get_fsid()) {
7490 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7491 << dendl;
7492 m->put();
7493 return;
7494 }
7495
7496 vector<spg_t> spgs;
7497 _get_pgids(&spgs);
7498
7499 if (!m->scrub_pgs.empty()) {
7500 vector<spg_t> v;
7501 for (auto pgid : m->scrub_pgs) {
7502 spg_t pcand;
7503 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7504 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7505 v.push_back(pcand);
7506 }
7507 }
7508 spgs.swap(v);
7509 }
7510
7511 for (auto pgid : spgs) {
7512 enqueue_peering_evt(
7513 pgid,
7514 PGPeeringEventRef(
7515 std::make_shared<PGPeeringEvent>(
7516 get_osdmap_epoch(),
7517 get_osdmap_epoch(),
7518 PeeringState::RequestScrub(m->deep, m->repair))));
7519 }
7520
7521 m->put();
7522 }
7523
7524 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7525 {
7526 dout(10) << __func__ << " " << *m << dendl;
7527 if (!require_mon_or_mgr_peer(m)) {
7528 m->put();
7529 return;
7530 }
7531 if (m->fsid != monc->get_fsid()) {
7532 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7533 << dendl;
7534 m->put();
7535 return;
7536 }
7537 for (auto pgid : m->scrub_pgs) {
7538 enqueue_peering_evt(
7539 pgid,
7540 PGPeeringEventRef(
7541 std::make_shared<PGPeeringEvent>(
7542 m->epoch,
7543 m->epoch,
7544 PeeringState::RequestScrub(m->deep, m->repair))));
7545 }
7546 m->put();
7547 }
7548
7549 bool OSD::scrub_random_backoff()
7550 {
7551 bool coin_flip = (rand() / (double)RAND_MAX >=
7552 cct->_conf->osd_scrub_backoff_ratio);
7553 if (!coin_flip) {
7554 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7555 return true;
7556 }
7557 return false;
7558 }
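// [editor's note] a worked example of the backoff math above, assuming
// osd_scrub_backoff_ratio = 0.66 (the shipped default at the time of
// writing): rand()/RAND_MAX lands below 0.66 about two thirds of the time,
// so roughly 66% of ticks back off (return true) and only ~34% fall
// through to sched_scrub().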
7559
7560
7561 void OSD::sched_scrub()
7562 {
7563 auto& scrub_scheduler = service.get_scrub_services();
7564
7565 // fail fast if no resources are available
7566 if (!scrub_scheduler.can_inc_scrubs()) {
7567 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7568 return;
7569 }
7570
7571 // if there is a PG that is just now trying to reserve scrub replica
7572 // resources, we should wait and not initiate a new scrub
7573 if (scrub_scheduler.is_reserving_now()) {
7574 dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
7575 return;
7576 }
7577
7578 Scrub::ScrubPreconds env_conditions;
7579
7580 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7581 if (!cct->_conf->osd_repair_during_recovery) {
7582 dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
7583 << dendl;
7584 return;
7585 }
7586 dout(10) << __func__
7587 << " will only schedule explicitly requested repair due to active recovery"
7588 << dendl;
7589 env_conditions.allow_requested_repair_only = true;
7590 }
7591
7592 if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
7593 dout(20) << __func__ << " sched_scrub starts" << dendl;
7594 auto all_jobs = scrub_scheduler.list_registered_jobs();
7595 for (const auto& sj : all_jobs) {
7596 dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
7597 }
7598 }
7599
7600 auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
7601 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
7602 << ")" << dendl;
7603 }
7604
7605 Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
7606 bool allow_requested_repair_only)
7607 {
7608 dout(20) << __func__ << " trying " << pgid << dendl;
7609
7610 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7611 // allowed
7612
7613 PGRef pg = osd->lookup_lock_pg(pgid);
7614 if (!pg) {
7615 // the PG was dequeued in the short timespan between creating the candidates list
7616 // (collect_ripe_jobs()) and here
7617 dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
7618 return Scrub::schedule_result_t::no_such_pg;
7619 }
7620
7621 // This has already started, so go on to the next scrub job
7622 if (pg->is_scrub_queued_or_active()) {
7623 pg->unlock();
7624 dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
7625 return Scrub::schedule_result_t::already_started;
7626 }
7627 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7628 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7629 pg->unlock();
7630 dout(10) << __func__ << " skip " << pgid
7631 << " because repairing is not explicitly requested on it" << dendl;
7632 return Scrub::schedule_result_t::preconditions;
7633 }
7634
7635 auto scrub_attempt = pg->sched_scrub();
7636 pg->unlock();
7637 return scrub_attempt;
7638 }
7639
7640 void OSD::resched_all_scrubs()
7641 {
7642 dout(10) << __func__ << ": start" << dendl;
7643 auto all_jobs = service.get_scrub_services().list_registered_jobs();
7644 for (auto& e : all_jobs) {
7645
7646 auto& job = *e;
7647 dout(20) << __func__ << ": examine " << job.pgid << dendl;
7648
7649 PGRef pg = _lookup_lock_pg(job.pgid);
7650 if (!pg)
7651 continue;
7652
7653 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7654 dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
7655 pg->reschedule_scrub();
7656 }
7657 pg->unlock();
7658 }
7659 dout(10) << __func__ << ": done" << dendl;
7660 }
7661
7662 MPGStats* OSD::collect_pg_stats()
7663 {
7664 dout(15) << __func__ << dendl;
7665 // This implementation unconditionally sends every is_primary PG's
7666 // stats every time we're called. This has equivalent cost to the
7667 // previous implementation's worst case where all PGs are busy and
7668 // their stats are always enqueued for sending.
7669 std::shared_lock l{map_lock};
7670
7671 osd_stat_t cur_stat = service.get_osd_stat();
7672 cur_stat.os_perf_stat = store->get_cur_stats();
7673
7674 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7675 m->osd_stat = cur_stat;
7676
7677 std::lock_guard lec{min_last_epoch_clean_lock};
7678 min_last_epoch_clean = get_osdmap_epoch();
7679 min_last_epoch_clean_pgs.clear();
7680
7681 std::set<int64_t> pool_set;
7682 vector<PGRef> pgs;
7683 _get_pgs(&pgs);
7684 for (auto& pg : pgs) {
7685 auto pool = pg->pg_id.pgid.pool();
7686 pool_set.emplace((int64_t)pool);
7687 if (!pg->is_primary()) {
7688 continue;
7689 }
7690 pg->with_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7691 m->pg_stat[pg->pg_id.pgid] = s;
7692 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
7693 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7694 });
7695 }
7696 store_statfs_t st;
7697 bool per_pool_stats = false;
7698 bool per_pool_omap_stats = false;
7699 for (auto p : pool_set) {
7700 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7701 if (r == -ENOTSUP) {
7702 break;
7703 } else {
7704 assert(r >= 0);
7705 m->pool_stat[p] = st;
7706 per_pool_stats = true;
7707 }
7708 }
7709
7710 // indicate whether we are reporting per-pool stats
7711 m->osd_stat.num_osds = 1;
7712 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7713 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7714
7715 return m;
7716 }
7717
7718 vector<DaemonHealthMetric> OSD::get_health_metrics()
7719 {
7720 vector<DaemonHealthMetric> metrics;
7721 {
7722 utime_t oldest_secs;
7723 const utime_t now = ceph_clock_now();
7724 auto too_old = now;
7725 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7726 int slow = 0;
7727 TrackedOpRef oldest_op;
7728 OSDMapRef osdmap = get_osdmap();
7729 // map of slow op counts by slow op event type, for aggregated logging to
7730 // the cluster log.
7731 map<uint8_t, int> slow_op_types;
7732 // map of slow op counts by pool, for reporting the pool with the highest
7733 // slow op count.
7734 map<uint64_t, int> slow_op_pools;
7735 bool log_aggregated_slow_op =
7736 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
7737 auto count_slow_ops = [&](TrackedOp& op) {
7738 if (op.get_initiated() < too_old) {
7739 stringstream ss;
7740 ss << "slow request " << op.get_desc()
7741 << " initiated "
7742 << op.get_initiated()
7743 << " currently "
7744 << op.state_string();
7745 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7746 if (log_aggregated_slow_op) {
7747 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7748 uint8_t op_type = req->state_flag();
7749 auto m = req->get_req<MOSDFastDispatchOp>();
7750 uint64_t poolid = m->get_spg().pgid.m_pool;
7751 slow_op_types[op_type]++;
7752 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7753 slow_op_pools[poolid]++;
7754 }
7755 }
7756 } else {
7757 clog->warn() << ss.str();
7758 }
7759 slow++;
7760 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7761 oldest_op = &op;
7762 }
7763 return true;
7764 } else {
7765 return false;
7766 }
7767 };
7768 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7769 if (slow) {
7770 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7771 << oldest_op->get_desc() << dendl;
7772 if (log_aggregated_slow_op &&
7773 slow_op_types.size() > 0) {
7774 stringstream ss;
7775 ss << slow << " slow requests (by type [ ";
7776 for (const auto& [op_type, count] : slow_op_types) {
7777 ss << "'" << OpRequest::get_state_string(op_type)
7778 << "' : " << count
7779 << " ";
7780 }
7781 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7782 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7783 return p1.second < p2.second;
7784 });
7785 if (slow_pool_it != slow_op_pools.end() && osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7786 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7787 ss << "] most affected pool [ '"
7788 << pool_name
7789 << "' : "
7790 << slow_pool_it->second
7791 << " ])";
7792 } else {
7793 ss << "])";
7794 }
7795 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7796 clog->warn() << ss.str();
7797 }
7798 }
7799 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7800 } else {
7801 // no news is not good news.
7802 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7803 }
7804 }
7805 {
7806 std::lock_guard l(pending_creates_lock);
7807 auto n_primaries = pending_creates_from_mon;
7808 for (const auto& create : pending_creates_from_osd) {
7809 if (create.second) {
7810 n_primaries++;
7811 }
7812 }
7813 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7814 }
7815 return metrics;
7816 }
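// [editor's note] with osd_aggregated_slow_ops_logging enabled, the code
// above emits one aggregated cluster-log line instead of a warning per op;
// a hypothetical example of its shape (types and counts illustrative):
//
//   11 slow requests (by type [ 'delayed' : 8 'queued for pg' : 3 ] most
//   affected pool [ 'rbd' : 6 ])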
7817
7818 // =====================================================
7819 // MAP
7820
7821 void OSD::wait_for_new_map(OpRequestRef op)
7822 {
7823 // ask?
7824 if (waiting_for_osdmap.empty()) {
7825 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7826 }
7827
7828 logger->inc(l_osd_waiting_for_map);
7829 waiting_for_osdmap.push_back(op);
7830 op->mark_delayed("wait for new map");
7831 }
7832
7833
7834 /** update_map
7835 * assimilate new OSDMap(s). scan pgs, etc.
7836 */
7837
7838 void OSD::note_down_osd(int peer)
7839 {
7840 ceph_assert(ceph_mutex_is_locked(osd_lock));
7841 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7842
7843 std::lock_guard l{heartbeat_lock};
7844 failure_queue.erase(peer);
7845 failure_pending.erase(peer);
7846 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7847 if (p != heartbeat_peers.end()) {
7848 p->second.clear_mark_down();
7849 heartbeat_peers.erase(p);
7850 }
7851 }
7852
7853 void OSD::note_up_osd(int peer)
7854 {
7855 heartbeat_set_peers_need_update();
7856 }
7857
7858 struct C_OnMapCommit : public Context {
7859 OSD *osd;
7860 epoch_t first, last;
7861 MOSDMap *msg;
7862 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7863 : osd(o), first(f), last(l), msg(m) {}
7864 void finish(int r) override {
7865 osd->_committed_osd_maps(first, last, msg);
7866 msg->put();
7867 }
7868 };
7869
7870 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7871 {
7872 std::lock_guard l(osdmap_subscribe_lock);
7873 if (latest_subscribed_epoch >= epoch && !force_request)
7874 return;
7875
7876 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7877
7878 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7879 force_request) {
7880 monc->renew_subs();
7881 }
7882 }
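// [editor's note] osdmap_subscribe() is idempotent thanks to
// latest_subscribed_epoch, and sub_want_increment() only moves an existing
// "osdmap" subscription forward. Both call styles appear in
// handle_osd_map() below, roughly:
//
//   osdmap_subscribe(superblock.newest_map + 1, false); // ask for next maps
//   osdmap_subscribe(m->oldest_map - 1, true);          // force a re-request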
7883
7884 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7885 {
7886 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7887 if (min <= superblock.oldest_map)
7888 return;
7889
7890 int num = 0;
7891 ObjectStore::Transaction t;
7892 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7893 dout(20) << " removing old osdmap epoch " << e << dendl;
7894 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7895 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7896 superblock.oldest_map = e + 1;
7897 num++;
7898 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7899 service.publish_superblock(superblock);
7900 write_superblock(t);
7901 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7902 ceph_assert(tr == 0);
7903 num = 0;
7904 if (!skip_maps) {
7905 // skip_maps leaves us with a range of old maps if we fail to remove all
7906 // of them before moving superblock.oldest_map forward to the first map
7907 // in the incoming MOSDMap msg. so we should continue removing them in
7908 // this case, even though it means issuing a huge series of delete
7909 // transactions all at once.
7910 break;
7911 }
7912 }
7913 }
7914 if (num > 0) {
7915 service.publish_superblock(superblock);
7916 write_superblock(t);
7917 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7918 ceph_assert(tr == 0);
7919 }
7920 // we should not remove the cached maps
7921 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7922 }
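// [editor's note] a worked example of the batching in trim_maps(), with
// illustrative numbers: given osd_target_transaction_size = 30 and
// nreceived = 40, a batch is committed once num reaches 40 (the larger of
// the two bounds). In the common !skip_maps case the loop then breaks,
// trimming at most one batch per call; with skip_maps set it keeps
// committing batch after batch, and the trailing `if (num > 0)` flushes
// whatever remains.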
7923
7924 void OSD::handle_osd_map(MOSDMap *m)
7925 {
7926 // wait for pgs to catch up
7927 {
7928 // we extend the map cache pins to accommodate pgs slow to consume maps
7929 // for some period, until we hit the max_lag_factor bound, at which point
7930 // we block here to stop ingesting more maps than they are able to keep
7931 // up with.
7932 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7933 m_osd_pg_epoch_max_lag_factor;
7934 ceph_assert(max_lag > 0);
7935 epoch_t osd_min = 0;
7936 for (auto shard : shards) {
7937 epoch_t min = shard->get_min_pg_epoch();
7938 if (osd_min == 0 || min < osd_min) {
7939 osd_min = min;
7940 }
7941 }
7942 epoch_t osdmap_epoch = get_osdmap_epoch();
7943 if (osd_min > 0 &&
7944 osdmap_epoch > max_lag &&
7945 osdmap_epoch - max_lag > osd_min) {
7946 epoch_t need = osdmap_epoch - max_lag;
7947 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7948 << " max_lag " << max_lag << ")" << dendl;
7949 for (auto shard : shards) {
7950 epoch_t min = shard->get_min_pg_epoch();
7951 if (need > min) {
7952 dout(10) << __func__ << " waiting for pgs to consume " << need
7953 << " (shard " << shard->shard_id << " min " << min
7954 << ", map cache is " << cct->_conf->osd_map_cache_size
7955 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7956 << ")" << dendl;
7957 unlock_guard unlock{osd_lock};
7958 shard->wait_min_pg_epoch(need);
7959 }
7960 }
7961 }
7962 }
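  // [editor's note] a worked example of the lag bound above, with
  // illustrative values: osd_map_cache_size = 50 and
  // m_osd_pg_epoch_max_lag_factor = 2 give max_lag = 100; if our osdmap
  // epoch is 1100 and the slowest shard reports a min PG epoch of 950, we
  // block until every shard's PGs reach epoch 1000 before ingesting more
  // maps.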
7963
7964 ceph_assert(ceph_mutex_is_locked(osd_lock));
7965 map<epoch_t,OSDMapRef> added_maps;
7966 map<epoch_t,bufferlist> added_maps_bl;
7967 if (m->fsid != monc->get_fsid()) {
7968 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7969 << monc->get_fsid() << dendl;
7970 m->put();
7971 return;
7972 }
7973 if (is_initializing()) {
7974 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7975 m->put();
7976 return;
7977 }
7978
7979 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7980 if (session && !(session->entity_name.is_mon() ||
7981 session->entity_name.is_osd())) {
7982 //not enough perms!
7983 dout(10) << "got osd map from Session " << session
7984 << " which we can't take maps from (not a mon or osd)" << dendl;
7985 m->put();
7986 return;
7987 }
7988
7989 // share with the objecter
7990 if (!is_preboot())
7991 service.objecter->handle_osd_map(m);
7992
7993 epoch_t first = m->get_first();
7994 epoch_t last = m->get_last();
7995 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7996 << superblock.newest_map
7997 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7998 << dendl;
7999
8000 logger->inc(l_osd_map);
8001 logger->inc(l_osd_mape, last - first + 1);
8002 if (first <= superblock.newest_map)
8003 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8004 if (service.max_oldest_map < m->oldest_map) {
8005 service.max_oldest_map = m->oldest_map;
8006 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
8007 }
8008
8009 // make sure there is something new, here, before we bother flushing
8010 // the queues and such
8011 if (last <= superblock.newest_map) {
8012 dout(10) << " no new maps here, dropping" << dendl;
8013 m->put();
8014 return;
8015 }
8016
8017 // missing some?
8018 bool skip_maps = false;
8019 if (first > superblock.newest_map + 1) {
8020 dout(10) << "handle_osd_map message skips epochs "
8021 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8022 if (m->oldest_map <= superblock.newest_map + 1) {
8023 osdmap_subscribe(superblock.newest_map + 1, false);
8024 m->put();
8025 return;
8026 }
8027 // always try to get the full range of maps--as many as we can. this
8028 // 1- is good to have
8029 // 2- is at present the only way to ensure that we get a *full* map as
8030 // the first map!
8031 if (m->oldest_map < first) {
8032 osdmap_subscribe(m->oldest_map - 1, true);
8033 m->put();
8034 return;
8035 }
8036 skip_maps = true;
8037 }
8038
8039 ObjectStore::Transaction t;
8040 uint64_t txn_size = 0;
8041
8042 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8043
8044 // store new maps: queue for disk and put in the osdmap cache
8045 epoch_t start = std::max(superblock.newest_map + 1, first);
8046 for (epoch_t e = start; e <= last; e++) {
8047 if (txn_size >= t.get_num_bytes()) {
8048 derr << __func__ << " transaction size overflowed" << dendl;
8049 ceph_assert(txn_size < t.get_num_bytes());
8050 }
8051 txn_size = t.get_num_bytes();
8052 map<epoch_t,bufferlist>::iterator p;
8053 p = m->maps.find(e);
8054 if (p != m->maps.end()) {
8055 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8056 OSDMap *o = new OSDMap;
8057 bufferlist& bl = p->second;
8058
8059 o->decode(bl);
8060
8061 purged_snaps[e] = o->get_new_purged_snaps();
8062
8063 ghobject_t fulloid = get_osdmap_pobject_name(e);
8064 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8065 added_maps[e] = add_map(o);
8066 added_maps_bl[e] = bl;
8067 got_full_map(e);
8068 continue;
8069 }
8070
8071 p = m->incremental_maps.find(e);
8072 if (p != m->incremental_maps.end()) {
8073 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8074 bufferlist& bl = p->second;
8075 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8076 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8077
8078 OSDMap *o = new OSDMap;
8079 if (e > 1) {
8080 bufferlist obl;
8081 bool got = get_map_bl(e - 1, obl);
8082 if (!got) {
8083 auto p = added_maps_bl.find(e - 1);
8084 ceph_assert(p != added_maps_bl.end());
8085 obl = p->second;
8086 }
8087 o->decode(obl);
8088 }
8089
8090 OSDMap::Incremental inc;
8091 auto p = bl.cbegin();
8092 inc.decode(p);
8093
8094 if (o->apply_incremental(inc) < 0) {
8095 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8096 ceph_abort_msg("bad fsid");
8097 }
8098
8099 bufferlist fbl;
8100 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8101
8102 bool injected_failure = false;
8103 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8104 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8105 derr << __func__ << " injecting map crc failure" << dendl;
8106 injected_failure = true;
8107 }
8108
8109 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8110 dout(2) << "got incremental " << e
8111 << " but failed to encode full with correct crc; requesting"
8112 << dendl;
8113 clog->warn() << "failed to encode map e" << e << " with expected crc";
8114 dout(20) << "my encoded map was:\n";
8115 fbl.hexdump(*_dout);
8116 *_dout << dendl;
8117 delete o;
8118 request_full_map(e, last);
8119 last = e - 1;
8120
8121 // don't continue committing if we failed to encode the first inc map
8122 if (last < start) {
8123 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8124 m->put();
8125 return;
8126 }
8127 break;
8128 }
8129 got_full_map(e);
8130 purged_snaps[e] = o->get_new_purged_snaps();
8131
8132 ghobject_t fulloid = get_osdmap_pobject_name(e);
8133 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8134 added_maps[e] = add_map(o);
8135 added_maps_bl[e] = fbl;
8136 continue;
8137 }
8138
8139 ceph_abort_msg("MOSDMap lied about what maps it had?");
8140 }
8141
8142 // even if this map isn't from a mon, we may have satisfied our subscription
8143 monc->sub_got("osdmap", last);
8144
8145 if (!m->maps.empty() && requested_full_first) {
8146 dout(10) << __func__ << " still missing full maps " << requested_full_first
8147 << ".." << requested_full_last << dendl;
8148 rerequest_full_maps();
8149 }
8150
8151 if (superblock.oldest_map) {
8152 // make sure we at least keep pace with incoming maps
8153 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8154 pg_num_history.prune(superblock.oldest_map);
8155 }
8156
8157 if (!superblock.oldest_map || skip_maps)
8158 superblock.oldest_map = first;
8159 superblock.newest_map = last;
8160 superblock.current_epoch = last;
8161
8162 // note in the superblock that we were clean thru the prior epoch
8163 epoch_t boot_epoch = service.get_boot_epoch();
8164 if (boot_epoch && boot_epoch >= superblock.mounted) {
8165 superblock.mounted = boot_epoch;
8166 superblock.clean_thru = last;
8167 }
8168
8169 // check for pg_num changes and deleted pools
8170 OSDMapRef lastmap;
8171 for (auto& i : added_maps) {
8172 if (!lastmap) {
8173 if (!(lastmap = service.try_get_map(i.first - 1))) {
8174 dout(10) << __func__ << " can't get previous map " << i.first - 1
8175 << " probably first start of this osd" << dendl;
8176 continue;
8177 }
8178 }
8179 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8180 for (auto& j : lastmap->get_pools()) {
8181 if (!i.second->have_pg_pool(j.first)) {
8182 pg_num_history.log_pool_delete(i.first, j.first);
8183 dout(10) << __func__ << " recording final pg_pool_t for pool "
8184 << j.first << dendl;
8185 // this information is needed by _make_pg() if we have to restart before
8186 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8187 ghobject_t obj = make_final_pool_info_oid(j.first);
8188 bufferlist bl;
8189 encode(j.second, bl, CEPH_FEATURES_ALL);
8190 string name = lastmap->get_pool_name(j.first);
8191 encode(name, bl);
8192 map<string,string> profile;
8193 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8194 profile = lastmap->get_erasure_code_profile(
8195 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8196 }
8197 encode(profile, bl);
8198 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8199 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8200 new_pg_num != j.second.get_pg_num()) {
8201 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8202 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8203 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8204 }
8205 }
8206 for (auto& j : i.second->get_pools()) {
8207 if (!lastmap->have_pg_pool(j.first)) {
8208 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8209 << j.second.get_pg_num() << dendl;
8210 pg_num_history.log_pg_num_change(i.first, j.first,
8211 j.second.get_pg_num());
8212 }
8213 }
8214 lastmap = i.second;
8215 }
8216 pg_num_history.epoch = last;
8217 {
8218 bufferlist bl;
8219 ::encode(pg_num_history, bl);
8220 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8221 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8222 }
8223
8224 // record new purged_snaps
8225 if (superblock.purged_snaps_last == start - 1) {
8226 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
8227 make_purged_snaps_oid(), &t,
8228 purged_snaps);
8229 superblock.purged_snaps_last = last;
8230 } else {
8231 dout(10) << __func__ << " superblock purged_snaps_last is "
8232 << superblock.purged_snaps_last
8233 << ", not recording new purged_snaps" << dendl;
8234 }
8235
8236 // superblock and commit
8237 write_superblock(t);
8238 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8239 store->queue_transaction(
8240 service.meta_ch,
8241 std::move(t));
8242 service.publish_superblock(superblock);
8243 }
8244
8245 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8246 {
8247 dout(10) << __func__ << " " << first << ".." << last << dendl;
8248 if (is_stopping()) {
8249 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8250 return;
8251 }
8252 std::lock_guard l(osd_lock);
8253 if (is_stopping()) {
8254 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8255 return;
8256 }
8257 map_lock.lock();
8258
8259 ceph_assert(first <= last);
8260
8261 bool do_shutdown = false;
8262 bool do_restart = false;
8263 bool network_error = false;
8264 OSDMapRef osdmap = get_osdmap();
8265
8266 // advance through the new maps
8267 for (epoch_t cur = first; cur <= last; cur++) {
8268 dout(10) << " advance to epoch " << cur
8269 << " (<= last " << last
8270 << " <= newest_map " << superblock.newest_map
8271 << ")" << dendl;
8272
8273 OSDMapRef newmap = get_map(cur);
8274 ceph_assert(newmap); // we just cached it above!
8275
8276 // start blocklisting messages sent to peers that go down.
8277 service.pre_publish_map(newmap);
8278
8279 // kill connections to newly down osds
8280 bool waited_for_reservations = false;
8281 set<int> old;
8282 osdmap = get_osdmap();
8283 osdmap->get_all_osds(old);
8284 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8285 if (*p != whoami &&
8286 osdmap->is_up(*p) && // in old map
8287 newmap->is_down(*p)) { // but not the new one
8288 if (!waited_for_reservations) {
8289 service.await_reserved_maps();
8290 waited_for_reservations = true;
8291 }
8292 note_down_osd(*p);
8293 } else if (*p != whoami &&
8294 osdmap->is_down(*p) &&
8295 newmap->is_up(*p)) {
8296 note_up_osd(*p);
8297 }
8298 }
8299
8300 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
8301 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8302 << dendl;
8303 if (is_booting()) {
8304 // this captures the case where we sent the boot message while
8305 // NOUP was being set on the mon and our boot request was
8306 // dropped, and then later it is cleared. it imperfectly
8307 // handles the case where our original boot message was not
8308 // dropped and we restart even though we might have booted, but
8309 // that is harmless (boot will just take slightly longer).
8310 do_restart = true;
8311 }
8312 }
8313
8314 osdmap = std::move(newmap);
8315 set_osdmap(osdmap);
8316 epoch_t up_epoch;
8317 epoch_t boot_epoch;
8318 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8319 if (!up_epoch &&
8320 osdmap->is_up(whoami) &&
8321 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
8322 up_epoch = osdmap->get_epoch();
8323 dout(10) << "up_epoch is " << up_epoch << dendl;
8324 if (!boot_epoch) {
8325 boot_epoch = osdmap->get_epoch();
8326 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8327 }
8328 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8329 }
8330 }
8331
8332 epoch_t _bind_epoch = service.get_bind_epoch();
8333 if (osdmap->is_up(whoami) &&
8334 osdmap->get_addrs(whoami).legacy_equals(
8335 client_messenger->get_myaddrs()) &&
8336 _bind_epoch < osdmap->get_up_from(whoami)) {
8337
8338 if (is_booting()) {
8339 dout(1) << "state: booting -> active" << dendl;
8340 set_state(STATE_ACTIVE);
8341 do_restart = false;
8342
8343 // set incarnation so that osd_reqid_t's we generate for our
8344 // objecter requests are unique across restarts.
8345 service.objecter->set_client_incarnation(osdmap->get_epoch());
8346 cancel_pending_failures();
8347 }
8348 }
8349
8350 if (osdmap->get_epoch() > 0 &&
8351 is_active()) {
8352 if (!osdmap->exists(whoami)) {
8353 derr << "map says i do not exist. shutting down." << dendl;
8354 do_shutdown = true; // don't call shutdown() while we have
8355 // everything paused
8356 } else if (osdmap->is_stop(whoami)) {
8357 derr << "map says i am stopped by admin. shutting down." << dendl;
8358 do_shutdown = true;
8359 } else if (!osdmap->is_up(whoami) ||
8360 !osdmap->get_addrs(whoami).legacy_equals(
8361 client_messenger->get_myaddrs()) ||
8362 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8363 cluster_messenger->get_myaddrs()) ||
8364 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8365 hb_back_server_messenger->get_myaddrs()) ||
8366 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8367 hb_front_server_messenger->get_myaddrs())) {
8368 if (!osdmap->is_up(whoami)) {
8369 if (service.is_preparing_to_stop() || service.is_stopping()) {
8370 service.got_stop_ack();
8371 } else {
8372 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8373 "but it is still running";
8374 clog->debug() << "map e" << osdmap->get_epoch()
8375 << " wrongly marked me down at e"
8376 << osdmap->get_down_at(whoami);
8377 }
8378 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8379 // note that this is best-effort...
8380 monc->send_mon_message(
8381 new MOSDMarkMeDead(
8382 monc->get_fsid(),
8383 whoami,
8384 osdmap->get_epoch()));
8385 }
8386 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8387 client_messenger->get_myaddrs())) {
8388 clog->error() << "map e" << osdmap->get_epoch()
8389 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8390 << " != my " << client_messenger->get_myaddrs() << ")";
8391 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8392 cluster_messenger->get_myaddrs())) {
8393 clog->error() << "map e" << osdmap->get_epoch()
8394 << " had wrong cluster addr ("
8395 << osdmap->get_cluster_addrs(whoami)
8396 << " != my " << cluster_messenger->get_myaddrs() << ")";
8397 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8398 hb_back_server_messenger->get_myaddrs())) {
8399 clog->error() << "map e" << osdmap->get_epoch()
8400 << " had wrong heartbeat back addr ("
8401 << osdmap->get_hb_back_addrs(whoami)
8402 << " != my " << hb_back_server_messenger->get_myaddrs()
8403 << ")";
8404 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8405 hb_front_server_messenger->get_myaddrs())) {
8406 clog->error() << "map e" << osdmap->get_epoch()
8407 << " had wrong heartbeat front addr ("
8408 << osdmap->get_hb_front_addrs(whoami)
8409 << " != my " << hb_front_server_messenger->get_myaddrs()
8410 << ")";
8411 }
8412
8413 if (!service.is_stopping()) {
8414 epoch_t up_epoch = 0;
8415 epoch_t bind_epoch = osdmap->get_epoch();
8416 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8417 do_restart = true;
8418
8419 //add markdown log
8420 utime_t now = ceph_clock_now();
8421 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8422 osd_markdown_log.push_back(now);
8423 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8424 derr << __func__ << " marked down "
8425 << osd_markdown_log.size()
8426 << " > osd_max_markdown_count "
8427 << cct->_conf->osd_max_markdown_count
8428 << " in last " << grace << " seconds, shutting down"
8429 << dendl;
8430 do_restart = false;
8431 do_shutdown = true;
8432 }
8433
8434 start_waiting_for_healthy();
8435
8436 set<int> avoid_ports;
8437 #if defined(__FreeBSD__)
8438 // prevent FreeBSD from grabbing the client_messenger port during
8439 // rebinding; otherwise the cluster_messenger could also end up
8440 // connecting to the same port
8441 client_messenger->get_myaddrs().get_ports(&avoid_ports);
8442 #endif
8443 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
8444
8445 int r = cluster_messenger->rebind(avoid_ports);
8446 if (r != 0) {
8447 do_shutdown = true; // FIXME: do_restart?
8448 network_error = true;
8449 derr << __func__ << " marked down:"
8450 << " rebind cluster_messenger failed" << dendl;
8451 }
8452
8453 hb_back_server_messenger->mark_down_all();
8454 hb_front_server_messenger->mark_down_all();
8455 hb_front_client_messenger->mark_down_all();
8456 hb_back_client_messenger->mark_down_all();
8457
8458 reset_heartbeat_peers(true);
8459 }
8460 }
8461 } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
8462 derr << "map says i am stopped by admin. shutting down." << dendl;
8463 do_shutdown = true;
8464 }
8465
8466 map_lock.unlock();
8467
8468 check_osdmap_features();
8469
8470 // yay!
8471 consume_map();
8472
8473 if (is_active() || is_waiting_for_healthy())
8474 maybe_update_heartbeat_peers();
8475
8476 if (is_active()) {
8477 activate_map();
8478 }
8479
8480 if (do_shutdown) {
8481 if (network_error) {
8482 cancel_pending_failures();
8483 }
8484 // trigger shutdown in a different thread
8485 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8486 queue_async_signal(SIGINT);
8487 }
8488 else if (m->newest_map && m->newest_map > last) {
8489 dout(10) << " msg says newest map is " << m->newest_map
8490 << ", requesting more" << dendl;
8491 osdmap_subscribe(osdmap->get_epoch()+1, false);
8492 }
8493 else if (is_preboot()) {
8494 if (m->get_source().is_mon())
8495 _preboot(m->oldest_map, m->newest_map);
8496 else
8497 start_boot();
8498 }
8499 else if (do_restart)
8500 start_boot();
8501
8502 }
8503
8504 void OSD::check_osdmap_features()
8505 {
8506 // adjust required feature bits?
8507
8508 // we have to be a bit careful here, because we are accessing the
8509 // Policy structures without taking any lock. in particular, only
8510 // modify integer values that can safely be read by a racing CPU.
8511 // since we are only accessing existing Policy structures at their
8512 // current memory location, and setting or clearing bits in integer
8513 // fields, and we are the only writer, this is not a problem.
8514
8515 const auto osdmap = get_osdmap();
8516 {
8517 Messenger::Policy p = client_messenger->get_default_policy();
8518 uint64_t mask;
8519 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8520 if ((p.features_required & mask) != features) {
8521 dout(0) << "crush map has features " << features
8522 << ", adjusting msgr requires for clients" << dendl;
8523 p.features_required = (p.features_required & ~mask) | features;
8524 client_messenger->set_default_policy(p);
8525 }
8526 }
8527 {
8528 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8529 uint64_t mask;
8530 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8531 if ((p.features_required & mask) != features) {
8532 dout(0) << "crush map has features " << features
8533 << " was " << p.features_required
8534 << ", adjusting msgr requires for mons" << dendl;
8535 p.features_required = (p.features_required & ~mask) | features;
8536 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8537 }
8538 }
8539 {
8540 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8541 uint64_t mask;
8542 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8543
8544 if ((p.features_required & mask) != features) {
8545 dout(0) << "crush map has features " << features
8546 << ", adjusting msgr requires for osds" << dendl;
8547 p.features_required = (p.features_required & ~mask) | features;
8548 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8549 }
8550
8551 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8552 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8553 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8554 ObjectStore::Transaction t;
8555 write_superblock(t);
8556 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8557 ceph_assert(err == 0);
8558 }
8559 }
8560
8561 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8562 hb_front_server_messenger->set_require_authorizer(false);
8563 hb_back_server_messenger->set_require_authorizer(false);
8564 } else {
8565 hb_front_server_messenger->set_require_authorizer(true);
8566 hb_back_server_messenger->set_require_authorizer(true);
8567 }
8568
8569 if (osdmap->require_osd_release != last_require_osd_release) {
8570 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8571 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8572 store->write_meta("require_osd_release",
8573 stringify((int)osdmap->require_osd_release));
8574 last_require_osd_release = osdmap->require_osd_release;
8575 }
8576 }
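// [editor's note] the policy adjustment in check_osdmap_features() is plain
// bit surgery. A worked example with hypothetical 8-bit values: if
// p.features_required is 0b1010, mask is 0b0110, and the map's features are
// 0b0100, then (required & ~mask) | features == (0b1010 & 0b1001) | 0b0100
// == 0b1100: bits outside the mask are preserved, bits inside it are
// replaced wholesale.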
8577
8578 struct C_FinishSplits : public Context {
8579 OSD *osd;
8580 set<PGRef> pgs;
8581 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8582 : osd(osd), pgs(in) {}
8583 void finish(int r) override {
8584 osd->_finish_splits(pgs);
8585 }
8586 };
8587
8588 void OSD::_finish_splits(set<PGRef>& pgs)
8589 {
8590 dout(10) << __func__ << " " << pgs << dendl;
8591 if (is_stopping())
8592 return;
8593 for (set<PGRef>::iterator i = pgs.begin();
8594 i != pgs.end();
8595 ++i) {
8596 PG *pg = i->get();
8597
8598 PeeringCtx rctx;
8599 pg->lock();
8600 dout(10) << __func__ << " " << *pg << dendl;
8601 epoch_t e = pg->get_osdmap_epoch();
8602 pg->handle_initialize(rctx);
8603 pg->queue_null(e, e);
8604 dispatch_context(rctx, pg, service.get_osdmap());
8605 pg->unlock();
8606
8607 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8608 shards[shard_index]->register_and_wake_split_child(pg);
8609 }
8610 }
8611
8612 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8613 unsigned need)
8614 {
8615 std::lock_guard l(merge_lock);
8616 auto& p = merge_waiters[nextmap->get_epoch()][target];
8617 p[src->pg_id] = src;
8618 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8619 << " for " << target << ", have " << p.size() << "/" << need
8620 << dendl;
8621 return p.size() == need;
8622 }
8623
8624 bool OSD::advance_pg(
8625 epoch_t osd_epoch,
8626 PG *pg,
8627 ThreadPool::TPHandle &handle,
8628 PeeringCtx &rctx)
8629 {
8630 if (osd_epoch <= pg->get_osdmap_epoch()) {
8631 return true;
8632 }
8633 ceph_assert(pg->is_locked());
8634 OSDMapRef lastmap = pg->get_osdmap();
8635 set<PGRef> new_pgs; // any split children
8636 bool ret = true;
8637
8638 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8639 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8640 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8641 next_epoch <= osd_epoch;
8642 ++next_epoch) {
8643 OSDMapRef nextmap = service.try_get_map(next_epoch);
8644 if (!nextmap) {
8645 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8646 continue;
8647 }
8648
8649 unsigned new_pg_num =
8650 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8651 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8652 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8653 // check for merge
8654 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8655 spg_t parent;
8656 if (pg->pg_id.is_merge_source(
8657 old_pg_num,
8658 new_pg_num,
8659 &parent)) {
8660 // we are merge source
8661 PGRef spg = pg; // carry a ref
8662 dout(1) << __func__ << " " << pg->pg_id
8663 << " is merge source, target is " << parent
8664 << dendl;
8665 pg->write_if_dirty(rctx);
8666 if (!new_pgs.empty()) {
8667 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8668 new_pgs));
8669 new_pgs.clear();
8670 }
8671 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8672 pg->ch->flush();
8673 // release backoffs explicitly, since the on_shutdown path
8674 // aggressively tears down backoff state.
8675 if (pg->is_primary()) {
8676 pg->release_pg_backoffs();
8677 }
8678 pg->on_shutdown();
8679 OSDShard *sdata = pg->osd_shard;
8680 {
8681 std::lock_guard l(sdata->shard_lock);
8682 if (pg->pg_slot) {
8683 sdata->_detach_pg(pg->pg_slot);
8684 // update pg count now since we might not get an osdmap
8685 // any time soon.
8686 if (pg->is_primary())
8687 logger->dec(l_osd_pg_primary);
8688 else if (pg->is_nonprimary())
8689 logger->dec(l_osd_pg_replica); // misnomer
8690 else
8691 logger->dec(l_osd_pg_stray);
8692 }
8693 }
8694 pg->unlock();
8695
8696 set<spg_t> children;
8697 parent.is_split(new_pg_num, old_pg_num, &children);
8698 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8699 enqueue_peering_evt(
8700 parent,
8701 PGPeeringEventRef(
8702 std::make_shared<PGPeeringEvent>(
8703 nextmap->get_epoch(),
8704 nextmap->get_epoch(),
8705 NullEvt())));
8706 }
8707 ret = false;
8708 goto out;
8709 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8710 // we are merge target
8711 set<spg_t> children;
8712 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8713 dout(20) << __func__ << " " << pg->pg_id
8714 << " is merge target, sources are " << children
8715 << dendl;
8716 map<spg_t,PGRef> sources;
8717 {
8718 std::lock_guard l(merge_lock);
8719 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8720 unsigned need = children.size();
8721 dout(20) << __func__ << " have " << s.size() << "/"
8722 << need << dendl;
8723 if (s.size() == need) {
8724 sources.swap(s);
8725 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8726 if (merge_waiters[nextmap->get_epoch()].empty()) {
8727 merge_waiters.erase(nextmap->get_epoch());
8728 }
8729 }
8730 }
8731 if (!sources.empty()) {
8732 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8733 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8734 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8735 pg->merge_from(
8736 sources, rctx, split_bits,
8737 nextmap->get_pg_pool(
8738 pg->pg_id.pool())->last_pg_merge_meta);
8739 pg->pg_slot->waiting_for_merge_epoch = 0;
8740 } else {
8741 dout(20) << __func__ << " not ready to merge yet" << dendl;
8742 pg->write_if_dirty(rctx);
8743 if (!new_pgs.empty()) {
8744 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8745 new_pgs));
8746 new_pgs.clear();
8747 }
8748 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8749 pg->unlock();
8750 // kick source(s) to get them ready
8751 for (auto& i : children) {
8752 dout(20) << __func__ << " kicking source " << i << dendl;
8753 enqueue_peering_evt(
8754 i,
8755 PGPeeringEventRef(
8756 std::make_shared<PGPeeringEvent>(
8757 nextmap->get_epoch(),
8758 nextmap->get_epoch(),
8759 NullEvt())));
8760 }
8761 ret = false;
8762 goto out;
8763 }
8764 }
8765 }
8766 }
8767
8768 vector<int> newup, newacting;
8769 int up_primary, acting_primary;
8770 nextmap->pg_to_up_acting_osds(
8771 pg->pg_id.pgid,
8772 &newup, &up_primary,
8773 &newacting, &acting_primary);
8774 pg->handle_advance_map(
8775 nextmap, lastmap, newup, up_primary,
8776 newacting, acting_primary, rctx);
8777
8778 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8779 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8780 if (oldpool != lastmap->get_pools().end()
8781 && newpool != nextmap->get_pools().end()) {
8782 dout(20) << __func__
8783 << " new pool opts " << newpool->second.opts
8784 << " old pool opts " << oldpool->second.opts
8785 << dendl;
8786
8787 double old_min_interval = 0, new_min_interval = 0;
8788 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8789 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8790
8791 double old_max_interval = 0, new_max_interval = 0;
8792 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8793 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8794
8795 // Assume that if an interval changes from set to unset, or vice versa,
8796 // the effective config is different. Keep it simple, even if that means
8797 // calling resched_all_scrubs() unnecessarily.
8798 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8799 pg->on_info_history_change();
8800 }
8801 }
8802
8803 if (new_pg_num && old_pg_num != new_pg_num) {
8804 // check for split
8805 set<spg_t> children;
8806 if (pg->pg_id.is_split(
8807 old_pg_num,
8808 new_pg_num,
8809 &children)) {
8810 split_pgs(
8811 pg, children, &new_pgs, lastmap, nextmap,
8812 rctx);
8813 }
8814 }
8815
8816 lastmap = nextmap;
8817 old_pg_num = new_pg_num;
8818 handle.reset_tp_timeout();
8819 }
8820 pg->handle_activate_map(rctx);
8821
8822 ret = true;
8823 out:
8824 if (!new_pgs.empty()) {
8825 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
8826 }
8827 return ret;
8828 }
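// [editor's note] a worked example of the split/merge arithmetic that
// advance_pg() relies on, using a hypothetical pool 1: growing pg_num
// 8 -> 16 splits pg 1.3 into 1.3 and 1.b (0x3 + 8 == 0xb), so is_split()
// reports {1.b} as the children of 1.3. Shrinking 16 -> 8 reverses this:
// 1.b becomes a merge source (is_merge_source() yields parent 1.3) and 1.3
// the merge target, which waits via add_merge_waiter() until every source
// has checked in before merge_from() runs.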
8829
8830 void OSD::consume_map()
8831 {
8832 ceph_assert(ceph_mutex_is_locked(osd_lock));
8833 auto osdmap = get_osdmap();
8834 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8835
8836 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8837 * speak the older sorting version any more. Be careful not to force
8838 * a shutdown if we are merely processing old maps, though.
8839 */
8840 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8841 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8842 ceph_abort();
8843 }
8844
8845 service.pre_publish_map(osdmap);
8846 service.await_reserved_maps();
8847 service.publish_map(osdmap);
8848
8849 // prime splits and merges
8850 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8851 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8852 for (auto& shard : shards) {
8853 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8854 }
8855 if (!newly_split.empty()) {
8856 for (auto& shard : shards) {
8857 shard->prime_splits(osdmap, &newly_split);
8858 }
8859 ceph_assert(newly_split.empty());
8860 }
8861
8862 // prune sent_ready_to_merge
8863 service.prune_sent_ready_to_merge(osdmap);
8864
8865 // FIXME, maybe: We could race against an incoming peering message
8866 // that instantiates a merge PG after identify_merges() below and
8867 // never set up its peer to complete the merge. An OSD restart
8868 // would clear it up. This is a hard race to resolve,
8869 // extraordinarily rare (we only merge PGs that are stable and
8870 // clean, so it'd have to be an imported PG to an OSD with a
8871 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8872 // replace all of this with seastar-based code soon anyway.
8873 if (!merge_pgs.empty()) {
8874 // mark the pgs we already have, or create new and empty merge
8875 // participants for those we are missing. do this all under the
8876 // shard lock so we don't have to worry about racing pg creates
8877 // via _process.
8878 for (auto& shard : shards) {
8879 shard->prime_merges(osdmap, &merge_pgs);
8880 }
8881 ceph_assert(merge_pgs.empty());
8882 }
8883
8884 service.prune_pg_created();
8885
8886 unsigned pushes_to_free = 0;
8887 for (auto& shard : shards) {
8888 shard->consume_map(osdmap, &pushes_to_free);
8889 }
8890
8891 vector<spg_t> pgids;
8892 _get_pgids(&pgids);
8893
8894 // count (FIXME, probably during seastar rewrite)
8895 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8896 vector<PGRef> pgs;
8897 _get_pgs(&pgs);
8898 for (auto& pg : pgs) {
8899 // FIXME (probably during seastar rewrite): this is lockless and
8900 // racy, but we don't want to take pg lock here.
8901 if (pg->is_primary())
8902 num_pg_primary++;
8903 else if (pg->is_nonprimary())
8904 num_pg_replica++; // misnomer
8905 else
8906 num_pg_stray++;
8907 }
8908
8909 {
8910 // FIXME (as part of seastar rewrite): move to OSDShard
8911 std::lock_guard l(pending_creates_lock);
8912 for (auto pg = pending_creates_from_osd.begin();
8913 pg != pending_creates_from_osd.end();) {
8914 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
8915 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8916 << "discarding pending_create_from_osd" << dendl;
8917 pg = pending_creates_from_osd.erase(pg);
8918 } else {
8919 ++pg;
8920 }
8921 }
8922 }
8923
8924 service.maybe_inject_dispatch_delay();
8925
8926 dispatch_sessions_waiting_on_map();
8927
8928 service.maybe_inject_dispatch_delay();
8929
8930 service.release_reserved_pushes(pushes_to_free);
8931
8932 // queue null events to push maps down to individual PGs
8933 for (auto pgid : pgids) {
8934 enqueue_peering_evt(
8935 pgid,
8936 PGPeeringEventRef(
8937 std::make_shared<PGPeeringEvent>(
8938 osdmap->get_epoch(),
8939 osdmap->get_epoch(),
8940 NullEvt())));
8941 }
8942 logger->set(l_osd_pg, pgids.size());
8943 logger->set(l_osd_pg_primary, num_pg_primary);
8944 logger->set(l_osd_pg_replica, num_pg_replica);
8945 logger->set(l_osd_pg_stray, num_pg_stray);
8946 }
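// The NullEvt kick used above is the generic way to nudge a PG into
// re-examining the latest published OSDMap; advance_pg() then does the real
// work of walking the PG from its current epoch up to the new one. A minimal
// sketch of the pattern (hypothetical call site; `pgid` and `osdmap` are
// assumed to be in scope):
//
//   epoch_t e = osdmap->get_epoch();
//   enqueue_peering_evt(
//     pgid,
//     PGPeeringEventRef(
//       std::make_shared<PGPeeringEvent>(e, e, NullEvt())));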
8947
8948 void OSD::activate_map()
8949 {
8950 ceph_assert(ceph_mutex_is_locked(osd_lock));
8951 auto osdmap = get_osdmap();
8952
8953 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8954
8955 // norecover?
8956 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8957 if (!service.recovery_is_paused()) {
8958 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8959 service.pause_recovery();
8960 }
8961 } else {
8962 if (service.recovery_is_paused()) {
8963 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8964 service.unpause_recovery();
8965 }
8966 }
8967
8968 service.activate_map();
8969
8970 // process waiters
8971 take_waiters(waiting_for_osdmap);
8972 }
8973
8974 bool OSD::require_mon_peer(const Message *m)
8975 {
8976 if (!m->get_connection()->peer_is_mon()) {
8977 dout(0) << "require_mon_peer received from non-mon "
8978 << m->get_connection()->get_peer_addr()
8979 << " " << *m << dendl;
8980 return false;
8981 }
8982 return true;
8983 }
8984
8985 bool OSD::require_mon_or_mgr_peer(const Message *m)
8986 {
8987 if (!m->get_connection()->peer_is_mon() &&
8988 !m->get_connection()->peer_is_mgr()) {
8989 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8990 << m->get_connection()->get_peer_addr()
8991 << " " << *m << dendl;
8992 return false;
8993 }
8994 return true;
8995 }
8996
8997 bool OSD::require_osd_peer(const Message *m)
8998 {
8999 if (!m->get_connection()->peer_is_osd()) {
9000 dout(0) << "require_osd_peer received from non-osd "
9001 << m->get_connection()->get_peer_addr()
9002 << " " << *m << dendl;
9003 return false;
9004 }
9005 return true;
9006 }
9007
9008 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9009 {
9010 epoch_t up_epoch = service.get_up_epoch();
9011 if (epoch < up_epoch) {
9012 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9013 return false;
9014 }
9015
9016 if (!is_active()) {
9017 dout(7) << "still in boot state, dropping message " << *m << dendl;
9018 return false;
9019 }
9020
9021 return true;
9022 }
9023
9024 bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
9025 bool is_fast_dispatch)
9026 {
9027 int from = m->get_source().num();
9028
9029 if (map->is_down(from) ||
9030 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9031 dout(5) << "from dead osd." << from << ", marking down, "
9032 << " msg was " << m->get_source_inst().addr
9033 << " expected "
9034 << (map->is_up(from) ?
9035 map->get_cluster_addrs(from) : entity_addrvec_t())
9036 << dendl;
9037 ConnectionRef con = m->get_connection();
9038 con->mark_down();
9039 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
9040 if (!is_fast_dispatch)
9041 s->session_dispatch_lock.lock();
9042 clear_session_waiting_on_map(s);
9043 con->set_priv(nullptr); // break ref <-> session cycle, if any
9044 s->con.reset();
9045 if (!is_fast_dispatch)
9046 s->session_dispatch_lock.unlock();
9047 }
9048 return false;
9049 }
9050 return true;
9051 }
9052
9053
9054 /*
9055 * require that we have the same (or newer) map, that we are still
9056 * alive, and that the source's peer instance is still valid.
9057 */
9058 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9059 bool is_fast_dispatch)
9060 {
9061 const Message *m = op->get_req();
9062 const auto osdmap = get_osdmap();
9063 dout(15) << "require_same_or_newer_map " << epoch
9064 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9065
9066 ceph_assert(ceph_mutex_is_locked(osd_lock));
9067
9068 // do they have a newer map?
9069 if (epoch > osdmap->get_epoch()) {
9070 dout(7) << "waiting for newer map epoch " << epoch
9071 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9072 wait_for_new_map(op);
9073 return false;
9074 }
9075
9076 if (!require_self_aliveness(op->get_req(), epoch)) {
9077 return false;
9078 }
9079
9080 // ok, our map is same or newer.. do they still exist?
9081 if (m->get_connection()->get_messenger() == cluster_messenger &&
9082 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9083 return false;
9084 }
9085
9086 return true;
9087 }
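// Sketch of how these require_* guards compose in a slow-dispatch handler
// (this mirrors handle_pg_create() below; it is not a new code path):
//
//   if (!require_mon_peer(op->get_req()))
//     return;                                   // wrong peer type
//   if (!require_same_or_newer_map(op, m->epoch, false))
//     return;                                   // queued to wait for a map
//   op->mark_started();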
9088
9089
9090
9091
9092
9093 // ----------------------------------------
9094 // pg creation
9095
9096 void OSD::split_pgs(
9097 PG *parent,
9098 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9099 OSDMapRef curmap,
9100 OSDMapRef nextmap,
9101 PeeringCtx &rctx)
9102 {
9103 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9104 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9105
9106 vector<object_stat_sum_t> updated_stats;
9107 parent->start_split_stats(childpgids, &updated_stats);
9108
9109 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9110 for (set<spg_t>::const_iterator i = childpgids.begin();
9111 i != childpgids.end();
9112 ++i, ++stat_iter) {
9113 ceph_assert(stat_iter != updated_stats.end());
9114 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9115 PG* child = _make_pg(nextmap, *i);
9116 child->lock(true);
9117 out_pgs->insert(child);
9118 child->ch = store->create_new_collection(child->coll);
9119
9120 {
9121 uint32_t shard_index = i->hash_to_shard(shards.size());
9122 assert(NULL != shards[shard_index]);
9123 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9124 }
9125
9126 unsigned split_bits = i->get_split_bits(pg_num);
9127 dout(10) << " pg_num is " << pg_num
9128 << ", m_seed " << i->ps()
9129 << ", split_bits is " << split_bits << dendl;
9130 parent->split_colls(
9131 *i,
9132 split_bits,
9133 i->ps(),
9134 &child->get_pool().info,
9135 rctx.transaction);
9136 parent->split_into(
9137 i->pgid,
9138 child,
9139 split_bits);
9140
9141 child->init_collection_pool_opts();
9142
9143 child->finish_split_stats(*stat_iter, rctx.transaction);
9144 child->unlock();
9145 }
9146 ceph_assert(stat_iter != updated_stats.end());
9147 parent->finish_split_stats(*stat_iter, rctx.transaction);
9148 }
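// Sketch of how a split is detected before split_pgs() is invoked (this
// mirrors the advance_pg() logic earlier in this file; old_pg_num and
// new_pg_num come from two consecutive OSDMaps):
//
//   set<spg_t> children;
//   if (pg->pg_id.is_split(old_pg_num, new_pg_num, &children)) {
//     split_pgs(pg, children, &new_pgs, lastmap, nextmap, rctx);
//   }
//
// Each child then claims get_split_bits(pg_num) bits of the hash space, and
// split_colls() moves the matching objects into the child's new collection.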
9149
9150 /*
9151 * holding osd_lock
9152 */
9153 void OSD::handle_pg_create(OpRequestRef op)
9154 {
9155 // NOTE: this can be removed in P release (mimic is the last version to
9156 // send MOSDPGCreate messages).
9157
9158 auto m = op->get_req<MOSDPGCreate>();
9159 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9160
9161 dout(10) << "handle_pg_create " << *m << dendl;
9162
9163 if (!require_mon_peer(op->get_req())) {
9164 return;
9165 }
9166
9167 if (!require_same_or_newer_map(op, m->epoch, false))
9168 return;
9169
9170 op->mark_started();
9171
9172 const auto osdmap = get_osdmap();
9173 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9174 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9175 p != m->mkpg.end();
9176 ++p, ++ci) {
9177 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9178 epoch_t created = p->second.created;
9179 if (p->second.split_bits) // Skip split pgs
9180 continue;
9181 pg_t on = p->first;
9182
9183 if (!osdmap->have_pg_pool(on.pool())) {
9184 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9185 continue;
9186 }
9187
9188 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9189
9190 spg_t pgid;
9191 bool mapped = osdmap->get_primary_shard(on, &pgid);
9192 ceph_assert(mapped);
9193
9194 // is it still ours?
9195 vector<int> up, acting;
9196 int up_primary = -1;
9197 int acting_primary = -1;
9198 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9199 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9200
9201 if (acting_primary != whoami) {
9202 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9203 << "), my role=" << role << ", skipping" << dendl;
9204 continue;
9205 }
9206
9207
9208 PastIntervals pi;
9209 pg_history_t history;
9210 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9211
9212 // The mon won't resend unless the primary changed, so we ignore
9213 // same_interval_since. We'll pass this history with the current
9214 // epoch as the event.
9215 if (history.same_primary_since > m->epoch) {
9216 dout(10) << __func__ << ": got obsolete pg create on pgid "
9217 << pgid << " from epoch " << m->epoch
9218 << ", primary changed in " << history.same_primary_since
9219 << dendl;
9220 continue;
9221 }
9222 enqueue_peering_evt(
9223 pgid,
9224 PGPeeringEventRef(
9225 std::make_shared<PGPeeringEvent>(
9226 osdmap->get_epoch(),
9227 osdmap->get_epoch(),
9228 NullEvt(),
9229 true,
9230 new PGCreateInfo(
9231 pgid,
9232 osdmap->get_epoch(),
9233 history,
9234 pi,
9235 true)
9236 )));
9237 }
9238
9239 {
9240 std::lock_guard l(pending_creates_lock);
9241 if (pending_creates_from_mon == 0) {
9242 last_pg_create_epoch = m->epoch;
9243 }
9244 }
9245
9246 maybe_update_heartbeat_peers();
9247 }
9248
9249
9250 // ----------------------------------------
9251 // peering and recovery
9252
9253 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9254 ThreadPool::TPHandle *handle)
9255 {
9256 if (!service.get_osdmap()->is_up(whoami)) {
9257 dout(20) << __func__ << " not up in osdmap" << dendl;
9258 } else if (!is_active()) {
9259 dout(20) << __func__ << " not active" << dendl;
9260 } else {
9261 for (auto& [osd, ls] : ctx.message_map) {
9262 if (!curmap->is_up(osd)) {
9263 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9264 continue;
9265 }
9266 ConnectionRef con = service.get_con_osd_cluster(
9267 osd, curmap->get_epoch());
9268 if (!con) {
9269 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9270 << dendl;
9271 continue;
9272 }
9273 service.maybe_share_map(con.get(), curmap);
9274 for (auto m : ls) {
9275 con->send_message2(m);
9276 }
9277 ls.clear();
9278 }
9279 }
9280 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9281 int tr = store->queue_transaction(
9282 pg->ch,
9283 std::move(ctx.transaction), TrackedOpRef(),
9284 handle);
9285 ceph_assert(tr == 0);
9286 }
9287 }
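// PeeringCtx lifecycle (sketch): callers accumulate outgoing messages in
// ctx.message_map and store updates in ctx.transaction while holding the pg
// lock, then flush both through dispatch_context(). Mirroring
// dequeue_peering_evt() below:
//
//   PeeringCtx rctx;
//   pg->do_peering_event(evt, rctx);   // may queue messages and txn ops
//   dispatch_context(rctx, pg, curmap, &handle);
//   pg->unlock();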
9288
9289 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9290 {
9291 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9292 if (!require_mon_peer(m)) {
9293 m->put();
9294 return;
9295 }
9296 for (auto& p : m->pgs) {
9297 spg_t pgid = p.first;
9298 epoch_t created = p.second.first;
9299 utime_t created_stamp = p.second.second;
9300 auto q = m->pg_extra.find(pgid);
9301 if (q == m->pg_extra.end()) {
9302 dout(20) << __func__ << " " << pgid << " e" << created
9303 << "@" << created_stamp
9304 << " (no history or past_intervals)" << dendl;
9305 // pre-octopus ... no pg history. this can be removed in Q release.
9306 enqueue_peering_evt(
9307 pgid,
9308 PGPeeringEventRef(
9309 std::make_shared<PGPeeringEvent>(
9310 m->epoch,
9311 m->epoch,
9312 NullEvt(),
9313 true,
9314 new PGCreateInfo(
9315 pgid,
9316 created,
9317 pg_history_t(created, created_stamp),
9318 PastIntervals(),
9319 true)
9320 )));
9321 } else {
9322 dout(20) << __func__ << " " << pgid << " e" << created
9323 << "@" << created_stamp
9324 << " history " << q->second.first
9325 << " pi " << q->second.second << dendl;
9326 if (!q->second.second.empty() &&
9327 m->epoch < q->second.second.get_bounds().second) {
9328 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9329 << " and unmatched past_intervals " << q->second.second
9330 << " (history " << q->second.first << ")";
9331 } else {
9332 enqueue_peering_evt(
9333 pgid,
9334 PGPeeringEventRef(
9335 std::make_shared<PGPeeringEvent>(
9336 m->epoch,
9337 m->epoch,
9338 NullEvt(),
9339 true,
9340 new PGCreateInfo(
9341 pgid,
9342 m->epoch,
9343 q->second.first,
9344 q->second.second,
9345 true)
9346 )));
9347 }
9348 }
9349 }
9350
9351 {
9352 std::lock_guard l(pending_creates_lock);
9353 if (pending_creates_from_mon == 0) {
9354 last_pg_create_epoch = m->epoch;
9355 }
9356 }
9357
9358 m->put();
9359 }
9360
9361 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9362 {
9363 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9364 if (!require_osd_peer(m)) {
9365 m->put();
9366 return;
9367 }
9368 int from = m->get_source().num();
9369 for (auto& p : m->get_pg_list()) {
9370 spg_t pgid(p.info.pgid.pgid, p.to);
9371 enqueue_peering_evt(
9372 pgid,
9373 PGPeeringEventRef(
9374 std::make_shared<PGPeeringEvent>(
9375 p.epoch_sent,
9376 p.query_epoch,
9377 MNotifyRec(
9378 pgid, pg_shard_t(from, p.from),
9379 p,
9380 m->get_connection()->get_features()),
9381 true,
9382 new PGCreateInfo(
9383 pgid,
9384 p.query_epoch,
9385 p.info.history,
9386 p.past_intervals,
9387 false)
9388 )));
9389 }
9390 m->put();
9391 }
9392
9393 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9394 {
9395 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9396 if (!require_osd_peer(m)) {
9397 m->put();
9398 return;
9399 }
9400 int from = m->get_source().num();
9401 for (auto& p : m->pg_list) {
9402 enqueue_peering_evt(
9403 spg_t(p.info.pgid.pgid, p.to),
9404 PGPeeringEventRef(
9405 std::make_shared<PGPeeringEvent>(
9406 p.epoch_sent, p.query_epoch,
9407 MInfoRec(
9408 pg_shard_t(from, p.from),
9409 p.info,
9410 p.epoch_sent)))
9411 );
9412 }
9413 m->put();
9414 }
9415
9416 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9417 {
9418 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9419 if (!require_osd_peer(m)) {
9420 m->put();
9421 return;
9422 }
9423 for (auto& pgid : m->pg_list) {
9424 enqueue_peering_evt(
9425 pgid,
9426 PGPeeringEventRef(
9427 std::make_shared<PGPeeringEvent>(
9428 m->get_epoch(), m->get_epoch(),
9429 PeeringState::DeleteStart())));
9430 }
9431 m->put();
9432 }
9433
9434 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9435 {
9436 dout(10) << __func__ << " " << *m << dendl;
9437 if (!require_mon_or_mgr_peer(m)) {
9438 m->put();
9439 return;
9440 }
9441 epoch_t epoch = get_osdmap_epoch();
9442 for (auto pgid : m->forced_pgs) {
9443 if (m->options & OFR_BACKFILL) {
9444 if (m->options & OFR_CANCEL) {
9445 enqueue_peering_evt(
9446 pgid,
9447 PGPeeringEventRef(
9448 std::make_shared<PGPeeringEvent>(
9449 epoch, epoch,
9450 PeeringState::UnsetForceBackfill())));
9451 } else {
9452 enqueue_peering_evt(
9453 pgid,
9454 PGPeeringEventRef(
9455 std::make_shared<PGPeeringEvent>(
9456 epoch, epoch,
9457 PeeringState::SetForceBackfill())));
9458 }
9459 } else if (m->options & OFR_RECOVERY) {
9460 if (m->options & OFR_CANCEL) {
9461 enqueue_peering_evt(
9462 pgid,
9463 PGPeeringEventRef(
9464 std::make_shared<PGPeeringEvent>(
9465 epoch, epoch,
9466 PeeringState::UnsetForceRecovery())));
9467 } else {
9468 enqueue_peering_evt(
9469 pgid,
9470 PGPeeringEventRef(
9471 std::make_shared<PGPeeringEvent>(
9472 epoch, epoch,
9473 PeeringState::SetForceRecovery())));
9474 }
9475 }
9476 }
9477 m->put();
9478 }
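// Summary of the flag combinations handled above (OFR_* bits carried by
// MOSDForceRecovery):
//
//   OFR_BACKFILL               -> SetForceBackfill
//   OFR_BACKFILL | OFR_CANCEL  -> UnsetForceBackfill
//   OFR_RECOVERY               -> SetForceRecovery
//   OFR_RECOVERY | OFR_CANCEL  -> UnsetForceRecovery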
9479
9480 void OSD::handle_pg_query_nopg(const MQuery& q)
9481 {
9482 spg_t pgid = q.pgid;
9483 dout(10) << __func__ << " " << pgid << dendl;
9484
9485 OSDMapRef osdmap = get_osdmap();
9486 if (!osdmap->have_pg_pool(pgid.pool()))
9487 return;
9488
9489 dout(10) << " pg " << pgid << " dne" << dendl;
9490 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9491 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9492 if (con) {
9493 Message *m;
9494 if (q.query.type == pg_query_t::LOG ||
9495 q.query.type == pg_query_t::FULLLOG) {
9496 m = new MOSDPGLog(
9497 q.query.from, q.query.to,
9498 osdmap->get_epoch(), empty,
9499 q.query.epoch_sent);
9500 } else {
9501 pg_notify_t notify{q.query.from, q.query.to,
9502 q.query.epoch_sent,
9503 osdmap->get_epoch(),
9504 empty,
9505 PastIntervals()};
9506 m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
9507 std::move(notify));
9508 }
9509 service.maybe_share_map(con.get(), osdmap);
9510 con->send_message(m);
9511 }
9512 }
9513
9514 void OSDService::queue_check_readable(spg_t spgid,
9515 epoch_t lpr,
9516 ceph::signedspan delay)
9517 {
9518 if (delay == ceph::signedspan::zero()) {
9519 osd->enqueue_peering_evt(
9520 spgid,
9521 PGPeeringEventRef(
9522 std::make_shared<PGPeeringEvent>(
9523 lpr, lpr,
9524 PeeringState::CheckReadable())));
9525 } else {
9526 mono_timer.add_event(
9527 delay,
9528 [this, spgid, lpr]() {
9529 queue_check_readable(spgid, lpr);
9530 });
9531 }
9532 }
9533
9534
9535 // =========================================================
9536 // RECOVERY
9537
9538 void OSDService::_maybe_queue_recovery() {
9539 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9540 uint64_t available_pushes;
9541 while (!awaiting_throttle.empty() &&
9542 _recover_now(&available_pushes)) {
9543 uint64_t to_start = std::min(
9544 available_pushes,
9545 cct->_conf->osd_recovery_max_single_start);
9546 _queue_for_recovery(awaiting_throttle.front(), to_start);
9547 awaiting_throttle.pop_front();
9548 dout(10) << __func__ << " starting " << to_start
9549 << ", recovery_ops_reserved " << recovery_ops_reserved
9550 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9551 recovery_ops_reserved += to_start;
9552 }
9553 }
9554
9555 bool OSDService::_recover_now(uint64_t *available_pushes)
9556 {
9557 if (available_pushes)
9558 *available_pushes = 0;
9559
9560 if (ceph_clock_now() < defer_recovery_until) {
9561 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9562 return false;
9563 }
9564
9565 if (recovery_paused) {
9566 dout(15) << __func__ << " paused" << dendl;
9567 return false;
9568 }
9569
9570 uint64_t max = osd->get_recovery_max_active();
9571 if (max <= recovery_ops_active + recovery_ops_reserved) {
9572 dout(15) << __func__ << " active " << recovery_ops_active
9573 << " + reserved " << recovery_ops_reserved
9574 << " >= max " << max << dendl;
9575 return false;
9576 }
9577
9578 if (available_pushes)
9579 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9580
9581 return true;
9582 }
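// Worked example (hypothetical numbers): with get_recovery_max_active() == 3,
// recovery_ops_active == 1 and recovery_ops_reserved == 1, the check passes
// and *available_pushes = 3 - 1 - 1 = 1, so _maybe_queue_recovery() can
// reserve at most one more op (further capped by
// osd_recovery_max_single_start).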
9583
9584 unsigned OSDService::get_target_pg_log_entries() const
9585 {
9586 auto num_pgs = osd->get_num_pgs();
9587 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9588 if (num_pgs > 0 && target > 0) {
9589 // target an even spread of our budgeted log entries across all
9590 // PGs. note that while we only get to control the entry count
9591 // for primary PGs, we'll normally be responsible for a mix of
9592 // primary and replica PGs (for the same pool(s) even), so this
9593 // will work out.
9594 return std::max<unsigned>(
9595 std::min<unsigned>(target / num_pgs,
9596 cct->_conf->osd_max_pg_log_entries),
9597 cct->_conf->osd_min_pg_log_entries);
9598 } else {
9599 // fall back to a per-pg value.
9600 return cct->_conf->osd_min_pg_log_entries;
9601 }
9602 }
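// Worked example (hypothetical config values): with
// osd_target_pg_log_entries_per_osd = 300000, 200 PGs on this OSD,
// osd_min_pg_log_entries = 250 and osd_max_pg_log_entries = 10000:
//
//   300000 / 200 = 1500  ->  clamp(1500, 250, 10000) = 1500 entries per PG
//
// With 2000 PGs the per-PG share (150) would be raised to the 250 floor.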
9603
9604 void OSD::do_recovery(
9605 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9606 ThreadPool::TPHandle &handle)
9607 {
9608 uint64_t started = 0;
9609
9610 /*
9611 * When osd_recovery_sleep is set to a value greater than zero, recovery
9612 * ops are scheduled osd_recovery_sleep seconds after the previous
9613 * recovery event's scheduled time. This is done by adding a
9614 * recovery_requeue_callback event, which re-queues the recovery op using
9615 * queue_recovery_after_sleep.
9616 */
9617 float recovery_sleep = get_osd_recovery_sleep();
9618 {
9619 std::lock_guard l(service.sleep_lock);
9620 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9621 PGRef pgref(pg);
9622 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
9623 dout(20) << "do_recovery wake up at "
9624 << ceph_clock_now()
9625 << ", re-queuing recovery" << dendl;
9626 std::lock_guard l(service.sleep_lock);
9627 service.recovery_needs_sleep = false;
9628 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9629 });
9630
9631 // The check below is true for the first recovery op, and also when the
9632 // previous recovery op was scheduled in the past. In either case the next
9633 // recovery op is scheduled to run after the sleep period, counted from now.
9634
9635 if (auto now = ceph::real_clock::now();
9636 service.recovery_schedule_time < now) {
9637 service.recovery_schedule_time = now;
9638 }
9639 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
9640 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9641 recovery_requeue_callback);
9642 dout(20) << "Recovery event scheduled at "
9643 << service.recovery_schedule_time << dendl;
9644 return;
9645 }
9646 }
9647
9648 {
9649 {
9650 std::lock_guard l(service.sleep_lock);
9651 service.recovery_needs_sleep = true;
9652 }
9653
9654 if (pg->pg_has_reset_since(queued)) {
9655 goto out;
9656 }
9657
9658 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9659 #ifdef DEBUG_RECOVERY_OIDS
9660 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
9661 #endif
9662
9663 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
9664 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9665 << " on " << *pg << dendl;
9666
9667 if (do_unfound) {
9668 PeeringCtx rctx;
9669 rctx.handle = &handle;
9670 pg->find_unfound(queued, rctx);
9671 dispatch_context(rctx, pg, pg->get_osdmap());
9672 }
9673 }
9674
9675 out:
9676 ceph_assert(started <= reserved_pushes);
9677 service.release_reserved_pushes(reserved_pushes);
9678 }
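// Timeline sketch for the sleep logic above (hypothetical values, with
// osd_recovery_sleep = 0.1s): if the previous event was scheduled at t=10.00
// and we arrive at t=10.03, recovery_schedule_time is first bumped to now
// (10.03) and the requeue fires at 10.13. If the previous event is still in
// the future (say 10.20), the sleep simply stacks on top of it and the
// requeue fires at 10.30.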
9679
9680 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9681 {
9682 std::lock_guard l(recovery_lock);
9683 dout(10) << "start_recovery_op " << *pg << " " << soid
9684 << " (" << recovery_ops_active << "/"
9685 << osd->get_recovery_max_active() << " rops)"
9686 << dendl;
9687 recovery_ops_active++;
9688
9689 #ifdef DEBUG_RECOVERY_OIDS
9690 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9691 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9692 recovery_oids[pg->pg_id].insert(soid);
9693 #endif
9694 }
9695
9696 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9697 {
9698 std::lock_guard l(recovery_lock);
9699 dout(10) << "finish_recovery_op " << *pg << " " << soid
9700 << " dequeue=" << dequeue
9701 << " (" << recovery_ops_active << "/"
9702 << osd->get_recovery_max_active() << " rops)"
9703 << dendl;
9704
9705 // adjust count
9706 ceph_assert(recovery_ops_active > 0);
9707 recovery_ops_active--;
9708
9709 #ifdef DEBUG_RECOVERY_OIDS
9710 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9711 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9712 recovery_oids[pg->pg_id].erase(soid);
9713 #endif
9714
9715 _maybe_queue_recovery();
9716 }
9717
9718 bool OSDService::is_recovery_active()
9719 {
9720 if (cct->_conf->osd_debug_pretend_recovery_active) {
9721 return true;
9722 }
9723 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9724 }
9725
9726 void OSDService::release_reserved_pushes(uint64_t pushes)
9727 {
9728 std::lock_guard l(recovery_lock);
9729 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9730 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9731 << dendl;
9732 ceph_assert(recovery_ops_reserved >= pushes);
9733 recovery_ops_reserved -= pushes;
9734 _maybe_queue_recovery();
9735 }
9736
9737 // =========================================================
9738 // OPS
9739
9740 bool OSD::op_is_discardable(const MOSDOp *op)
9741 {
9742 // drop the client request if the client is no longer connected and
9743 // therefore cannot receive the reply anyway.
9744 if (!op->get_connection()->is_connected()) {
9745 return true;
9746 }
9747 return false;
9748 }
9749
9750 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9751 {
9752 const utime_t stamp = op->get_req()->get_recv_stamp();
9753 const utime_t latency = ceph_clock_now() - stamp;
9754 const unsigned priority = op->get_req()->get_priority();
9755 const int cost = op->get_req()->get_cost();
9756 const uint64_t owner = op->get_req()->get_source().num();
9757 const int type = op->get_req()->get_type();
9758
9759 dout(15) << "enqueue_op " << op << " prio " << priority
9760 << " type " << type
9761 << " cost " << cost
9762 << " latency " << latency
9763 << " epoch " << epoch
9764 << " " << *(op->get_req()) << dendl;
9765 op->osd_trace.event("enqueue op");
9766 op->osd_trace.keyval("priority", priority);
9767 op->osd_trace.keyval("cost", cost);
9768
9769 auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
9770 enqueue_span->AddEvent(__func__, {
9771 {"priority", priority},
9772 {"cost", cost},
9773 {"epoch", epoch},
9774 {"owner", owner},
9775 {"type", type}
9776 });
9777
9778 op->mark_queued_for_pg();
9779 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9780 if (type == MSG_OSD_PG_PUSH ||
9781 type == MSG_OSD_PG_PUSH_REPLY) {
9782 op_shardedwq.queue(
9783 OpSchedulerItem(
9784 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9785 cost, priority, stamp, owner, epoch));
9786 } else {
9787 op_shardedwq.queue(
9788 OpSchedulerItem(
9789 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9790 cost, priority, stamp, owner, epoch));
9791 }
9792 }
9793
9794 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9795 {
9796 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9797 op_shardedwq.queue(
9798 OpSchedulerItem(
9799 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9800 10,
9801 cct->_conf->osd_peering_op_priority,
9802 utime_t(),
9803 0,
9804 evt->get_epoch_sent()));
9805 }
9806
9807 /*
9808 * NOTE: dequeue called in worker thread, with pg lock
9809 */
9810 void OSD::dequeue_op(
9811 PGRef pg, OpRequestRef op,
9812 ThreadPool::TPHandle &handle)
9813 {
9814 const Message *m = op->get_req();
9815
9816 FUNCTRACE(cct);
9817 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9818
9819 utime_t now = ceph_clock_now();
9820 op->set_dequeued_time(now);
9821
9822 utime_t latency = now - m->get_recv_stamp();
9823 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9824 << " cost " << m->get_cost()
9825 << " latency " << latency
9826 << " " << *m
9827 << " pg " << *pg << dendl;
9828
9829 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9830
9831 service.maybe_share_map(m->get_connection().get(),
9832 pg->get_osdmap(),
9833 op->sent_epoch);
9834
9835 if (pg->is_deleting())
9836 return;
9837
9838 op->mark_reached_pg();
9839 op->osd_trace.event("dequeue_op");
9840
9841 pg->do_request(op, handle);
9842
9843 // finish
9844 dout(10) << "dequeue_op " << op << " finish" << dendl;
9845 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9846 }
9847
9848
9849 void OSD::dequeue_peering_evt(
9850 OSDShard *sdata,
9851 PG *pg,
9852 PGPeeringEventRef evt,
9853 ThreadPool::TPHandle& handle)
9854 {
9855 auto curmap = sdata->get_osdmap();
9856 bool need_up_thru = false;
9857 epoch_t same_interval_since = 0;
9858 if (!pg) {
9859 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9860 handle_pg_query_nopg(*q);
9861 } else {
9862 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9863 ceph_abort();
9864 }
9865 } else if (PeeringCtx rctx;
9866 advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9867 pg->do_peering_event(evt, rctx);
9868 if (pg->is_deleted()) {
9869 pg->unlock();
9870 return;
9871 }
9872 dispatch_context(rctx, pg, curmap, &handle);
9873 need_up_thru = pg->get_need_up_thru();
9874 same_interval_since = pg->get_same_interval_since();
9875 pg->unlock();
9876 }
9877
9878 if (need_up_thru) {
9879 queue_want_up_thru(same_interval_since);
9880 }
9881
9882 service.send_pg_temp();
9883 }
9884
9885 void OSD::dequeue_delete(
9886 OSDShard *sdata,
9887 PG *pg,
9888 epoch_t e,
9889 ThreadPool::TPHandle& handle)
9890 {
9891 dequeue_peering_evt(
9892 sdata,
9893 pg,
9894 PGPeeringEventRef(
9895 std::make_shared<PGPeeringEvent>(
9896 e, e,
9897 PeeringState::DeleteSome())),
9898 handle);
9899 }
9900
9901
9902
9903 // --------------------------------
9904
9905 const char** OSD::get_tracked_conf_keys() const
9906 {
9907 static const char* KEYS[] = {
9908 "osd_max_backfills",
9909 "osd_min_recovery_priority",
9910 "osd_max_trimming_pgs",
9911 "osd_op_complaint_time",
9912 "osd_op_log_threshold",
9913 "osd_op_history_size",
9914 "osd_op_history_duration",
9915 "osd_op_history_slow_op_size",
9916 "osd_op_history_slow_op_threshold",
9917 "osd_enable_op_tracker",
9918 "osd_map_cache_size",
9919 "osd_pg_epoch_max_lag_factor",
9920 "osd_pg_epoch_persisted_max_stale",
9921 "osd_recovery_sleep",
9922 "osd_recovery_sleep_hdd",
9923 "osd_recovery_sleep_ssd",
9924 "osd_recovery_sleep_hybrid",
9925 "osd_delete_sleep",
9926 "osd_delete_sleep_hdd",
9927 "osd_delete_sleep_ssd",
9928 "osd_delete_sleep_hybrid",
9929 "osd_snap_trim_sleep",
9930 "osd_snap_trim_sleep_hdd",
9931 "osd_snap_trim_sleep_ssd",
9932 "osd_snap_trim_sleep_hybrid",
9933 "osd_scrub_sleep",
9934 "osd_recovery_max_active",
9935 "osd_recovery_max_active_hdd",
9936 "osd_recovery_max_active_ssd",
9937 // clog & admin clog
9938 "clog_to_monitors",
9939 "clog_to_syslog",
9940 "clog_to_syslog_facility",
9941 "clog_to_syslog_level",
9942 "osd_objectstore_fuse",
9943 "clog_to_graylog",
9944 "clog_to_graylog_host",
9945 "clog_to_graylog_port",
9946 "host",
9947 "fsid",
9948 "osd_recovery_delay_start",
9949 "osd_client_message_size_cap",
9950 "osd_client_message_cap",
9951 "osd_heartbeat_min_size",
9952 "osd_heartbeat_interval",
9953 "osd_object_clean_region_max_num_intervals",
9954 "osd_scrub_min_interval",
9955 "osd_scrub_max_interval",
9956 NULL
9957 };
9958 return KEYS;
9959 }
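// These keys feed the config-observer machinery: when any of them changes,
// handle_conf_change() below is invoked with the set of changed keys.
// Sketch of the consumer side (mirrors the code below):
//
//   if (changed.count("osd_max_trimming_pgs")) {
//     service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
//   }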
9960
9961 void OSD::handle_conf_change(const ConfigProxy& conf,
9962 const std::set <std::string> &changed)
9963 {
9964 std::lock_guard l{osd_lock};
9965
9966 if (changed.count("osd_max_backfills") ||
9967 changed.count("osd_delete_sleep") ||
9968 changed.count("osd_delete_sleep_hdd") ||
9969 changed.count("osd_delete_sleep_ssd") ||
9970 changed.count("osd_delete_sleep_hybrid") ||
9971 changed.count("osd_snap_trim_sleep") ||
9972 changed.count("osd_snap_trim_sleep_hdd") ||
9973 changed.count("osd_snap_trim_sleep_ssd") ||
9974 changed.count("osd_snap_trim_sleep_hybrid") ||
9975 changed.count("osd_scrub_sleep") ||
9976 changed.count("osd_recovery_sleep") ||
9977 changed.count("osd_recovery_sleep_hdd") ||
9978 changed.count("osd_recovery_sleep_ssd") ||
9979 changed.count("osd_recovery_sleep_hybrid") ||
9980 changed.count("osd_recovery_max_active") ||
9981 changed.count("osd_recovery_max_active_hdd") ||
9982 changed.count("osd_recovery_max_active_ssd")) {
9983 if (!maybe_override_options_for_qos() &&
9984 changed.count("osd_max_backfills")) {
9985 // Scheduler is not "mclock". Fallback to earlier behavior
9986 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9987 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9988 }
9989 }
9990 if (changed.count("osd_min_recovery_priority")) {
9991 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9992 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9993 }
9994 if (changed.count("osd_max_trimming_pgs")) {
9995 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9996 }
9997 if (changed.count("osd_op_complaint_time") ||
9998 changed.count("osd_op_log_threshold")) {
9999 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
10000 cct->_conf->osd_op_log_threshold);
10001 }
10002 if (changed.count("osd_op_history_size") ||
10003 changed.count("osd_op_history_duration")) {
10004 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
10005 cct->_conf->osd_op_history_duration);
10006 }
10007 if (changed.count("osd_op_history_slow_op_size") ||
10008 changed.count("osd_op_history_slow_op_threshold")) {
10009 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
10010 cct->_conf->osd_op_history_slow_op_threshold);
10011 }
10012 if (changed.count("osd_enable_op_tracker")) {
10013 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10014 }
10015 if (changed.count("osd_map_cache_size")) {
10016 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10017 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10018 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10019 }
10020 if (changed.count("clog_to_monitors") ||
10021 changed.count("clog_to_syslog") ||
10022 changed.count("clog_to_syslog_level") ||
10023 changed.count("clog_to_syslog_facility") ||
10024 changed.count("clog_to_graylog") ||
10025 changed.count("clog_to_graylog_host") ||
10026 changed.count("clog_to_graylog_port") ||
10027 changed.count("host") ||
10028 changed.count("fsid")) {
10029 update_log_config();
10030 }
10031 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10032 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10033 "osd_pg_epoch_max_lag_factor");
10034 }
10035
10036 #ifdef HAVE_LIBFUSE
10037 if (changed.count("osd_objectstore_fuse")) {
10038 if (store) {
10039 enable_disable_fuse(false);
10040 }
10041 }
10042 #endif
10043
10044 if (changed.count("osd_recovery_delay_start")) {
10045 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10046 service.kick_recovery_queue();
10047 }
10048
10049 if (changed.count("osd_client_message_cap")) {
10050 uint64_t newval = cct->_conf->osd_client_message_cap;
10051 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10052 if (pol.throttler_messages) {
10053 pol.throttler_messages->reset_max(newval);
10054 }
10055 }
10056 if (changed.count("osd_client_message_size_cap")) {
10057 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10058 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10059 if (pol.throttler_bytes) {
10060 pol.throttler_bytes->reset_max(newval);
10061 }
10062 }
10063 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10064 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10065 }
10066
10067 if (changed.count("osd_scrub_min_interval") ||
10068 changed.count("osd_scrub_max_interval")) {
10069 resched_all_scrubs();
10070 dout(0) << __func__ << ": scrub interval change" << dendl;
10071 }
10072 check_config();
10073 if (changed.count("osd_asio_thread_count")) {
10074 service.poolctx.stop();
10075 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10076 }
10077 }
10078
10079 void OSD::maybe_override_max_osd_capacity_for_qos()
10080 {
10081 // If the scheduler enabled is mclock, override the default
10082 // osd capacity with the value obtained from running the
10083 // osd bench test. This is later used to setup mclock.
10084 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
10085 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
10086 (!unsupported_objstore_for_qos())) {
10087 std::string max_capacity_iops_config;
10088 bool force_run_benchmark =
10089 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
10090
10091 if (store_is_rotational) {
10092 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
10093 } else {
10094 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
10095 }
10096
10097 if (!force_run_benchmark) {
10098 double default_iops = 0.0;
10099
10100 // Get the current osd iops capacity
10101 double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
10102
10103 // Get the default max iops capacity
10104 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
10105 if (!val.has_value()) {
10106 derr << __func__ << " Unable to determine default value of "
10107 << max_capacity_iops_config << dendl;
10108 // Cannot determine default iops. Force a run of the OSD benchmark.
10109 force_run_benchmark = true;
10110 } else {
10111 // Default iops
10112 default_iops = std::stod(val.value());
10113 }
10114
10115 // Determine if we really need to run the osd benchmark
10116 if (!force_run_benchmark && (default_iops != cur_iops)) {
10117 dout(1) << __func__ << std::fixed << std::setprecision(2)
10118 << " default_iops: " << default_iops
10119 << " cur_iops: " << cur_iops
10120 << ". Skip OSD benchmark test." << dendl;
10121 return;
10122 }
10123 }
10124
10125 // Run osd bench: write 12288000 bytes in 4KiB blocks, spread over 100 4MiB objects
10126 int64_t count = 12288000; // Count of bytes to write
10127 int64_t bsize = 4096; // Block size
10128 int64_t osize = 4194304; // Object size
10129 int64_t onum = 100; // Count of objects to write
10130 double elapsed = 0.0; // Time taken to complete the test
10131 double iops = 0.0;
10132 stringstream ss;
10133 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
10134 if (ret != 0) {
10135 derr << __func__
10136 << " osd bench err: " << ret
10137 << " osd bench errstr: " << ss.str()
10138 << dendl;
10139 return;
10140 }
10141
10142 double rate = count / elapsed;
10143 iops = rate / bsize;
10144 dout(1) << __func__
10145 << " osd bench result -"
10146 << std::fixed << std::setprecision(3)
10147 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
10148 << " iops: " << iops
10149 << " elapsed_sec: " << elapsed
10150 << dendl;
10151
10152 // Persist iops to the MON store
10153 ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
10154 if (ret < 0) {
10155 // Fallback to setting the config within the in-memory "values" map.
10156 cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
10157 }
10158
10159 // Override the max osd capacity for all shards
10160 for (auto& shard : shards) {
10161 shard->update_scheduler_config();
10162 }
10163 }
10164 }
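// Worked example of the conversion above (hypothetical timing): if the bench
// writes count = 12288000 bytes in elapsed = 10 s, then
//
//   rate = 12288000 / 10   = 1228800 B/s (~1.17 MiB/s)
//   iops = 1228800 / 4096  = 300
//
// and the chosen osd_mclock_max_capacity_iops_[hdd|ssd] key is set to 300.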
10165
10166 bool OSD::maybe_override_options_for_qos()
10167 {
10168 // If the scheduler enabled is mclock, override the recovery, backfill
10169 // and sleep options so that mclock can meet the QoS goals.
10170 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10171 !unsupported_objstore_for_qos()) {
10172 dout(1) << __func__
10173 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10174
10175 // Set high value for recovery max active
10176 uint32_t rec_max_active = 1000;
10177 cct->_conf.set_val(
10178 "osd_recovery_max_active", std::to_string(rec_max_active));
10179 cct->_conf.set_val(
10180 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10181 cct->_conf.set_val(
10182 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10183
10184 // Set high value for osd_max_backfill
10185 uint32_t max_backfills = 1000;
10186 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10187 service.local_reserver.set_max(max_backfills);
10188 service.remote_reserver.set_max(max_backfills);
10189
10190 // Disable recovery sleep
10191 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10192 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10193 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10194 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10195
10196 // Disable delete sleep
10197 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10198 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10199 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10200 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10201
10202 // Disable snap trim sleep
10203 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10204 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10205 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10206 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10207
10208 // Disable scrub sleep
10209 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10210 return true;
10211 }
10212 return false;
10213 }
10214
10215 int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10216 {
10217 std::string cmd =
10218 "{"
10219 "\"prefix\": \"config set\", "
10220 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10221 "\"name\": \"" + key + "\", "
10222 "\"value\": \"" + val + "\""
10223 "}";
10224
10225 vector<std::string> vcmd{cmd};
10226 bufferlist inbl;
10227 std::string outs;
10228 C_SaferCond cond;
10229 monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
10230 int r = cond.wait();
10231 if (r < 0) {
10232 derr << __func__ << " Failed to set config key " << key
10233 << " err: " << cpp_strerror(r)
10234 << " errstr: " << outs << dendl;
10235 return r;
10236 }
10237
10238 return 0;
10239 }
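// For example (illustrative values), setting key
// "osd_mclock_max_capacity_iops_hdd" to "315.000000" on osd.3 builds:
//
//   {"prefix": "config set", "who": "osd.3",
//    "name": "osd_mclock_max_capacity_iops_hdd", "value": "315.000000"}
//
// and sends it via MonClient::start_mon_command(), blocking on the result.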
10240
10241 bool OSD::unsupported_objstore_for_qos()
10242 {
10243 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10244 return std::find(unsupported_objstores.begin(),
10245 unsupported_objstores.end(),
10246 store->get_type()) != unsupported_objstores.end();
10247 }
10248
10249 void OSD::update_log_config()
10250 {
10251 auto parsed_options = clog->parse_client_options(cct);
10252 derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
10253 }
10254
10255 void OSD::check_config()
10256 {
10257 // some sanity checks
10258 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10259 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10260 << " is not > osd_pg_epoch_persisted_max_stale ("
10261 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10262 }
10263 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10264 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10265 << cct->_conf->osd_object_clean_region_max_num_intervals
10266 << ") is < 0";
10267 }
10268 }
10269
10270 // --------------------------------
10271
10272 void OSD::get_latest_osdmap()
10273 {
10274 dout(10) << __func__ << " -- start" << dendl;
10275
10276 boost::system::error_code ec;
10277 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
10278
10279 dout(10) << __func__ << " -- finish" << dendl;
10280 }
10281
10282 // --------------------------------
10283
10284 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10285 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10286 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10287 dout(10) << "setting " << queries.size() << " queries" << dendl;
10288
10289 std::list<OSDPerfMetricQuery> supported_queries;
10290 for (auto &it : queries) {
10291 auto &query = it.first;
10292 if (!query.key_descriptor.empty()) {
10293 supported_queries.push_back(query);
10294 }
10295 }
10296 if (supported_queries.size() < queries.size()) {
10297 dout(1) << queries.size() - supported_queries.size()
10298 << " unsupported queries" << dendl;
10299 }
10300 {
10301 std::lock_guard locker{m_perf_queries_lock};
10302 m_perf_queries = supported_queries;
10303 m_perf_limits = queries;
10304 }
10305 std::vector<PGRef> pgs;
10306 _get_pgs(&pgs);
10307 for (auto& pg : pgs) {
10308 std::scoped_lock l{*pg};
10309 pg->set_dynamic_perf_stats_queries(supported_queries);
10310 }
10311 }
10312
10313 MetricPayload OSD::get_perf_reports() {
10314 OSDMetricPayload payload;
10315 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10316
10317 std::vector<PGRef> pgs;
10318 _get_pgs(&pgs);
10319 DynamicPerfStats dps;
10320 for (auto& pg : pgs) {
10321 // m_perf_queries can be modified only in set_perf_queries by an mgr client
10322 // request, and it is protected by the mgr client's lock, which is held
10323 // while set_perf_queries/get_perf_reports are called, so we do not need
10324 // to hold m_perf_queries_lock here.
10325 DynamicPerfStats pg_dps(m_perf_queries);
10326 pg->lock();
10327 pg->get_dynamic_perf_stats(&pg_dps);
10328 pg->unlock();
10329 dps.merge(pg_dps);
10330 }
10331 dps.add_to_reports(m_perf_limits, &reports);
10332 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10333
10334 return payload;
10335 }
10336
10337 // =============================================================
10338
10339 #undef dout_context
10340 #define dout_context cct
10341 #undef dout_prefix
10342 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10343
10344 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
10345 {
10346 dout(10) << pg->pg_id << " " << pg << dendl;
10347 slot->pg = pg;
10348 pg->osd_shard = this;
10349 pg->pg_slot = slot;
10350 osd->inc_num_pgs();
10351
10352 slot->epoch = pg->get_osdmap_epoch();
10353 pg_slots_by_epoch.insert(*slot);
10354 }
10355
10356 void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10357 {
10358 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10359 slot->pg->osd_shard = nullptr;
10360 slot->pg->pg_slot = nullptr;
10361 slot->pg = nullptr;
10362 osd->dec_num_pgs();
10363
10364 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10365 slot->epoch = 0;
10366 if (waiting_for_min_pg_epoch) {
10367 min_pg_epoch_cond.notify_all();
10368 }
10369 }
10370
10371 void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10372 {
10373 std::lock_guard l(shard_lock);
10374 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10375 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10376 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10377 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10378 slot->epoch = e;
10379 pg_slots_by_epoch.insert(*slot);
10380 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10381 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10382 if (waiting_for_min_pg_epoch) {
10383 min_pg_epoch_cond.notify_all();
10384 }
10385 }
10386
10387 epoch_t OSDShard::get_min_pg_epoch()
10388 {
10389 std::lock_guard l(shard_lock);
10390 auto p = pg_slots_by_epoch.begin();
10391 if (p == pg_slots_by_epoch.end()) {
10392 return 0;
10393 }
10394 return p->epoch;
10395 }
10396
10397 void OSDShard::wait_min_pg_epoch(epoch_t need)
10398 {
10399 std::unique_lock l{shard_lock};
10400 ++waiting_for_min_pg_epoch;
10401 min_pg_epoch_cond.wait(l, [need, this] {
10402 if (pg_slots_by_epoch.empty()) {
10403 return true;
10404 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10405 return true;
10406 } else {
10407 dout(10) << need << " waiting on "
10408 << pg_slots_by_epoch.begin()->epoch << dendl;
10409 return false;
10410 }
10411 });
10412 --waiting_for_min_pg_epoch;
10413 }
10414
10415 epoch_t OSDShard::get_max_waiting_epoch()
10416 {
10417 std::lock_guard l(shard_lock);
10418 epoch_t r = 0;
10419 for (auto& i : pg_slots) {
10420 if (!i.second->waiting_peering.empty()) {
10421 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10422 }
10423 }
10424 return r;
10425 }
10426
10427 void OSDShard::consume_map(
10428 const OSDMapRef& new_osdmap,
10429 unsigned *pushes_to_free)
10430 {
10431 std::lock_guard l(shard_lock);
10432 OSDMapRef old_osdmap;
10433 {
10434 std::lock_guard l(osdmap_lock);
10435 old_osdmap = std::move(shard_osdmap);
10436 shard_osdmap = new_osdmap;
10437 }
10438 dout(10) << new_osdmap->get_epoch()
10439 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10440 << dendl;
10441 int queued = 0;
10442
10443 // check slots
10444 auto p = pg_slots.begin();
10445 while (p != pg_slots.end()) {
10446 OSDShardPGSlot *slot = p->second.get();
10447 const spg_t& pgid = p->first;
10448 dout(20) << __func__ << " " << pgid << dendl;
10449 if (!slot->waiting_for_split.empty()) {
10450 dout(20) << __func__ << " " << pgid
10451 << " waiting for split " << slot->waiting_for_split << dendl;
10452 ++p;
10453 continue;
10454 }
10455 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10456 dout(20) << __func__ << " " << pgid
10457 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10458 << dendl;
10459 ++p;
10460 continue;
10461 }
10462 if (!slot->waiting_peering.empty()) {
10463 epoch_t first = slot->waiting_peering.begin()->first;
10464 if (first <= new_osdmap->get_epoch()) {
10465 dout(20) << __func__ << " " << pgid
10466 << " pending_peering first epoch " << first
10467 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10468 queued += _wake_pg_slot(pgid, slot);
10469 }
10470 ++p;
10471 continue;
10472 }
10473 if (!slot->waiting.empty()) {
10474 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10475 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10476 << dendl;
10477 ++p;
10478 continue;
10479 }
10480 while (!slot->waiting.empty() &&
10481 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10482 auto& qi = slot->waiting.front();
10483 dout(20) << __func__ << " " << pgid
10484 << " waiting item " << qi
10485 << " epoch " << qi.get_map_epoch()
10486 << " <= " << new_osdmap->get_epoch()
10487 << ", "
10488 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10489 "misdirected")
10490 << ", dropping" << dendl;
10491 *pushes_to_free += qi.get_reserved_pushes();
10492 slot->waiting.pop_front();
10493 }
10494 }
10495 if (slot->waiting.empty() &&
10496 slot->num_running == 0 &&
10497 slot->waiting_for_split.empty() &&
10498 !slot->pg) {
10499 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10500 p = pg_slots.erase(p);
10501 continue;
10502 }
10503
10504 ++p;
10505 }
10506 if (queued) {
10507 std::lock_guard l{sdata_wait_lock};
10508 if (queued == 1)
10509 sdata_cond.notify_one();
10510 else
10511 sdata_cond.notify_all();
10512 }
10513 }
10514
10515 int OSDShard::_wake_pg_slot(
10516 spg_t pgid,
10517 OSDShardPGSlot *slot)
10518 {
10519 int count = 0;
10520 dout(20) << __func__ << " " << pgid
10521 << " to_process " << slot->to_process
10522 << " waiting " << slot->waiting
10523 << " waiting_peering " << slot->waiting_peering << dendl;
10524 for (auto i = slot->to_process.rbegin();
10525 i != slot->to_process.rend();
10526 ++i) {
10527 scheduler->enqueue_front(std::move(*i));
10528 count++;
10529 }
10530 slot->to_process.clear();
10531 for (auto i = slot->waiting.rbegin();
10532 i != slot->waiting.rend();
10533 ++i) {
10534 scheduler->enqueue_front(std::move(*i));
10535 count++;
10536 }
10537 slot->waiting.clear();
10538 for (auto i = slot->waiting_peering.rbegin();
10539 i != slot->waiting_peering.rend();
10540 ++i) {
10541 // this is overkill; we requeue everything, even if some of these
10542 // items are waiting for maps we don't have yet. FIXME, maybe,
10543 // someday, if we decide this inefficiency matters
10544 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10545 scheduler->enqueue_front(std::move(*j));
10546 count++;
10547 }
10548 }
10549 slot->waiting_peering.clear();
10550 ++slot->requeue_seq;
10551 return count;
10552 }
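// Note the rbegin()/rend() + enqueue_front() pairing above: pushing items to
// the *front* of the scheduler in *reverse* order preserves their original
// FIFO order. E.g. waiting = [a, b, c] is enqueued as c, b, a, leaving the
// scheduler to dequeue a, b, c.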
10553
10554 void OSDShard::identify_splits_and_merges(
10555 const OSDMapRef& as_of_osdmap,
10556 set<pair<spg_t,epoch_t>> *split_pgs,
10557 set<pair<spg_t,epoch_t>> *merge_pgs)
10558 {
10559 std::lock_guard l(shard_lock);
10560 if (shard_osdmap) {
10561 for (auto& i : pg_slots) {
10562 const spg_t& pgid = i.first;
10563 auto *slot = i.second.get();
10564 if (slot->pg) {
10565 osd->service.identify_splits_and_merges(
10566 shard_osdmap, as_of_osdmap, pgid,
10567 split_pgs, merge_pgs);
10568 } else if (!slot->waiting_for_split.empty()) {
10569 osd->service.identify_splits_and_merges(
10570 shard_osdmap, as_of_osdmap, pgid,
10571 split_pgs, nullptr);
10572 } else {
10573 dout(20) << __func__ << " slot " << pgid
10574 << " has no pg and waiting_for_split " << dendl;
10575 }
10576 }
10577 }
10578 }
10579
10580 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10581 set<pair<spg_t,epoch_t>> *pgids)
10582 {
10583 std::lock_guard l(shard_lock);
10584 _prime_splits(pgids);
10585 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10586 set<pair<spg_t,epoch_t>> newer_children;
10587 for (auto i : *pgids) {
10588 osd->service.identify_splits_and_merges(
10589 as_of_osdmap, shard_osdmap, i.first,
10590 &newer_children, nullptr);
10591 }
10592 newer_children.insert(pgids->begin(), pgids->end());
10593 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10594 << shard_osdmap->get_epoch() << ", new children " << newer_children
10595 << dendl;
10596 _prime_splits(&newer_children);
10597 // note: we don't care what is left over here for other shards.
10598 // if this shard is ahead of another one -- e.g., one thread is
10599 // calling into prime_splits via _process (due to a newly created
10600 // pg) while this shard has a newer map due to a racing consume_map --
10601 // then any grandchildren left here will be identified (or were
10602 // identified) when the slower shard's osdmap is advanced.
10603 // _prime_splits() will tolerate the case where the pgid is
10604 // already primed.
10605 }
10606 }
10607
10608 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10609 {
10610 dout(10) << *pgids << dendl;
10611 auto p = pgids->begin();
10612 while (p != pgids->end()) {
10613 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10614 if (shard_index == shard_id) {
10615 auto r = pg_slots.emplace(p->first, nullptr);
10616 if (r.second) {
10617 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10618 r.first->second = make_unique<OSDShardPGSlot>();
10619 r.first->second->waiting_for_split.insert(p->second);
10620 } else {
10621 auto q = r.first;
10622 ceph_assert(q != pg_slots.end());
10623 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10624 << dendl;
10625 q->second->waiting_for_split.insert(p->second);
10626 }
10627 p = pgids->erase(p);
10628 } else {
10629 ++p;
10630 }
10631 }
10632 }
10633
10634 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10635 set<pair<spg_t,epoch_t>> *merge_pgs)
10636 {
10637 std::lock_guard l(shard_lock);
10638 dout(20) << __func__ << " checking shard " << shard_id
10639 << " for remaining merge pgs " << *merge_pgs << dendl;
10640 auto p = merge_pgs->begin();
10641 while (p != merge_pgs->end()) {
10642 spg_t pgid = p->first;
10643 epoch_t epoch = p->second;
10644 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10645 if (shard_index != shard_id) {
10646 ++p;
10647 continue;
10648 }
10649 OSDShardPGSlot *slot;
10650 auto r = pg_slots.emplace(pgid, nullptr);
10651 if (r.second) {
10652 r.first->second = make_unique<OSDShardPGSlot>();
10653 }
10654 slot = r.first->second.get();
10655 if (slot->pg) {
10656 // already have pg
10657 dout(20) << __func__ << " have merge participant pg " << pgid
10658 << " " << slot->pg << dendl;
10659 } else if (!slot->waiting_for_split.empty() &&
10660 *slot->waiting_for_split.begin() < epoch) {
10661 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10662 << " " << slot->waiting_for_split << dendl;
10663 } else {
10664 dout(20) << __func__ << " creating empty merge participant " << pgid
10665 << " for merge in " << epoch << dendl;
10666 // leave history zeroed; PG::merge_from() will fill it in.
10667 pg_history_t history;
10668 PGCreateInfo cinfo(pgid, epoch - 1,
10669 history, PastIntervals(), false);
10670 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10671 _attach_pg(r.first->second.get(), pg.get());
10672 _wake_pg_slot(pgid, slot);
10673 pg->unlock();
10674 }
10675 // mark slot for merge
10676 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10677 slot->waiting_for_merge_epoch = epoch;
10678 p = merge_pgs->erase(p);
10679 }
10680 }
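// Note: the placeholder participant above is created at (epoch - 1),
// the last epoch before the merge takes effect, so that advancing it to
// `epoch` drives it through the merge machinery just like a
// pre-existing PG.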
10681
10682 void OSDShard::register_and_wake_split_child(PG *pg)
10683 {
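// pg_slots.size() below is read before shard_lock is taken; the value
// feeds only this debug line, so a momentarily stale count is harmless.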
10684 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
10685 epoch_t epoch;
10686 {
10687 std::lock_guard l(shard_lock);
10688 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
10689 auto p = pg_slots.find(pg->pg_id);
10690 ceph_assert(p != pg_slots.end());
10691 auto *slot = p->second.get();
10692 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10693 << slot->waiting_for_split << dendl;
10694 ceph_assert(!slot->pg);
10695 ceph_assert(!slot->waiting_for_split.empty());
10696 _attach_pg(slot, pg);
10697
10698 epoch = pg->get_osdmap_epoch();
10699 ceph_assert(slot->waiting_for_split.count(epoch));
10700 slot->waiting_for_split.erase(epoch);
10701 if (slot->waiting_for_split.empty()) {
10702 _wake_pg_slot(pg->pg_id, slot);
10703 } else {
10704 dout(10) << __func__ << " still waiting for split on "
10705 << slot->waiting_for_split << dendl;
10706 }
10707 }
10708
10709 // kick child to ensure it pulls up to the latest osdmap
10710 osd->enqueue_peering_evt(
10711 pg->pg_id,
10712 PGPeeringEventRef(
10713 std::make_shared<PGPeeringEvent>(
10714 epoch,
10715 epoch,
10716 NullEvt())));
10717
10718 std::lock_guard l{sdata_wait_lock};
10719 sdata_cond.notify_one();
10720 }
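// The NullEvt above carries no state change of its own; queuing it at
// the child's creation epoch simply forces the new PG through the
// peering queue so it advances to the shard's current osdmap.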
10721
10722 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10723 {
10724 std::lock_guard l(shard_lock);
10725 vector<spg_t> to_delete;
10726 for (auto& i : pg_slots) {
10727 if (i.first != parent &&
10728 i.first.get_ancestor(old_pg_num) == parent) {
10729 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10730 << dendl;
10731 _wake_pg_slot(i.first, i.second.get());
10732 to_delete.push_back(i.first);
10733 }
10734 }
10735 for (auto pgid : to_delete) {
10736 pg_slots.erase(pgid);
10737 }
10738 }
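// The parent slot itself is deliberately left in place above; only the
// primed child slots under old_pg_num are woken and removed, since the
// split they were created for is no longer going to happen in that form.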
10739
10740 void OSDShard::update_scheduler_config()
10741 {
10742 std::lock_guard l(shard_lock);
10743 scheduler->update_configuration();
10744 }
10745
10746 std::string OSDShard::get_scheduler_type()
10747 {
10748 std::ostringstream scheduler_type;
10749 scheduler_type << *scheduler;
10750 return scheduler_type.str();
10751 }
10752
10753 OSDShard::OSDShard(
10754 int id,
10755 CephContext *cct,
10756 OSD *osd)
10757 : shard_id(id),
10758 cct(cct),
10759 osd(osd),
10760 shard_name(string("OSDShard.") + stringify(id)),
10761 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10762 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10763 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10764 shard_lock_name(shard_name + "::shard_lock"),
10765 shard_lock{make_mutex(shard_lock_name)},
10766 scheduler(ceph::osd::scheduler::make_scheduler(
10767 cct, osd->num_shards, osd->store->is_rotational(),
10768 osd->store->get_type())),
10769 context_queue(sdata_wait_lock, sdata_cond)
10770 {
10771 dout(0) << "using op scheduler " << *scheduler << dendl;
10772 }
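// The concrete scheduler (e.g. mClockScheduler vs. a weighted priority
// queue) is chosen by make_scheduler() from configuration and the
// store's device characteristics passed above; the dout(0) in the
// constructor records the choice once per shard at startup.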
10773
10774
10775 // =============================================================
10776
10777 #undef dout_context
10778 #define dout_context osd->cct
10779 #undef dout_prefix
10780 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10781
10782 void OSD::ShardedOpWQ::_add_slot_waiter(
10783 spg_t pgid,
10784 OSDShardPGSlot *slot,
10785 OpSchedulerItem&& qi)
10786 {
10787 if (qi.is_peering()) {
10788 dout(20) << __func__ << " " << pgid
10789 << " peering, item epoch is "
10790 << qi.get_map_epoch()
10791 << ", will wait on " << qi << dendl;
10792 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10793 } else {
10794 dout(20) << __func__ << " " << pgid
10795 << " item epoch is "
10796 << qi.get_map_epoch()
10797 << ", will wait on " << qi << dendl;
10798 slot->waiting.push_back(std::move(qi));
10799 }
10800 }
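// The two wait lists drain differently: waiting_peering is keyed by the
// epoch each item needs, so entries can be released as the shard's
// osdmap advances, while waiting is a single FIFO that is requeued
// wholesale when the slot is woken (see _wake_pg_slot()).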
10801
10802 #undef dout_prefix
10803 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10804
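// _process() pulls one work item per invocation: pick this thread's
// shard, sleep until work is available, resolve the item's pg slot
// (possibly creating or attaching a PG first), and finally run the item
// after dropping shard_lock, with the PG lock held where a PG exists.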
10805 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10806 {
10807 uint32_t shard_index = thread_index % osd->num_shards;
10808 auto& sdata = osd->shards[shard_index];
10809 ceph_assert(sdata);
10810
10811 // If every thread of every shard ran oncommit callbacks, they could
10812 // complete out of order. So for each shard we designate the thread
10813 // with the smallest thread_index (thread_index < num_shards) to run
10814 // the oncommit callbacks.
10815 bool is_smallest_thread_index = thread_index < osd->num_shards;
10816
10817 // peek at spg_t
10818 sdata->shard_lock.lock();
10819 if (sdata->scheduler->empty() &&
10820 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10821 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10822 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10823 // we raced with a context_queue addition, don't wait
10824 wait_lock.unlock();
10825 } else if (!sdata->stop_waiting) {
10826 dout(20) << __func__ << " empty q, waiting" << dendl;
10827 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10828 sdata->shard_lock.unlock();
10829 sdata->sdata_cond.wait(wait_lock);
10830 wait_lock.unlock();
10831 sdata->shard_lock.lock();
10832 if (sdata->scheduler->empty() &&
10833 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10834 sdata->shard_lock.unlock();
10835 return;
10836 }
10837 // found a work item; reapply default wq timeouts
10838 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10839 timeout_interval, suicide_interval);
10840 } else {
10841 dout(20) << __func__ << " stop_waiting set, returning immediately" << dendl;
10842 wait_lock.unlock();
10843 sdata->shard_lock.unlock();
10844 return;
10845 }
10846 }
10847
10848 list<Context *> oncommits;
10849 if (is_smallest_thread_index) {
10850 sdata->context_queue.move_to(oncommits);
10851 }
10852
10853 WorkItem work_item;
10854 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10855 if (sdata->scheduler->empty()) {
10856 if (osd->is_stopping()) {
10857 sdata->shard_lock.unlock();
10858 for (auto c : oncommits) {
10859 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10860 delete c;
10861 }
10862 return; // OSD shutdown, discard.
10863 }
10864 sdata->shard_lock.unlock();
10865 handle_oncommits(oncommits);
10866 return;
10867 }
10868
10869 work_item = sdata->scheduler->dequeue();
10870 if (osd->is_stopping()) {
10871 sdata->shard_lock.unlock();
10872 for (auto c : oncommits) {
10873 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10874 delete c;
10875 }
10876 return; // OSD shutdown, discard.
10877 }
10878
10879 // If the work item is scheduled in the future, wait until
10880 // the time returned in the dequeue response before retrying.
10881 if (auto when_ready = std::get_if<double>(&work_item)) {
10882 if (is_smallest_thread_index) {
10883 sdata->shard_lock.unlock();
10884 handle_oncommits(oncommits);
10885 sdata->shard_lock.lock();
10886 }
10887 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10888 auto future_time = ceph::real_clock::from_double(*when_ready);
10889 dout(10) << __func__ << " dequeued future work item; waiting until " << future_time << dendl;
10890 // Disable heartbeat timeout until we find a non-future work item to process.
10891 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10892 sdata->shard_lock.unlock();
10893 ++sdata->waiting_threads;
10894 sdata->sdata_cond.wait_until(wait_lock, future_time);
10895 --sdata->waiting_threads;
10896 wait_lock.unlock();
10897 sdata->shard_lock.lock();
10898 // Reapply default wq timeouts
10899 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10900 timeout_interval, suicide_interval);
10901 // Populate the oncommits list if there were any additions
10902 // to the context_queue while we were waiting
10903 if (is_smallest_thread_index) {
10904 sdata->context_queue.move_to(oncommits);
10905 }
10906 }
10907 } // while
10908
10909 // Access the stored item
10910 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10911 if (osd->is_stopping()) {
10912 sdata->shard_lock.unlock();
10913 for (auto c : oncommits) {
10914 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10915 delete c;
10916 }
10917 return; // OSD shutdown, discard.
10918 }
10919
10920 const auto token = item.get_ordering_token();
10921 auto r = sdata->pg_slots.emplace(token, nullptr);
10922 if (r.second) {
10923 r.first->second = make_unique<OSDShardPGSlot>();
10924 }
10925 OSDShardPGSlot *slot = r.first->second.get();
10926 dout(20) << __func__ << " " << token
10927 << (r.second ? " (new)" : "")
10928 << " to_process " << slot->to_process
10929 << " waiting " << slot->waiting
10930 << " waiting_peering " << slot->waiting_peering
10931 << dendl;
10932 slot->to_process.push_back(std::move(item));
10933 dout(20) << __func__ << " " << slot->to_process.back()
10934 << " queued" << dendl;
10935
10936 retry_pg:
10937 PGRef pg = slot->pg;
10938
10939 // lock pg (if we have it)
10940 if (pg) {
10941 // note the requeue seq now...
10942 uint64_t requeue_seq = slot->requeue_seq;
10943 ++slot->num_running;
10944
10945 sdata->shard_lock.unlock();
10946 osd->service.maybe_inject_dispatch_delay();
10947 pg->lock();
10948 osd->service.maybe_inject_dispatch_delay();
10949 sdata->shard_lock.lock();
10950
10951 auto q = sdata->pg_slots.find(token);
10952 if (q == sdata->pg_slots.end()) {
10953 // this can happen if we race with pg removal.
10954 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10955 pg->unlock();
10956 sdata->shard_lock.unlock();
10957 handle_oncommits(oncommits);
10958 return;
10959 }
10960 slot = q->second.get();
10961 --slot->num_running;
10962
10963 if (slot->to_process.empty()) {
10964 // raced with _wake_pg_slot or consume_map
10965 dout(20) << __func__ << " " << token
10966 << " nothing queued" << dendl;
10967 pg->unlock();
10968 sdata->shard_lock.unlock();
10969 handle_oncommits(oncommits);
10970 return;
10971 }
10972 if (requeue_seq != slot->requeue_seq) {
10973 dout(20) << __func__ << " " << token
10974 << " requeue_seq " << slot->requeue_seq << " > our "
10975 << requeue_seq << ", we raced with _wake_pg_slot"
10976 << dendl;
10977 pg->unlock();
10978 sdata->shard_lock.unlock();
10979 handle_oncommits(oncommits);
10980 return;
10981 }
10982 if (slot->pg != pg) {
10983 // this can happen if we race with pg removal.
10984 dout(20) << __func__ << " slot " << token << " no longer attached to "
10985 << pg << dendl;
10986 pg->unlock();
10987 goto retry_pg;
10988 }
10989 }
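// At this point the slot is validated against current shard state: it
// still exists, was not requeued behind our back, and still refers to
// the same PG (otherwise we looped back to retry_pg above).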
10990
10991 dout(20) << __func__ << " " << token
10992 << " to_process " << slot->to_process
10993 << " waiting " << slot->waiting
10994 << " waiting_peering " << slot->waiting_peering << dendl;
10995
10996 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10997 suicide_interval);
10998
10999 // take next item
11000 auto qi = std::move(slot->to_process.front());
11001 slot->to_process.pop_front();
11002 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
11003 set<pair<spg_t,epoch_t>> new_children;
11004 OSDMapRef osdmap;
11005
11006 while (!pg) {
11007 // should this pg shard exist on this osd in this (or a later) epoch?
11008 osdmap = sdata->shard_osdmap;
11009 const PGCreateInfo *create_info = qi.creates_pg();
11010 if (!slot->waiting_for_split.empty()) {
11011 dout(20) << __func__ << " " << token
11012 << " splitting " << slot->waiting_for_split << dendl;
11013 _add_slot_waiter(token, slot, std::move(qi));
11014 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
11015 dout(20) << __func__ << " " << token
11016 << " map " << qi.get_map_epoch() << " > "
11017 << osdmap->get_epoch() << dendl;
11018 _add_slot_waiter(token, slot, std::move(qi));
11019 } else if (qi.is_peering()) {
11020 if (!qi.peering_requires_pg()) {
11021 // for pg-less events, we run them under the ordering lock, since
11022 // we don't have the pg lock to keep them ordered.
11023 qi.run(osd, sdata, pg, tp_handle);
11024 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11025 if (create_info) {
11026 if (create_info->by_mon &&
11027 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
11028 dout(20) << __func__ << " " << token
11029 << " no pg, no longer primary, ignoring mon create on "
11030 << qi << dendl;
11031 } else {
11032 dout(20) << __func__ << " " << token
11033 << " no pg, should create on " << qi << dendl;
11034 pg = osd->handle_pg_create_info(osdmap, create_info);
11035 if (pg) {
11036 // we created the pg! drop out and continue "normally"!
11037 sdata->_attach_pg(slot, pg.get());
11038 sdata->_wake_pg_slot(token, slot);
11039
11040 // identify split children between create epoch and shard epoch.
11041 osd->service.identify_splits_and_merges(
11042 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
11043 sdata->_prime_splits(&new_children);
11044 // distribute remaining split children to other shards below!
11045 break;
11046 }
11047 dout(20) << __func__ << " ignored create on " << qi << dendl;
11048 }
11049 } else {
11050 dout(20) << __func__ << " " << token
11051 << " no pg, peering, !create, discarding " << qi << dendl;
11052 }
11053 } else {
11054 dout(20) << __func__ << " " << token
11055 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11056 << ", discarding " << qi
11057 << dendl;
11058 }
11059 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11060 dout(20) << __func__ << " " << token
11061 << " no pg, should exist e" << osdmap->get_epoch()
11062 << ", will wait on " << qi << dendl;
11063 _add_slot_waiter(token, slot, std::move(qi));
11064 } else {
11065 dout(20) << __func__ << " " << token
11066 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11067 << ", dropping " << qi << dendl;
11068 // share map with client?
11069 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11070 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11071 sdata->shard_osdmap,
11072 (*_op)->sent_epoch);
11073 }
11074 unsigned pushes_to_free = qi.get_reserved_pushes();
11075 if (pushes_to_free > 0) {
11076 sdata->shard_lock.unlock();
11077 osd->service.release_reserved_pushes(pushes_to_free);
11078 handle_oncommits(oncommits);
11079 return;
11080 }
11081 }
11082 sdata->shard_lock.unlock();
11083 handle_oncommits(oncommits);
11084 return;
11085 }
11086 if (qi.is_peering()) {
11087 OSDMapRef osdmap = sdata->shard_osdmap;
11088 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11089 _add_slot_waiter(token, slot, std::move(qi));
11090 sdata->shard_lock.unlock();
11091 pg->unlock();
11092 handle_oncommits(oncommits);
11093 return;
11094 }
11095 }
11096 sdata->shard_lock.unlock();
11097
11098 if (!new_children.empty()) {
11099 for (auto shard : osd->shards) {
11100 shard->prime_splits(osdmap, &new_children);
11101 }
11102 ceph_assert(new_children.empty());
11103 }
11104
11105 // osd_opwq_process marks the point at which an operation has been dequeued
11106 // and will begin to be handled by a worker thread.
11107 {
11108 #ifdef WITH_LTTNG
11109 osd_reqid_t reqid;
11110 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11111 reqid = (*_op)->get_reqid();
11112 }
11113 #endif
11114 tracepoint(osd, opwq_process_start, reqid.name._type,
11115 reqid.name._num, reqid.tid, reqid.inc);
11116 }
11117
11118 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11119 Formatter *f = Formatter::create("json");
11120 f->open_object_section("q");
11121 dump(f);
11122 f->close_section();
11123 f->flush(*_dout);
11124 delete f;
11125 *_dout << dendl;
11126
11127 qi.run(osd, sdata, pg, tp_handle);
11128
11129 {
11130 #ifdef WITH_LTTNG
11131 osd_reqid_t reqid;
11132 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11133 reqid = (*_op)->get_reqid();
11134 }
11135 #endif
11136 tracepoint(osd, opwq_process_finish, reqid.name._type,
11137 reqid.name._num, reqid.tid, reqid.inc);
11138 }
11139
11140 handle_oncommits(oncommits);
11141 }
11142
11143 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11144 if (unlikely(m_fast_shutdown)) {
11145 // stop enqueuing when we are in the middle of a fast shutdown
11146 return;
11147 }
11148
11149 uint32_t shard_index =
11150 item.get_ordering_token().hash_to_shard(osd->shards.size());
11151
11152 OSDShard* sdata = osd->shards[shard_index];
11153 ceph_assert(sdata);
11154 if (sdata->get_scheduler_type() == "mClockScheduler") {
11155 item.maybe_set_is_qos_item();
11156 }
11157
11158 dout(20) << __func__ << " " << item << dendl;
11159
11160 bool empty = true;
11161 {
11162 std::lock_guard l{sdata->shard_lock};
11163 empty = sdata->scheduler->empty();
11164 sdata->scheduler->enqueue(std::move(item));
11165 }
11166
11167 {
11168 std::lock_guard l{sdata->sdata_wait_lock};
11169 if (empty) {
11170 sdata->sdata_cond.notify_all();
11171 } else if (sdata->waiting_threads) {
11172 sdata->sdata_cond.notify_one();
11173 }
11174 }
11175 }
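// Wake-up policy above: if the scheduler was empty, every worker may be
// parked, so notify_all(); otherwise notify_one() is enough, and only
// needed when some thread is parked waiting on a future-scheduled item
// (waiting_threads > 0).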
11176
11177 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
11178 {
11179 if (unlikely(m_fast_shutdown)) {
11180 // stop enqueuing when we are in the middle of a fast shutdown
11181 return;
11182 }
11183
11184 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11185 auto& sdata = osd->shards[shard_index];
11186 ceph_assert(sdata);
11187 sdata->shard_lock.lock();
11188 auto p = sdata->pg_slots.find(item.get_ordering_token());
11189 if (p != sdata->pg_slots.end() &&
11190 !p->second->to_process.empty()) {
11191 // we may be racing with _process, which has dequeued a new item
11192 // from scheduler, put it on to_process, and is now busy taking the
11193 // pg lock. ensure this old requeued item is ordered before any
11194 // such newer item in to_process.
11195 p->second->to_process.push_front(std::move(item));
11196 item = std::move(p->second->to_process.back());
11197 p->second->to_process.pop_back();
11198 dout(20) << __func__
11199 << " " << p->second->to_process.front()
11200 << " shuffled w/ " << item << dendl;
11201 } else {
11202 dout(20) << __func__ << " " << item << dendl;
11203 }
11204 sdata->scheduler->enqueue_front(std::move(item));
11205 sdata->shard_lock.unlock();
11206 std::lock_guard l{sdata->sdata_wait_lock};
11207 sdata->sdata_cond.notify_one();
11208 }
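// Shuffle example: suppose a racing _process() thread has just moved a
// newer item N onto to_process and we requeue older item R. After the
// swap above, to_process = [R] and N is pushed back onto the scheduler
// front, so R is still handled before N.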
11209
11210 void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11211 {
11212 m_fast_shutdown = true;
11213
11214 // Drain and discard any queued work items on every shard. Nothing
11215 // further will be enqueued once m_fast_shutdown is set, so the
11216 // queues stay empty for the remainder of shutdown.
11217 for (uint32_t shard_index = 0; shard_index < osd->num_shards; shard_index++) {
11218 auto& sdata = osd->shards[shard_index];
11219 ceph_assert(sdata);
11220 sdata->shard_lock.lock();
11221 while (!sdata->scheduler->empty()) {
11222 (void)sdata->scheduler->dequeue();
11223 }
11224 sdata->shard_lock.unlock();
11225 }
11226 }
11227
11228 namespace ceph::osd_cmds {
11229
11230 int heap(CephContext& cct,
11231 const cmdmap_t& cmdmap,
11232 std::ostream& outos,
11233 std::ostream& erros)
11234 {
11235 if (!ceph_using_tcmalloc()) {
11236 erros << "could not issue heap profiler command -- not using tcmalloc!";
11237 return -EOPNOTSUPP;
11238 }
11239
11240 string cmd;
11241 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11242 erros << "unable to get value for command \"heapcmd\"";
11243 return -EINVAL;
11244 }
11245
11246 std::vector<std::string> cmd_vec;
11247 get_str_vec(cmd, cmd_vec);
11248
11249 string val;
11250 if (cmd_getval(cmdmap, "value", val)) {
11251 cmd_vec.push_back(val);
11252 }
11253
11254 ceph_heap_profiler_handle_command(cmd_vec, outos);
11255
11256 return 0;
11257 }
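// This handler is normally reached via the tell/admin-socket interface,
// e.g. `ceph tell osd.0 heap stats` or `ceph tell osd.0 heap
// start_profiler`; the subcommand arrives as "heapcmd" with an optional
// "value" argument appended to the vector above.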
11258
11259 } // namespace ceph::osd_cmds