]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/range/adaptor/reversed.hpp>
28
29 #ifdef HAVE_SYS_PARAM_H
30 #include <sys/param.h>
31 #endif
32
33 #ifdef HAVE_SYS_MOUNT_H
34 #include <sys/mount.h>
35 #endif
36
37 #include "osd/PG.h"
38 #include "osd/scrubber/scrub_machine.h"
39 #include "osd/scrubber/pg_scrubber.h"
40
41 #include "include/types.h"
42 #include "include/compat.h"
43 #include "include/random.h"
44 #include "include/scope_guard.h"
45
46 #include "OSD.h"
47 #include "OSDMap.h"
48 #include "Watch.h"
49 #include "osdc/Objecter.h"
50
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
60
61 #include "os/ObjectStore.h"
62 #ifdef HAVE_LIBFUSE
63 #include "os/FuseStore.h"
64 #endif
65
66 #include "PrimaryLogPG.h"
67
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
70
71 #include "mon/MonClient.h"
72
73 #include "messages/MLog.h"
74
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
90
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery2.h"
96 #include "messages/MOSDPGLog.h"
97 #include "messages/MOSDPGRemove.h"
98 #include "messages/MOSDPGInfo.h"
99 #include "messages/MOSDPGInfo2.h"
100 #include "messages/MOSDPGCreate2.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub2.h"
117
118 #include "messages/MCommand.h"
119 #include "messages/MCommandReply.h"
120
121 #include "messages/MPGStats.h"
122
123 #include "messages/MMonGetPurgedSnaps.h"
124 #include "messages/MMonGetPurgedSnapsReply.h"
125
126 #include "common/perf_counters.h"
127 #include "common/Timer.h"
128 #include "common/LogClient.h"
129 #include "common/AsyncReserver.h"
130 #include "common/HeartbeatMap.h"
131 #include "common/admin_socket.h"
132 #include "common/ceph_context.h"
133
134 #include "global/signal_handler.h"
135 #include "global/pidfile.h"
136
137 #include "include/color.h"
138 #include "perfglue/cpu_profiler.h"
139 #include "perfglue/heap_profiler.h"
140
141 #include "osd/ClassHandler.h"
142 #include "osd/OpRequest.h"
143
144 #include "auth/AuthAuthorizeHandler.h"
145 #include "auth/RotatingKeyRing.h"
146
147 #include "objclass/objclass.h"
148
149 #include "common/cmdparse.h"
150 #include "include/str_list.h"
151 #include "include/util.h"
152
153 #include "include/ceph_assert.h"
154 #include "common/config.h"
155 #include "common/EventTrace.h"
156
157 #include "json_spirit/json_spirit_reader.h"
158 #include "json_spirit/json_spirit_writer.h"
159
160 #ifdef WITH_LTTNG
161 #define TRACEPOINT_DEFINE
162 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
163 #include "tracing/osd.h"
164 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
165 #undef TRACEPOINT_DEFINE
166 #else
167 #define tracepoint(...)
168 #endif
169
170 #include "osd_tracer.h"
171
172
173 #define dout_context cct
174 #define dout_subsys ceph_subsys_osd
175 #undef dout_prefix
176 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
177
178 using std::deque;
179 using std::list;
180 using std::lock_guard;
181 using std::make_pair;
182 using std::make_tuple;
183 using std::make_unique;
184 using std::map;
185 using std::ostream;
186 using std::ostringstream;
187 using std::pair;
188 using std::set;
189 using std::string;
190 using std::stringstream;
191 using std::to_string;
192 using std::unique_ptr;
193 using std::vector;
194
195 using ceph::bufferlist;
196 using ceph::bufferptr;
197 using ceph::decode;
198 using ceph::encode;
199 using ceph::fixed_u_to_string;
200 using ceph::Formatter;
201 using ceph::heartbeat_handle_d;
202 using ceph::make_mutex;
203
204 using namespace ceph::osd::scheduler;
205 using TOPNSPC::common::cmd_getval;
206 using TOPNSPC::common::cmd_getval_or;
207
208 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
209 return *_dout << "osd." << whoami << " " << epoch << " ";
210 }
211
212
213 //Initial features in new superblock.
214 //Features here are also automatically upgraded
215 CompatSet OSD::get_osd_initial_compat_set() {
216 CompatSet::FeatureSet ceph_osd_feature_compat;
217 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
218 CompatSet::FeatureSet ceph_osd_feature_incompat;
219 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
220 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
221 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
222 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
223 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
224 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
225 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
226 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
227 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
228 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
229 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
230 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
231 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
232 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
233 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
234 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
235 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
236 ceph_osd_feature_incompat);
237 }
238
239 //Features are added here that this OSD supports.
240 CompatSet OSD::get_osd_compat_set() {
241 CompatSet compat = get_osd_initial_compat_set();
242 //Any features here can be set in code, but not in initial superblock
243 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
244 return compat;
245 }
246
// OSDService holds the state shared between the OSD and its PGs.  The
// constructor only wires up references back to the owning OSD and
// value-initializes timers, reservers and caches; threads and timers are
// actually started later (see init()/final_init()).
OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // config-backed values, keyed by option name
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  m_scrub_queue{cct, *this},
  // cache-tiering agent state (see agent_entry())
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promotion throttle state (see promote_throttle_recalibrate())
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservations and snap-trim reservations all
  // share the same finisher thread
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches: decoded maps, encoded full maps, encoded incrementals
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  // fullness tracking starts empty until the first statfs/map arrives
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One named finisher per objecter shard (count is configurable).
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
308
309 #ifdef PG_DEBUG_REFS
// Debug-ref tracking: count one more live reference to pgid, remembering
// the PG* the first time so dump_live_pgids() can find it later.
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Debug-ref tracking: drop one reference for pgid; forget the PG entirely
// once the count reaches zero.  Asserts if the pgid was never tracked.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
328 void OSDService::dump_live_pgids()
329 {
330 std::lock_guard l(pgid_lock);
331 derr << "live pgids:" << dendl;
332 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
333 i != pgid_tracker.cend();
334 ++i) {
335 derr << "\t" << *i << dendl;
336 live_pgs[i->first]->dump_live_ids();
337 }
338 }
339 #endif
340
341
// Monotonic time elapsed since this OSD process started.
ceph::signedspan OSDService::get_mnow() const
{
  return ceph::mono_clock::now() - osd->startup_time;
}
346
/**
 * Replay the pool's recorded pg_num changes between old_map and new_map
 * and collect every split and merge that involves `pgid` (or any PG it
 * splits into / merges with along the way).
 *
 * @param old_map        map we are advancing from
 * @param new_map        map we are advancing to
 * @param pgid           starting PG
 * @param split_children filled with (child pgid, epoch) for each split
 * @param merge_pgs      if non-null, filled with (pgid, epoch) for every
 *                       participant (sources and target) of each merge
 */
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch() << dendl;
  if (!old_map->have_pg_pool(pgid.pool())) {
    dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
	     << " does not exist in old map" << dendl;
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
	     << " has no history" << dendl;
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // Breadth-first walk over pgids: each pgid discovered as a split child
  // or merge participant is itself re-scanned against the full history.
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay pg_num changes from the first one at/after old_map's epoch
    // through new_map's epoch.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge as the target: record it and all of
	  // the sources that collapse into it.
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
459
// Forward the notification to the owning OSD, which maintains the
// heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
464
465 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
466 {
467 std::lock_guard l(hb_stamp_lock);
468 if (peer >= hb_stamps.size()) {
469 hb_stamps.resize(peer + 1);
470 }
471 if (!hb_stamps[peer]) {
472 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
473 }
474 return hb_stamps[peer];
475 }
476
477 void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
478 {
479 osd->enqueue_peering_evt(
480 spgid,
481 PGPeeringEventRef(
482 std::make_shared<PGPeeringEvent>(
483 epoch, epoch,
484 RenewLease())));
485 }
486
// First phase of shutdown: stop the timers (agent, sleep, recovery
// request) so no further delayed callbacks get queued while the OSD
// tears down.  Each timer is shut down under its own lock.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
504
// Drain any outstanding reservation callbacks, then stop their finisher.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
510
// Final shutdown: stop the mono timer and watch timer, shut down the
// objecter and drain its finishers, then drop our OSDMap references so
// nothing can be published again.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // Clear both the published and the pre-published map references.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
529
// Start the service's threads and timers (counterpart of the lightweight
// constructor): finishers, objecter, watch/agent timers, agent thread.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // Optionally hold off recovery for a configured number of seconds
  // after startup.
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
550
// Last stage of initialization: start the objecter once an osdmap is
// available.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
555
556 void OSDService::activate_map()
557 {
558 // wake/unwake the tiering agent
559 std::lock_guard l{agent_lock};
560 agent_active =
561 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
562 osd->is_active();
563 agent_cond.notify_all();
564 }
565
566 OSDMapRef OSDService::get_nextmap_reserved() {
567 std::lock_guard l(pre_publish_lock);
568
569 epoch_t e = next_osdmap->get_epoch();
570
571 std::map<epoch_t, unsigned>::iterator i =
572 map_reservations.insert(std::make_pair(e, 0)).first;
573 i->second++;
574 dout(20) << __func__ << " map_reservations: " << map_reservations << dendl;
575 return next_osdmap;
576 }
577
578 /// releases reservation on map
579 void OSDService::release_map(OSDMapRef osdmap) {
580 std::lock_guard l(pre_publish_lock);
581 dout(20) << __func__ << " epoch: " << osdmap->get_epoch() << dendl;
582 std::map<epoch_t, unsigned>::iterator i =
583 map_reservations.find(osdmap->get_epoch());
584 ceph_assert(i != map_reservations.end());
585 ceph_assert(i->second > 0);
586 if (--(i->second) == 0) {
587 map_reservations.erase(i);
588 }
589 if (pre_publish_waiter) {
590 dout(20) << __func__ << " notify all." << dendl;
591 pre_publish_cond.notify_all();
592 }
593 }
594
595 /// blocks until there are no reserved maps prior to next_osdmap
void OSDService::await_reserved_maps() {
  std::unique_lock l{pre_publish_lock};
  dout(20) << __func__ << " epoch:" << next_osdmap->get_epoch() << dendl;

  ceph_assert(next_osdmap);
  // Register as a waiter so release_map() knows to notify us, then wait
  // until the oldest remaining reservation (map_reservations is sorted by
  // epoch) is at or past next_osdmap's epoch.
  pre_publish_waiter++;
  pre_publish_cond.wait(l, [this] {
    auto i = map_reservations.cbegin();
    return (i == map_reservations.cend() ||
	    i->first >= next_osdmap->get_epoch());
  });
  pre_publish_waiter--;
  dout(20) << __func__ << " done " << pre_publish_waiter << dendl;
}
610
// Ask the OSD to subscribe for osdmap epoch e.
// NOTE(review): second argument's meaning is defined by
// OSD::osdmap_subscribe (not visible here) — confirm before documenting.
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
615
616
617 class AgentTimeoutCB : public Context {
618 PGRef pg;
619 public:
620 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
621 void finish(int) override {
622 pg->agent_choose_mode_restart();
623 }
624 };
625
// Body of the tiering agent thread: repeatedly pick the highest-priority
// tier in agent_queue and let one of its PGs do flush/evict work, sleeping
// on agent_cond whenever there is nothing to do or no op budget left.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // The highest level (rbegin) is serviced first.
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Op budget: the lower limit applies unless some PG is currently in
    // high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // Round-robin across the PGs of the top tier; the iterator is
    // invalidated whenever the queue is modified (agent_valid_iterator).
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does (potentially slow) agent work.
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
681
// Stop the tiering agent thread.  Expects all agent ops to be cancelled
// and all PGs to have been dequeued already; aborts otherwise.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  // Join outside the lock so agent_entry() can finish its loop.
  agent_thread.join();
}
701
702 // -------------------------------------
703
// Recalibrate promote_probability_millis (a probability expressed in
// thousandths, clamped to [min_prob, 1000]) so that the observed
// promotion rate tracks the configured objects/sec and bytes/sec targets,
// then set hard per-interval caps to mitigate stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: per-mille probabilities implied by the object and byte
    // targets respectively, given the observed attempt rate.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  // Blend halfway toward the computed value to damp oscillation.
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
774
775 // -------------------------------------
776
777 float OSDService::get_failsafe_full_ratio()
778 {
779 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
780 if (full_ratio > 1.0) full_ratio /= 100.0;
781 return full_ratio;
782 }
783
// Map the usage ratios onto the fullness ladder (NONE .. FAILSAFE).
// `ratio` is the (possibly adjusted) usage ratio, `pratio` the physical
// one; `inject` is set to a marker when an injected (test) state wins.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    // No usable map yet, so no thresholds to compare against.
    return NONE;
  }
  // Force nearfull <= backfillfull <= full <= failsafe.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Injection (test hook) wins; otherwise note that failsafe and nearfull
  // are judged on the physical ratio, full/backfillfull on `ratio`.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
830
// Recompute the cached fullness state from the latest usage ratios, log
// state transitions, and shout to the cluster log when crossing into or
// out of FAILSAFE (where updates are dropped).
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  // Cache the latest ratios for _check_full() logging.
  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
862
863 bool OSDService::need_fullness_update()
864 {
865 OSDMapRef osdmap = get_osdmap();
866 s_names cur = NONE;
867 if (osdmap->exists(whoami)) {
868 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
869 cur = FULL;
870 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
871 cur = BACKFILLFULL;
872 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
873 cur = NEARFULL;
874 }
875 }
876 s_names want = NONE;
877 if (is_full())
878 want = FULL;
879 else if (is_backfillfull())
880 want = BACKFILLFULL;
881 else if (is_nearfull())
882 want = NEARFULL;
883 return want != cur;
884 }
885
// Test hook: report fullness of at least `type` when an injection is
// armed (see set_injectfull()); decrements the remaining-use counter.
// Callers hold full_status_lock.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
900
901 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
902 {
903 std::lock_guard l(full_status_lock);
904
905 if (_check_inject_full(dpp, type))
906 return true;
907
908 if (cur_state >= type)
909 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
910 << " physical " << physical_ratio << dendl;
911
912 return cur_state >= type;
913 }
914
// Like _check_full(), but evaluates what the fullness state WOULD be if
// `adjust_used` additional bytes were consumed, using the caller-supplied
// stats snapshot (used e.g. before accepting a backfill).
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Injected (test) state short-circuits the computation.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
936
// At or above the FAILSAFE threshold (or injected)?
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
941
// At or above the FULL threshold (or injected)?
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
946
// Would we be at or above BACKFILLFULL if `adjust_used` more bytes were
// consumed, given the supplied stats snapshot?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
951
// At or above the BACKFILLFULL threshold (or injected)?
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
956
// At or above the NEARFULL threshold (or injected)?
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
961
// Cached state is exactly FAILSAFE.
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
967
// Cached state is FULL or worse.
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
973
// Cached state is BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
979
// Cached state is NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
985
// Test hook: arm fullness injection.  `count` is the number of times the
// injected state will be reported (per _check_inject_full, -1 = always).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
992
// Record a fresh statfs sample from the object store: update the perf
// counters and the cached osd_stat used for reporting.  `alerts` carries
// store health alerts to attach to this OSD's stats.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    // Derive "available" from the fake total minus the bytes our PGs
    // claim to be using.
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  // Only the osd_stat updates below need the lock; the perf counters
  // above are set outside it.
  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
1035
1036 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
1037 int num_pgs)
1038 {
1039 utime_t now = ceph_clock_now();
1040 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
1041 std::lock_guard l(stat_lock);
1042 osd_stat.hb_peers.swap(hb_peers);
1043 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
1044 osd_stat.num_pgs = num_pgs;
1045 // Clean entries that aren't updated
1046 // This is called often enough that we can just remove 1 at a time
1047 for (auto i: osd_stat.hb_pingtime) {
1048 if (i.second.last_update == 0)
1049 continue;
1050 if (stale_time && now.sec() - i.second.last_update > stale_time) {
1051 dout(20) << __func__ << " time out heartbeat for osd " << i.first
1052 << " last_update " << i.second.last_update << dendl;
1053 osd_stat.hb_pingtime.erase(i.first);
1054 break;
1055 }
1056 }
1057 return osd_stat;
1058 }
1059
1060 void OSDService::inc_osd_stat_repaired()
1061 {
1062 std::lock_guard l(stat_lock);
1063 osd_stat.num_shards_repaired++;
1064 return;
1065 }
1066
1067 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
1068 uint64_t adjust_used)
1069 {
1070 *pratio =
1071 ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1072
1073 if (adjust_used) {
1074 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1075 if (new_stat.statfs.available > adjust_used)
1076 new_stat.statfs.available -= adjust_used;
1077 else
1078 new_stat.statfs.available = 0;
1079 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
1080 }
1081
1082 // Check all pgs and adjust kb_used to include all pending backfill data
1083 int backfill_adjusted = 0;
1084 vector<PGRef> pgs;
1085 osd->_get_pgs(&pgs);
1086 for (auto p : pgs) {
1087 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1088 }
1089 if (backfill_adjusted) {
1090 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1091 }
1092 return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1093 }
1094
// Send 'm' to osd.<peer> over the cluster messenger, but only if the peer
// is still up and has not restarted since 'from_epoch'.  On the drop path
// the message is consumed (put()).  The reserved "next" map is released on
// every exit path.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  dout(20) << __func__ << " " << m->get_type_name() << " to osd." << peer
           << " from_epoch " << from_epoch << dendl;
  // pin the next map so it cannot be trimmed out from under us
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer went down (or rebooted) since the caller decided to send
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // self-send short-circuits through the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1120
// Batched variant of send_message_osd_cluster(): deliver each
// (peer, message) pair under a single map reservation.  Messages whose
// peer is down or restarted since 'from_epoch' are dropped (put()).
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  dout(20) << __func__ << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      // this peer is gone; consume the message and keep going
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
// Return a cluster-messenger connection to osd.<peer>, or NULL if the
// peer is down or has restarted since 'from_epoch'.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  dout(20) << __func__ << " to osd." << peer
           << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    // connection to ourselves uses the loopback path
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}
1169
// Return the heartbeat connections (back, front) to osd.<peer>, or a pair
// of null refs if the peer is down or restarted since 'from_epoch'.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  dout(20) << __func__ << " to osd." << peer
           << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1191
// Name this OSD uses on the cluster (backend) messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1196
// Record that we want the monitor to install a pg_temp mapping of 'want'
// for 'pgid'.  The request is suppressed only when an identical request
// is already pending with the mon and the caller did not force it.
void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
1209
// Drop any wanted or mon-pending pg_temp request for 'pgid'.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1216
// Move everything we just sent to the mon from "wanted" to "pending"
// (awaiting the mon's response).  Caller must hold pg_temp_lock — see
// send_pg_temp() / requeue_pg_temp().
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes without reallocating
  pg_temp_pending.merge(pg_temp_wanted);
#else
  // fallback: move-insert element by element
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1227
// Re-arm every pg_temp request (e.g. after losing the mon session):
// merge wanted into pending, then swap the union back into "wanted" so
// the next send_pg_temp() retransmits all of it.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}
1240
1241 std::ostream& operator<<(std::ostream& out,
1242 const OSDService::pg_temp_t& pg_temp)
1243 {
1244 out << pg_temp.acting;
1245 if (pg_temp.forced) {
1246 out << " (forced)";
1247 }
1248 return out;
1249 }
1250
// Flush all wanted pg_temp requests to the monitor.  Forced and
// non-forced requests travel in separate MOSDPGTemp messages; ms[] is
// indexed by the boolean 'forced' flag (0 = normal, 1 = forced).
// Caller-visible state transition is handled by _sent_pg_temp().
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    // lazily create the message for this forced-ness bucket
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
1273
// Tell the monitor that 'pgid' has been created, and remember it so it
// can be re-announced later (see the no-argument overload).  Only done
// on clusters at luminous or newer.
void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}
1284
// Re-announce every remembered pg-created notification to the monitor
// (presumably used after re-establishing the mon session — confirm at
// the call sites).
void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}
1296
// Forget pg-created notifications whose pool is gone or no longer in the
// CREATING state, so we stop re-announcing them.
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      // erase returns the next valid iterator
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1314
1315
1316 // --------------------------------------
1317 // dispatch
1318
1319 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1320 epoch_t *_bind_epoch) const
1321 {
1322 std::lock_guard l(epoch_lock);
1323 if (_boot_epoch)
1324 *_boot_epoch = boot_epoch;
1325 if (_up_epoch)
1326 *_up_epoch = up_epoch;
1327 if (_bind_epoch)
1328 *_bind_epoch = bind_epoch;
1329 }
1330
1331 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1332 const epoch_t *_bind_epoch)
1333 {
1334 std::lock_guard l(epoch_lock);
1335 if (_boot_epoch) {
1336 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1337 boot_epoch = *_boot_epoch;
1338 }
1339 if (_up_epoch) {
1340 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1341 up_epoch = *_up_epoch;
1342 }
1343 if (_bind_epoch) {
1344 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1345 bind_epoch = *_bind_epoch;
1346 }
1347 }
1348
// Begin an orderly shutdown.  If we are up in the map, ask the mon to
// mark us down-and-dead and wait (bounded by osd_mon_shutdown_timeout)
// for got_stop_ack() to flip the state to STOPPING; then proceed either
// way.  Returns false if a stop is already in progress.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;  // someone else already started stopping

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true, // request ack
        true // mark as down and dead
        ));
    // wait for the ack (or time out) before continuing the shutdown
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1377
1378 void OSDService::got_stop_ack()
1379 {
1380 std::scoped_lock l(is_stopping_lock);
1381 if (get_state() == PREPARING_TO_STOP) {
1382 dout(0) << __func__ << " starting shutdown" << dendl;
1383 set_state(STOPPING);
1384 is_stopping_cond.notify_all();
1385 } else {
1386 dout(10) << __func__ << " ignoring msg" << dendl;
1387 }
1388 }
1389
// Build an MOSDMap message carrying the maps in (since, to].  Prefers
// incremental maps, substitutes a full map when an incremental is
// missing, and stops once osd_map_message_max / osd_map_message_max_bytes
// are exhausted — the limits are checked after each add, so at least one
// map is always included.  If a required map cannot be loaded we fall to
// the 'panic' path: send whatever was collected so far, or at minimum the
// newest map so the peer can still make progress.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->cluster_osdmap_trim_lower_bound) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " cluster osdmap lower bound "
             << sblock.cluster_osdmap_trim_lower_bound
             << " > since " << since << ", starting with full map"
             << dendl;
    since = m->cluster_osdmap_trim_lower_bound;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    // account the budget before bl is moved away
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      // incremental missing; fall back to the full map for this epoch
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    // note: bl was moved from, but its length was already folded in below
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}
1459
// Deliver an already-built MOSDMap on the given connection.
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1464
1465 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1466 const OSDMapRef& osdmap)
1467 {
1468 epoch_t to = osdmap->get_epoch();
1469 dout(10) << "send_incremental_map " << since << " -> " << to
1470 << " to " << con << " " << con->get_peer_addr() << dendl;
1471
1472 MOSDMap *m = NULL;
1473 while (!m) {
1474 OSDSuperblock sblock(get_superblock());
1475 if (since < sblock.oldest_map) {
1476 // just send latest full map
1477 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1478 osdmap->get_encoding_features());
1479 m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
1480 m->newest_map = sblock.newest_map;
1481 get_map_bl(to, m->maps[to]);
1482 send_map(m, con);
1483 return;
1484 }
1485
1486 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1487 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1488 << ", only sending most recent" << dendl;
1489 since = to - cct->_conf->osd_map_share_max_epochs;
1490 }
1491
1492 m = build_incremental_map_msg(since, to, sblock);
1493 }
1494 send_map(m, con);
1495 }
1496
// Load the encoded full OSDMap for epoch 'e' into 'bl', consulting the
// in-memory cache first and the object store on a miss (populating the
// cache on success).  NOTE(review): unlike get_inc_map_bl() this takes no
// lock — the leading underscore suggests the caller must hold
// map_cache_lock (try_get_map() does); confirm before adding new callers.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1513
1514 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1515 {
1516 std::lock_guard l(map_cache_lock);
1517 bool found = map_bl_inc_cache.lookup(e, &bl);
1518 if (found) {
1519 logger->inc(l_osd_map_bl_cache_hit);
1520 return true;
1521 }
1522 logger->inc(l_osd_map_bl_cache_miss);
1523 found = store->read(meta_ch,
1524 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1525 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1526 if (found) {
1527 _add_map_inc_bl(e, bl);
1528 }
1529 return found;
1530 }
1531
// Insert an encoded full map into the bufferlist cache, first rebuilding
// it into one contiguous buffer and moving it to the osd_mapbl mempool.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1542
// Same as _add_map_bl(), but for the incremental-map bufferlist cache.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1553
1554 OSDMapRef OSDService::_add_map(OSDMap *o)
1555 {
1556 epoch_t e = o->get_epoch();
1557
1558 if (cct->_conf->osd_map_dedup) {
1559 // Dedup against an existing map at a nearby epoch
1560 OSDMapRef for_dedup = map_cache.lower_bound(e);
1561 if (for_dedup) {
1562 OSDMap::dedup(for_dedup.get(), o);
1563 }
1564 }
1565 bool existed;
1566 OSDMapRef l = map_cache.add(e, o, &existed);
1567 if (existed) {
1568 delete o;
1569 }
1570 return l;
1571 }
1572
// Return the OSDMap for 'epoch', consulting the in-memory map cache first
// and decoding from the store on a miss.  Returns a null ref if the map
// cannot be loaded.  epoch 0 yields a fresh, empty OSDMap.  Ownership of
// the raw OSDMap is transferred to the cache via _add_map().
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    // track how far below the cached window the miss landed
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;  // not handed to the cache; free it here
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
1607
1608 // ops
1609
1610
// Convenience overload: error reply with default versions and no
// per-op return values.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1615
1616 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1617 version_t uv,
1618 vector<pg_log_op_return_item_t> op_returns)
1619 {
1620 auto m = op->get_req<MOSDOp>();
1621 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1622 int flags;
1623 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1624
1625 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1626 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1627 reply->set_reply_versions(v, uv);
1628 reply->set_op_returns(op_returns);
1629 m->get_connection()->send_message(reply);
1630 }
1631
// Debug aid (gated on osd_debug_misdirected_ops): log and clog-warn about
// an op that arrived at a PG we are not primary for.  For EC pools the op
// may only *look* misdirected because of the map race documented below,
// in which case it is silently dropped.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      // the shard moved between the client's epoch and ours: benign race
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1686
// Push a work item onto the back of the sharded op queue.
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}

// Push a work item onto the front of the sharded op queue (requeues).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1696
// Queue a recovery continuation 'c' for 'pg' through the op scheduler.
// The queue cost depends on the active scheduler: mClock gets the
// caller-supplied per-item cost, the legacy queue keeps the historical
// flat osd_recovery_cost (see comment below).
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c,
  uint64_t cost,
  int priority)
{
  epoch_t e = get_osdmap_epoch();

  uint64_t cost_for_queue = [this, cost] {
    if (cct->_conf->osd_op_queue == "mclock_scheduler") {
      return cost;
    } else {
      /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
       * require very large costs for several messages in order to do any
       * meaningful amount of throttling. This branch should be removed after
       * Reef.
       */
      return cct->_conf->osd_recovery_cost;
    }
  }();

  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e, priority)),
      cost_for_queue,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1728
// Queue a snap-trim work item for 'pg' with the configured snap-trim
// cost and priority.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1742
// Wrap a scrub state-machine event message of type MSG_TYPE (carrying an
// activation token) in an OpSchedulerItem and queue it for this PG at the
// requested queue priority.
template <class MSG_TYPE>
void OSDService::queue_scrub_event_msg(PG* pg,
                                       Scrub::scrub_prio_t with_priority,
                                       unsigned int qu_priority,
                                       Scrub::act_token_t act_token)
{
  const auto epoch = pg->get_osdmap_epoch();
  auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
           << ". Epoch: " << epoch << " token: " << act_token << dendl;
  enqueue_back(OpSchedulerItem(
    unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
    pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
}
1757
// Token-less variant: wrap a scrub event message of type MSG_TYPE in an
// OpSchedulerItem and queue it for this PG.
template <class MSG_TYPE>
void OSDService::queue_scrub_event_msg(PG* pg,
                                       Scrub::scrub_prio_t with_priority)
{
  const auto epoch = pg->get_osdmap_epoch();
  auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
  dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
  enqueue_back(OpSchedulerItem(
    unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
    pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
}
1769
1770 int64_t OSDService::get_scrub_cost()
1771 {
1772
1773 int64_t cost_for_queue = cct->_conf->osd_scrub_cost;
1774 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
1775 cost_for_queue = cct->_conf->osd_scrub_event_cost *
1776 cct->_conf->osd_shallow_scrub_chunk_max;
1777 }
1778 return cost_for_queue;
1779 }
1780
// --- thin wrappers: each queues one specific scrub state-machine event
//     message for the PG via queue_scrub_event_msg<>() ---

// Queue a PGScrub event message for this PG.
void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrub>(pg, with_priority);
}

// Queue a PGScrubAfterRepair event message.
void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
}

// Queue a replica-side scrub request (PGRepScrub), carrying an
// activation token.
void OSDService::queue_for_rep_scrub(PG* pg,
                                     Scrub::scrub_prio_t with_priority,
                                     unsigned int qu_priority,
                                     Scrub::act_token_t act_token)
{
  queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
}

void OSDService::queue_for_rep_scrub_resched(PG* pg,
                                             Scrub::scrub_prio_t with_priority,
                                             unsigned int qu_priority,
                                             Scrub::act_token_t act_token)
{
  // Resulting scrub event: 'SchedReplica'
  queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
                                           act_token);
}

void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'RemotesReserved'
  queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
}

void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ReservationFailure'
  queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
}

void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'InternalSchedScrub'
  queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
}

void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ActivePushesUpd'
  queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'SelectedChunkFree'
  queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
}

void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ChunkIsBusy'
  queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
}

// Queue a PGScrubAppliedUpdate event message.
void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'Unblocked'
  queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
}

void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'DigestUpdate'
  queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
}

void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'IntLocalMapDone'
  queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
}

void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'GotReplicas'
  queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
}

void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'ReplicaPushesUpd'
  queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
}

// Always queued at high priority: the scrub is wrapping up.
void OSDService::queue_scrub_is_finished(PG *pg)
{
  // Resulting scrub event: 'ScrubFinished'
  queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
}

void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
{
  // Resulting scrub event: 'NextChunk'
  queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
}
1891
// Queue a PG-deletion work item for 'pgid' at epoch 'e' with the
// configured delete cost and priority.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1905
// Delegate PG-deletion finalization to the owning OSD instance.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1910
1911 // ---
1912
1913 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1914 {
1915 std::lock_guard l(merge_lock);
1916 dout(10) << __func__ << " " << pg->pg_id << dendl;
1917 ready_to_merge_source[pg->pg_id.pgid] = version;
1918 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1919 _send_ready_to_merge();
1920 }
1921
1922 void OSDService::set_ready_to_merge_target(PG *pg,
1923 eversion_t version,
1924 epoch_t last_epoch_started,
1925 epoch_t last_epoch_clean)
1926 {
1927 std::lock_guard l(merge_lock);
1928 dout(10) << __func__ << " " << pg->pg_id << dendl;
1929 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1930 make_tuple(version,
1931 last_epoch_started,
1932 last_epoch_clean)));
1933 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1934 _send_ready_to_merge();
1935 }
1936
1937 void OSDService::set_not_ready_to_merge_source(pg_t source)
1938 {
1939 std::lock_guard l(merge_lock);
1940 dout(10) << __func__ << " " << source << dendl;
1941 not_ready_to_merge_source.insert(source);
1942 assert(ready_to_merge_source.count(source) == 0);
1943 _send_ready_to_merge();
1944 }
1945
1946 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1947 {
1948 std::lock_guard l(merge_lock);
1949 dout(10) << __func__ << " " << target << " source " << source << dendl;
1950 not_ready_to_merge_target[target] = source;
1951 assert(ready_to_merge_target.count(target) == 0);
1952 _send_ready_to_merge();
1953 }
1954
// Public entry point: take the merge lock and flush any pending
// ready/not-ready merge notifications to the monitor.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1960
1961 void OSDService::_send_ready_to_merge()
1962 {
1963 dout(20) << __func__
1964 << " ready_to_merge_source " << ready_to_merge_source
1965 << " not_ready_to_merge_source " << not_ready_to_merge_source
1966 << " ready_to_merge_target " << ready_to_merge_target
1967 << " not_ready_to_merge_target " << not_ready_to_merge_target
1968 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1969 << dendl;
1970 for (auto src : not_ready_to_merge_source) {
1971 if (sent_ready_to_merge_source.count(src) == 0) {
1972 monc->send_mon_message(new MOSDPGReadyToMerge(
1973 src,
1974 {}, {}, 0, 0,
1975 false,
1976 osdmap->get_epoch()));
1977 sent_ready_to_merge_source.insert(src);
1978 }
1979 }
1980 for (auto p : not_ready_to_merge_target) {
1981 if (sent_ready_to_merge_source.count(p.second) == 0) {
1982 monc->send_mon_message(new MOSDPGReadyToMerge(
1983 p.second,
1984 {}, {}, 0, 0,
1985 false,
1986 osdmap->get_epoch()));
1987 sent_ready_to_merge_source.insert(p.second);
1988 }
1989 }
1990 for (auto src : ready_to_merge_source) {
1991 if (not_ready_to_merge_source.count(src.first) ||
1992 not_ready_to_merge_target.count(src.first.get_parent())) {
1993 continue;
1994 }
1995 auto p = ready_to_merge_target.find(src.first.get_parent());
1996 if (p != ready_to_merge_target.end() &&
1997 sent_ready_to_merge_source.count(src.first) == 0) {
1998 monc->send_mon_message(new MOSDPGReadyToMerge(
1999 src.first, // source pgid
2000 src.second, // src version
2001 std::get<0>(p->second), // target version
2002 std::get<1>(p->second), // PG's last_epoch_started
2003 std::get<2>(p->second), // PG's last_epoch_clean
2004 true,
2005 osdmap->get_epoch()));
2006 sent_ready_to_merge_source.insert(src.first);
2007 }
2008 }
2009 }
2010
2011 void OSDService::clear_ready_to_merge(PG *pg)
2012 {
2013 std::lock_guard l(merge_lock);
2014 dout(10) << __func__ << " " << pg->pg_id << dendl;
2015 ready_to_merge_source.erase(pg->pg_id.pgid);
2016 ready_to_merge_target.erase(pg->pg_id.pgid);
2017 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2018 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2019 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2020 }
2021
2022 void OSDService::clear_sent_ready_to_merge()
2023 {
2024 std::lock_guard l(merge_lock);
2025 sent_ready_to_merge_source.clear();
2026 }
2027
2028 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
2029 {
2030 std::lock_guard l(merge_lock);
2031 auto i = sent_ready_to_merge_source.begin();
2032 while (i != sent_ready_to_merge_source.end()) {
2033 if (!osdmap->pg_exists(*i)) {
2034 dout(10) << __func__ << " " << *i << dendl;
2035 i = sent_ready_to_merge_source.erase(i);
2036 } else {
2037 dout(20) << __func__ << " exist " << *i << dendl;
2038 ++i;
2039 }
2040 }
2041 }
2042
2043 // ---
2044
2045 void OSDService::_queue_for_recovery(
2046 pg_awaiting_throttle_t p,
2047 uint64_t reserved_pushes)
2048 {
2049 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
2050
2051 uint64_t cost_for_queue = [this, &reserved_pushes, &p] {
2052 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
2053 return p.cost_per_object * reserved_pushes;
2054 } else {
2055 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
2056 * require very large costs for several messages in order to do any
2057 * meaningful amount of throttling. This branch should be removed after
2058 * Reef.
2059 */
2060 return cct->_conf->osd_recovery_cost;
2061 }
2062 }();
2063
2064 enqueue_back(
2065 OpSchedulerItem(
2066 unique_ptr<OpSchedulerItem::OpQueueable>(
2067 new PGRecovery(
2068 p.pg->get_pgid(),
2069 p.epoch_queued,
2070 reserved_pushes,
2071 p.priority)),
2072 cost_for_queue,
2073 cct->_conf->osd_recovery_priority,
2074 ceph_clock_now(),
2075 0,
2076 p.epoch_queued));
2077 }
2078
2079 // ====================================================================
2080 // OSD
2081
2082 #undef dout_prefix
2083 #define dout_prefix *_dout
2084
// Commands shared between OSD's console and admin console:
namespace ceph::osd_cmds {

// Handler for the tcmalloc 'heap' admin command family; implemented
// elsewhere and forward-declared here so asok_command() can dispatch
// to it.  Returns 0 on success or a negative errno; human-readable
// output goes to 'outos', error text to 'erros'.
int heap(CephContext& cct,
         const cmdmap_t& cmdmap,
         std::ostream& outos,
         std::ostream& erros);

} // namespace ceph::osd_cmds
2094
2095 int OSD::mkfs(CephContext *cct,
2096 std::unique_ptr<ObjectStore> store,
2097 uuid_d fsid,
2098 int whoami,
2099 string osdspec_affinity)
2100 {
2101 int ret;
2102
2103 OSDSuperblock sb;
2104 bufferlist sbbl;
2105 // if we are fed a uuid for this osd, use it.
2106 store->set_fsid(cct->_conf->osd_uuid);
2107
2108 ret = store->mkfs();
2109 if (ret) {
2110 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2111 << cpp_strerror(ret) << dendl;
2112 return ret;
2113 }
2114
2115 store->set_cache_shards(1); // doesn't matter for mkfs!
2116
2117 ret = store->mount();
2118 if (ret) {
2119 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2120 << cpp_strerror(ret) << dendl;
2121 return ret;
2122 }
2123
2124 auto umount_store = make_scope_guard([&] {
2125 store->umount();
2126 });
2127
2128 ObjectStore::CollectionHandle ch =
2129 store->open_collection(coll_t::meta());
2130 if (ch) {
2131 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2132 if (ret < 0) {
2133 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2134 return ret;
2135 }
2136 /* if we already have superblock, check content of superblock */
2137 dout(0) << " have superblock" << dendl;
2138 auto p = sbbl.cbegin();
2139 decode(sb, p);
2140 if (whoami != sb.whoami) {
2141 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2142 << dendl;
2143 return -EINVAL;
2144 }
2145 if (fsid != sb.cluster_fsid) {
2146 derr << "provided cluster fsid " << fsid
2147 << " != superblock's " << sb.cluster_fsid << dendl;
2148 return -EINVAL;
2149 }
2150 } else {
2151 // create superblock
2152 sb.cluster_fsid = fsid;
2153 sb.osd_fsid = store->get_fsid();
2154 sb.whoami = whoami;
2155 sb.compat_features = get_osd_initial_compat_set();
2156
2157 bufferlist bl;
2158 encode(sb, bl);
2159
2160 ObjectStore::CollectionHandle ch = store->create_new_collection(
2161 coll_t::meta());
2162 ObjectStore::Transaction t;
2163 t.create_collection(coll_t::meta(), 0);
2164 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2165 ret = store->queue_transaction(ch, std::move(t));
2166 if (ret) {
2167 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2168 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2169 return ret;
2170 }
2171 ch->flush();
2172 }
2173
2174 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2175 if (ret) {
2176 derr << "OSD::mkfs: failed to write fsid file: error "
2177 << cpp_strerror(ret) << dendl;
2178 }
2179 return ret;
2180 }
2181
2182 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2183 {
2184 char val[80];
2185 int r;
2186
2187 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2188 r = store->write_meta("magic", val);
2189 if (r < 0)
2190 return r;
2191
2192 snprintf(val, sizeof(val), "%d", whoami);
2193 r = store->write_meta("whoami", val);
2194 if (r < 0)
2195 return r;
2196
2197 cluster_fsid.print(val);
2198 r = store->write_meta("ceph_fsid", val);
2199 if (r < 0)
2200 return r;
2201
2202 string key = cct->_conf.get_val<string>("key");
2203 if (key.size()) {
2204 r = store->write_meta("osd_key", key);
2205 if (r < 0)
2206 return r;
2207 } else {
2208 string keyfile = cct->_conf.get_val<string>("keyfile");
2209 if (!keyfile.empty()) {
2210 bufferlist keybl;
2211 string err;
2212 r = keybl.read_file(keyfile.c_str(), &err);
2213 if (r < 0) {
2214 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2215 << err << ": " << cpp_strerror(r) << dendl;
2216 return r;
2217 }
2218 r = store->write_meta("osd_key", keybl.to_str());
2219 if (r < 0)
2220 return r;
2221 }
2222 }
2223 if (!osdspec_affinity.empty()) {
2224 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2225 if (r < 0)
2226 return r;
2227 }
2228
2229 r = store->write_meta("ceph_version_when_created", pretty_version_to_str());
2230 if (r < 0)
2231 return r;
2232
2233 ostringstream created_at;
2234 utime_t now = ceph_clock_now();
2235 now.gmtime(created_at);
2236 r = store->write_meta("created_at", created_at.str());
2237 if (r < 0)
2238 return r;
2239
2240 r = store->write_meta("ready", "ready");
2241 if (r < 0)
2242 return r;
2243
2244 return 0;
2245 }
2246
2247 int OSD::peek_meta(ObjectStore *store,
2248 std::string *magic,
2249 uuid_d *cluster_fsid,
2250 uuid_d *osd_fsid,
2251 int *whoami,
2252 ceph_release_t *require_osd_release)
2253 {
2254 string val;
2255
2256 int r = store->read_meta("magic", &val);
2257 if (r < 0)
2258 return r;
2259 *magic = val;
2260
2261 r = store->read_meta("whoami", &val);
2262 if (r < 0)
2263 return r;
2264 *whoami = atoi(val.c_str());
2265
2266 r = store->read_meta("ceph_fsid", &val);
2267 if (r < 0)
2268 return r;
2269 r = cluster_fsid->parse(val.c_str());
2270 if (!r)
2271 return -EINVAL;
2272
2273 r = store->read_meta("fsid", &val);
2274 if (r < 0) {
2275 *osd_fsid = uuid_d();
2276 } else {
2277 r = osd_fsid->parse(val.c_str());
2278 if (!r)
2279 return -EINVAL;
2280 }
2281
2282 r = store->read_meta("require_osd_release", &val);
2283 if (r >= 0) {
2284 *require_osd_release = ceph_release_from_name(val);
2285 }
2286
2287 return 0;
2288 }
2289
2290
2291 #undef dout_prefix
2292 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2293
2294 // cons/des
2295
// OSD constructor: wires together the messengers, mon/mgr clients,
// perf counters, op tracker, work queues and per-shard state.  Heavy
// startup work (mounting the store, booting) happens later in init();
// this only establishes object relationships and configuration.
// NOTE: member initializer order below must match the declaration
// order in OSD.h.
OSD::OSD(CephContext *cct_,
	 std::unique_ptr<ObjectStore> store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev,
	 ceph::async::io_context_pool& poolctx) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(create_logger()),
  recoverystate_perf(create_recoverystate_perf()),
  store(std::move(store_)),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
    ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this, poolctx)
{

  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // configure op tracking thresholds/history from current conf values
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // name the blkin trace endpoint after this OSD's id
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2398
2399 OSD::~OSD()
2400 {
2401 while (!shards.empty()) {
2402 delete shards.back();
2403 shards.pop_back();
2404 }
2405 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2406 cct->get_perfcounters_collection()->remove(logger);
2407 delete recoverystate_perf;
2408 delete logger;
2409 }
2410
2411 double OSD::get_tick_interval() const
2412 {
2413 // vary +/- 5% to avoid scrub scheduling livelocks
2414 constexpr auto delta = 0.05;
2415 return (OSD_TICK_INTERVAL *
2416 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2417 }
2418
// SIGINT/SIGTERM handler: log the signal and begin an orderly shutdown.
// Only these two signals are expected to be routed here.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2425
2426 int OSD::pre_init()
2427 {
2428 std::lock_guard lock(osd_lock);
2429 if (is_stopping())
2430 return 0;
2431
2432 if (store->test_mount_in_use()) {
2433 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2434 << "currently in use. (Is ceph-osd already running?)" << dendl;
2435 return -EBUSY;
2436 }
2437
2438 cct->_conf.add_observer(this);
2439 return 0;
2440 }
2441
// Determine this OSD's NUMA node and, when storage plus both network
// interfaces agree (and osd_numa_auto_affinity is set) or osd_numa_node
// is configured explicitly, pin all threads to that node's CPUs.
// Always returns 0; failures merely leave numa_node at -1 (no pinning).
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      // auto-affinity only when storage and both networks agree
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2 appears to mean the interface's ports span multiple numa
      // nodes (see get_iface_numa_node) -- TODO confirm
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
           << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	// NOTE(review): this overwrites the callee's return value with
	// -errno, which may be stale if the callee already returned a
	// negative errno -- confirm set_cpu_affinity_all_threads' contract
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2522
2523 // asok
2524
// Admin-socket ("asok") hook: bridges commands received on the admin
// socket to OSD::asok_command().  Only the asynchronous entry point is
// supported; the synchronous one aborts by design.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;  // back-pointer to the owning OSD; not owned by this hook
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Synchronous path is intentionally unreachable; all OSD admin
  // commands must go through call_async() below.
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   const bufferlist& inbl,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  // Forward the command to the OSD; a malformed command map
  // (bad_cmd_get) is reported through on_finish as -EINVAL rather
  // than propagated as an exception.
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2550
2551 std::set<int64_t> OSD::get_mapped_pools()
2552 {
2553 std::set<int64_t> pools;
2554 std::vector<spg_t> pgids;
2555 _get_pgids(&pgids);
2556 for (const auto &pgid : pgids) {
2557 pools.insert(pgid.pool());
2558 }
2559 return pools;
2560 }
2561
2562 OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2563 stringstream& ss,
2564 bool only_primary)
2565 {
2566 string pgidstr;
2567 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2568 ss << "no pgid specified";
2569 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2570 }
2571
2572 pg_t pgid;
2573 if (!pgid.parse(pgidstr.c_str())) {
2574 ss << "couldn't parse pgid '" << pgidstr << "'";
2575 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2576 }
2577
2578 spg_t pcand;
2579 PGRef pg;
2580 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2581 if (pg->is_primary() || !only_primary) {
2582 return OSD::PGRefOrError{pg, 0};
2583 }
2584
2585 ss << "not primary for pgid " << pgid;
2586 pg->unlock();
2587 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2588 } else {
2589 ss << "i don't have pgid " << pgid;
2590 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2591 }
2592 }
2593
2594 // note that the cmdmap is explicitly copied into asok_route_to_pg()
2595 int OSD::asok_route_to_pg(
2596 bool only_primary,
2597 std::string_view prefix,
2598 cmdmap_t cmdmap,
2599 Formatter* f,
2600 stringstream& ss,
2601 const bufferlist& inbl,
2602 bufferlist& outbl,
2603 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2604 {
2605 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2606
2607 if (!target_pg.has_value()) {
2608 // 'ss' and 'ret' already contain the error information
2609 on_finish(ret, ss.str(), outbl);
2610 return ret;
2611 }
2612
2613 // the PG was locked by locate_asok_target()
2614 try {
2615 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2616 (*target_pg)->unlock();
2617 return 0; // the pg handler calls on_finish directly
2618 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2619 (*target_pg)->unlock();
2620 ss << e.what();
2621 on_finish(ret, ss.str(), outbl);
2622 return -EINVAL;
2623 }
2624 }
2625
2626 void OSD::asok_command(
2627 std::string_view prefix, const cmdmap_t& cmdmap,
2628 Formatter *f,
2629 const bufferlist& inbl,
2630 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2631 {
2632 int ret = 0;
2633 stringstream ss; // stderr error message stream
2634 bufferlist outbl; // if empty at end, we'll dump formatter as output
2635
2636 // --- PG commands are routed here to PG::do_command ---
2637 if (prefix == "pg" ||
2638 prefix == "query" ||
2639 prefix == "log" ||
2640 prefix == "mark_unfound_lost" ||
2641 prefix == "list_unfound" ||
2642 prefix == "scrub" ||
2643 prefix == "deep_scrub"
2644 ) {
2645 string pgidstr;
2646 pg_t pgid;
2647 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2648 ss << "no pgid specified";
2649 ret = -EINVAL;
2650 goto out;
2651 }
2652 if (!pgid.parse(pgidstr.c_str())) {
2653 ss << "couldn't parse pgid '" << pgidstr << "'";
2654 ret = -EINVAL;
2655 goto out;
2656 }
2657 spg_t pcand;
2658 PGRef pg;
2659 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2660 (pg = _lookup_lock_pg(pcand))) {
2661 if (pg->is_primary()) {
2662 cmdmap_t new_cmdmap = cmdmap;
2663 try {
2664 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2665 pg->unlock();
2666 return; // the pg handler calls on_finish directly
2667 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2668 pg->unlock();
2669 ss << e.what();
2670 ret = -EINVAL;
2671 goto out;
2672 }
2673 } else {
2674 ss << "not primary for pgid " << pgid;
2675 // do not reply; they will get newer maps and realize they
2676 // need to resend.
2677 pg->unlock();
2678 ret = -EAGAIN;
2679 goto out;
2680 }
2681 } else {
2682 ss << "i don't have pgid " << pgid;
2683 ret = -ENOENT;
2684 }
2685 }
2686
2687 // --- PG commands that will be answered even if !primary ---
2688
2689 else if (prefix == "scrubdebug") {
2690 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2691 return;
2692 }
2693
2694 // --- OSD commands follow ---
2695
2696 else if (prefix == "status") {
2697 lock_guard l(osd_lock);
2698 f->open_object_section("status");
2699 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2700 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2701 f->dump_unsigned("whoami", superblock.whoami);
2702 f->dump_string("state", get_state_name(get_state()));
2703 f->dump_unsigned("oldest_map", superblock.oldest_map);
2704 f->dump_unsigned("cluster_osdmap_trim_lower_bound",
2705 superblock.cluster_osdmap_trim_lower_bound);
2706 f->dump_unsigned("newest_map", superblock.newest_map);
2707 f->dump_unsigned("num_pgs", num_pgs);
2708 f->close_section();
2709 } else if (prefix == "flush_journal") {
2710 store->flush_journal();
2711 } else if (prefix == "dump_ops_in_flight" ||
2712 prefix == "ops" ||
2713 prefix == "dump_blocked_ops" ||
2714 prefix == "dump_blocked_ops_count" ||
2715 prefix == "dump_historic_ops" ||
2716 prefix == "dump_historic_ops_by_duration" ||
2717 prefix == "dump_historic_slow_ops") {
2718
2719 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2720 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2721 will start to track new ops received afterwards.";
2722
2723 set<string> filters;
2724 vector<string> filter_str;
2725 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2726 copy(filter_str.begin(), filter_str.end(),
2727 inserter(filters, filters.end()));
2728 }
2729
2730 if (prefix == "dump_ops_in_flight" ||
2731 prefix == "ops") {
2732 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2733 ss << error_str;
2734 ret = -EINVAL;
2735 goto out;
2736 }
2737 }
2738 if (prefix == "dump_blocked_ops") {
2739 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2740 ss << error_str;
2741 ret = -EINVAL;
2742 goto out;
2743 }
2744 }
2745 if (prefix == "dump_blocked_ops_count") {
2746 if (!op_tracker.dump_ops_in_flight(f, true, filters, true)) {
2747 ss << error_str;
2748 ret = -EINVAL;
2749 goto out;
2750 }
2751 }
2752 if (prefix == "dump_historic_ops") {
2753 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2754 ss << error_str;
2755 ret = -EINVAL;
2756 goto out;
2757 }
2758 }
2759 if (prefix == "dump_historic_ops_by_duration") {
2760 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2761 ss << error_str;
2762 ret = -EINVAL;
2763 goto out;
2764 }
2765 }
2766 if (prefix == "dump_historic_slow_ops") {
2767 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2768 ss << error_str;
2769 ret = -EINVAL;
2770 goto out;
2771 }
2772 }
2773 } else if (prefix == "dump_op_pq_state") {
2774 f->open_object_section("pq");
2775 op_shardedwq.dump(f);
2776 f->close_section();
2777 } else if (prefix == "dump_blocklist") {
2778 list<pair<entity_addr_t,utime_t> > bl;
2779 list<pair<entity_addr_t,utime_t> > rbl;
2780 OSDMapRef curmap = service.get_osdmap();
2781 curmap->get_blocklist(&bl, &rbl);
2782
2783 f->open_array_section("blocklist");
2784 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2785 it != bl.end(); ++it) {
2786 f->open_object_section("entry");
2787 f->open_object_section("entity_addr_t");
2788 it->first.dump(f);
2789 f->close_section(); //entity_addr_t
2790 it->second.localtime(f->dump_stream("expire_time"));
2791 f->close_section(); //entry
2792 }
2793 f->close_section(); //blocklist
2794 f->open_array_section("range_blocklist");
2795 for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
2796 it != rbl.end(); ++it) {
2797 f->open_object_section("entry");
2798 f->open_object_section("entity_addr_t");
2799 it->first.dump(f);
2800 f->close_section(); //entity_addr_t
2801 it->second.localtime(f->dump_stream("expire_time"));
2802 f->close_section(); //entry
2803 }
2804 f->close_section(); //blocklist
2805 } else if (prefix == "dump_watchers") {
2806 list<obj_watch_item_t> watchers;
2807 // scan pg's
2808 vector<PGRef> pgs;
2809 _get_pgs(&pgs);
2810 for (auto& pg : pgs) {
2811 list<obj_watch_item_t> pg_watchers;
2812 pg->get_watchers(&pg_watchers);
2813 watchers.splice(watchers.end(), pg_watchers);
2814 }
2815
2816 f->open_array_section("watchers");
2817 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2818 it != watchers.end(); ++it) {
2819
2820 f->open_object_section("watch");
2821
2822 f->dump_string("namespace", it->obj.nspace);
2823 f->dump_string("object", it->obj.oid.name);
2824
2825 f->open_object_section("entity_name");
2826 it->wi.name.dump(f);
2827 f->close_section(); //entity_name_t
2828
2829 f->dump_unsigned("cookie", it->wi.cookie);
2830 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2831
2832 f->open_object_section("entity_addr_t");
2833 it->wi.addr.dump(f);
2834 f->close_section(); //entity_addr_t
2835
2836 f->close_section(); //watch
2837 }
2838
2839 f->close_section(); //watchers
2840 } else if (prefix == "dump_recovery_reservations") {
2841 f->open_object_section("reservations");
2842 f->open_object_section("local_reservations");
2843 service.local_reserver.dump(f);
2844 f->close_section();
2845 f->open_object_section("remote_reservations");
2846 service.remote_reserver.dump(f);
2847 f->close_section();
2848 f->close_section();
2849 } else if (prefix == "dump_scrub_reservations") {
2850 f->open_object_section("scrub_reservations");
2851 service.get_scrub_services().dump_scrub_reservations(f);
2852 f->close_section();
2853 } else if (prefix == "get_latest_osdmap") {
2854 get_latest_osdmap();
2855 } else if (prefix == "set_heap_property") {
2856 string property;
2857 int64_t value = 0;
2858 string error;
2859 bool success = false;
2860 if (!cmd_getval(cmdmap, "property", property)) {
2861 error = "unable to get property";
2862 success = false;
2863 } else if (!cmd_getval(cmdmap, "value", value)) {
2864 error = "unable to get value";
2865 success = false;
2866 } else if (value < 0) {
2867 error = "negative value not allowed";
2868 success = false;
2869 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2870 error = "invalid property";
2871 success = false;
2872 } else {
2873 success = true;
2874 }
2875 f->open_object_section("result");
2876 f->dump_string("error", error);
2877 f->dump_bool("success", success);
2878 f->close_section();
2879 } else if (prefix == "get_heap_property") {
2880 string property;
2881 size_t value = 0;
2882 string error;
2883 bool success = false;
2884 if (!cmd_getval(cmdmap, "property", property)) {
2885 error = "unable to get property";
2886 success = false;
2887 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2888 error = "invalid property";
2889 success = false;
2890 } else {
2891 success = true;
2892 }
2893 f->open_object_section("result");
2894 f->dump_string("error", error);
2895 f->dump_bool("success", success);
2896 f->dump_int("value", value);
2897 f->close_section();
2898 } else if (prefix == "dump_objectstore_kv_stats") {
2899 store->get_db_statistics(f);
2900 } else if (prefix == "dump_scrubs") {
2901 service.get_scrub_services().dump_scrubs(f);
2902 } else if (prefix == "calc_objectstore_db_histogram") {
2903 store->generate_db_histogram(f);
2904 } else if (prefix == "flush_store_cache") {
2905 store->flush_cache(&ss);
2906 } else if (prefix == "rotate-stored-key") {
2907 store->write_meta("osd_key", inbl.to_str());
2908 } else if (prefix == "dump_pgstate_history") {
2909 f->open_object_section("pgstate_history");
2910 f->open_array_section("pgs");
2911 vector<PGRef> pgs;
2912 _get_pgs(&pgs);
2913 for (auto& pg : pgs) {
2914 f->open_object_section("pg");
2915 f->dump_stream("pg") << pg->pg_id;
2916 f->dump_string("currently", pg->get_current_state());
2917 pg->dump_pgstate_history(f);
2918 f->close_section();
2919 }
2920 f->close_section();
2921 f->close_section();
2922 } else if (prefix == "compact") {
2923 dout(1) << "triggering manual compaction" << dendl;
2924 auto start = ceph::coarse_mono_clock::now();
2925 store->compact();
2926 auto end = ceph::coarse_mono_clock::now();
2927 double duration = std::chrono::duration<double>(end-start).count();
2928 dout(1) << "finished manual compaction in "
2929 << duration
2930 << " seconds" << dendl;
2931 f->open_object_section("compact_result");
2932 f->dump_float("elapsed_time", duration);
2933 f->close_section();
2934 } else if (prefix == "get_mapped_pools") {
2935 f->open_array_section("mapped_pools");
2936 set<int64_t> poollist = get_mapped_pools();
2937 for (auto pool : poollist) {
2938 f->dump_int("pool_id", pool);
2939 }
2940 f->close_section();
2941 } else if (prefix == "smart") {
2942 string devid;
2943 cmd_getval(cmdmap, "devid", devid);
2944 ostringstream out;
2945 probe_smart(devid, out);
2946 outbl.append(out.str());
2947 } else if (prefix == "list_devices") {
2948 set<string> devnames;
2949 store->get_devices(&devnames);
2950 f->open_array_section("list_devices");
2951 for (auto dev : devnames) {
2952 if (dev.find("dm-") == 0) {
2953 continue;
2954 }
2955 string err;
2956 f->open_object_section("device");
2957 f->dump_string("device", "/dev/" + dev);
2958 f->dump_string("device_id", get_device_id(dev, &err));
2959 f->close_section();
2960 }
2961 f->close_section();
2962 } else if (prefix == "send_beacon") {
2963 lock_guard l(osd_lock);
2964 if (is_active()) {
2965 send_beacon(ceph::coarse_mono_clock::now());
2966 }
2967 }
2968
2969 else if (prefix == "cluster_log") {
2970 vector<string> msg;
2971 cmd_getval(cmdmap, "message", msg);
2972 if (msg.empty()) {
2973 ret = -EINVAL;
2974 ss << "ignoring empty log message";
2975 goto out;
2976 }
2977 string message = msg.front();
2978 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2979 message += " " + *a;
2980 string lvl;
2981 cmd_getval(cmdmap, "level", lvl);
2982 clog_type level = string_to_clog_type(lvl);
2983 if (level < 0) {
2984 ret = -EINVAL;
2985 ss << "unknown level '" << lvl << "'";
2986 goto out;
2987 }
2988 clog->do_log(level, message);
2989 }
2990
2991 else if (prefix == "bench") {
2992 // default count 1G, size 4MB
2993 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2994 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2995 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2996 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
2997 double elapsed = 0.0;
2998
2999 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
3000 if (ret != 0) {
3001 goto out;
3002 }
3003
3004 double rate = count / elapsed;
3005 double iops = rate / bsize;
3006 f->open_object_section("osd_bench_results");
3007 f->dump_int("bytes_written", count);
3008 f->dump_int("blocksize", bsize);
3009 f->dump_float("elapsed_sec", elapsed);
3010 f->dump_float("bytes_per_sec", rate);
3011 f->dump_float("iops", iops);
3012 f->close_section();
3013 }
3014
3015 else if (prefix == "flush_pg_stats") {
3016 mgrc.send_pgstats();
3017 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
3018 }
3019
3020 else if (prefix == "heap") {
3021 std::stringstream outss;
3022 ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
3023 outbl.append(outss);
3024 }
3025
3026 else if (prefix == "debug dump_missing") {
3027 f->open_array_section("pgs");
3028 vector<PGRef> pgs;
3029 _get_pgs(&pgs);
3030 for (auto& pg : pgs) {
3031 string s = stringify(pg->pg_id);
3032 f->open_array_section(s.c_str());
3033 pg->lock();
3034 pg->dump_missing(f);
3035 pg->unlock();
3036 f->close_section();
3037 }
3038 f->close_section();
3039 }
3040
3041 else if (prefix == "debug kick_recovery_wq") {
3042 int64_t delay;
3043 cmd_getval(cmdmap, "delay", delay);
3044 ostringstream oss;
3045 oss << delay;
3046 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
3047 if (ret != 0) {
3048 ss << "kick_recovery_wq: error setting "
3049 << "osd_recovery_delay_start to '" << delay << "': error "
3050 << ret;
3051 goto out;
3052 }
3053 cct->_conf.apply_changes(nullptr);
3054 ss << "kicking recovery queue. set osd_recovery_delay_start "
3055 << "to " << cct->_conf->osd_recovery_delay_start;
3056 }
3057
3058 else if (prefix == "cpu_profiler") {
3059 ostringstream ds;
3060 string arg;
3061 cmd_getval(cmdmap, "arg", arg);
3062 vector<string> argvec;
3063 get_str_vec(arg, argvec);
3064 cpu_profiler_handle_command(argvec, ds);
3065 outbl.append(ds.str());
3066 }
3067
3068 else if (prefix == "dump_pg_recovery_stats") {
3069 lock_guard l(osd_lock);
3070 pg_recovery_stats.dump_formatted(f);
3071 }
3072
3073 else if (prefix == "reset_pg_recovery_stats") {
3074 lock_guard l(osd_lock);
3075 pg_recovery_stats.reset();
3076 }
3077
3078 else if (prefix == "perf histogram dump") {
3079 std::string logger;
3080 std::string counter;
3081 cmd_getval(cmdmap, "logger", logger);
3082 cmd_getval(cmdmap, "counter", counter);
3083 cct->get_perfcounters_collection()->dump_formatted_histograms(
3084 f, false, logger, counter);
3085 }
3086
3087 else if (prefix == "cache drop") {
3088 lock_guard l(osd_lock);
3089 dout(20) << "clearing all caches" << dendl;
3090 // Clear the objectstore's cache - onode and buffer for Bluestore,
3091 // system's pagecache for Filestore
3092 ret = store->flush_cache(&ss);
3093 if (ret < 0) {
3094 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
3095 goto out;
3096 }
3097 // Clear the objectcontext cache (per PG)
3098 vector<PGRef> pgs;
3099 _get_pgs(&pgs);
3100 for (auto& pg: pgs) {
3101 pg->clear_cache();
3102 }
3103 }
3104
3105 else if (prefix == "cache status") {
3106 lock_guard l(osd_lock);
3107 int obj_ctx_count = 0;
3108 vector<PGRef> pgs;
3109 _get_pgs(&pgs);
3110 for (auto& pg: pgs) {
3111 obj_ctx_count += pg->get_cache_obj_count();
3112 }
3113 f->open_object_section("cache_status");
3114 f->dump_int("object_ctx", obj_ctx_count);
3115 store->dump_cache_stats(f);
3116 f->close_section();
3117 }
3118
3119 else if (prefix == "scrub_purged_snaps") {
3120 lock_guard l(osd_lock);
3121 scrub_purged_snaps();
3122 }
3123
3124 else if (prefix == "dump_osd_network") {
3125 lock_guard l(osd_lock);
3126 int64_t value = 0;
3127 if (!(cmd_getval(cmdmap, "value", value))) {
3128 // Convert milliseconds to microseconds
3129 value = static_cast<double>(g_conf().get_val<double>(
3130 "mon_warn_on_slow_ping_time")) * 1000;
3131 if (value == 0) {
3132 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3133 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3134 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3135 }
3136 } else {
3137 // Convert user input to microseconds
3138 value *= 1000;
3139 }
3140 if (value < 0) value = 0;
3141
3142 struct osd_ping_time_t {
3143 uint32_t pingtime;
3144 int to;
3145 bool back;
3146 std::array<uint32_t,3> times;
3147 std::array<uint32_t,3> min;
3148 std::array<uint32_t,3> max;
3149 uint32_t last;
3150 uint32_t last_update;
3151
3152 bool operator<(const osd_ping_time_t& rhs) const {
3153 if (pingtime < rhs.pingtime)
3154 return true;
3155 if (pingtime > rhs.pingtime)
3156 return false;
3157 if (to < rhs.to)
3158 return true;
3159 if (to > rhs.to)
3160 return false;
3161 return back;
3162 }
3163 };
3164
3165 set<osd_ping_time_t> sorted;
3166 // Get pingtimes under lock and not on the stack
3167 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3168 service.get_hb_pingtime(pingtimes);
3169 for (auto j : *pingtimes) {
3170 if (j.second.last_update == 0)
3171 continue;
3172 osd_ping_time_t item;
3173 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3174 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3175 if (item.pingtime >= value) {
3176 item.to = j.first;
3177 item.times[0] = j.second.back_pingtime[0];
3178 item.times[1] = j.second.back_pingtime[1];
3179 item.times[2] = j.second.back_pingtime[2];
3180 item.min[0] = j.second.back_min[0];
3181 item.min[1] = j.second.back_min[1];
3182 item.min[2] = j.second.back_min[2];
3183 item.max[0] = j.second.back_max[0];
3184 item.max[1] = j.second.back_max[1];
3185 item.max[2] = j.second.back_max[2];
3186 item.last = j.second.back_last;
3187 item.back = true;
3188 item.last_update = j.second.last_update;
3189 sorted.emplace(item);
3190 }
3191 if (j.second.front_last == 0)
3192 continue;
3193 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3194 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3195 if (item.pingtime >= value) {
3196 item.to = j.first;
3197 item.times[0] = j.second.front_pingtime[0];
3198 item.times[1] = j.second.front_pingtime[1];
3199 item.times[2] = j.second.front_pingtime[2];
3200 item.min[0] = j.second.front_min[0];
3201 item.min[1] = j.second.front_min[1];
3202 item.min[2] = j.second.front_min[2];
3203 item.max[0] = j.second.front_max[0];
3204 item.max[1] = j.second.front_max[1];
3205 item.max[2] = j.second.front_max[2];
3206 item.last = j.second.front_last;
3207 item.last_update = j.second.last_update;
3208 item.back = false;
3209 sorted.emplace(item);
3210 }
3211 }
3212 delete pingtimes;
3213 //
3214 // Network ping times (1min 5min 15min)
3215 f->open_object_section("network_ping_times");
3216 f->dump_int("threshold", value / 1000);
3217 f->open_array_section("entries");
3218 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3219 ceph_assert(sitem.pingtime >= value);
3220 f->open_object_section("entry");
3221
3222 const time_t lu(sitem.last_update);
3223 char buffer[26];
3224 string lustr(ctime_r(&lu, buffer));
3225 lustr.pop_back(); // Remove trailing \n
3226 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3227 f->dump_string("last update", lustr);
3228 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3229 f->dump_int("from osd", whoami);
3230 f->dump_int("to osd", sitem.to);
3231 f->dump_string("interface", (sitem.back ? "back" : "front"));
3232 f->open_object_section("average");
3233 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3234 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3235 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3236 f->close_section(); // average
3237 f->open_object_section("min");
3238 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3239 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3240 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3241 f->close_section(); // min
3242 f->open_object_section("max");
3243 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3244 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3245 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3246 f->close_section(); // max
3247 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3248 f->close_section(); // entry
3249 }
3250 f->close_section(); // entries
3251 f->close_section(); // network_ping_times
3252 } else if (prefix == "dump_pool_statfs") {
3253 lock_guard l(osd_lock);
3254
3255 int64_t p = 0;
3256 if (!(cmd_getval(cmdmap, "poolid", p))) {
3257 ss << "Error dumping pool statfs: no poolid provided";
3258 ret = -EINVAL;
3259 goto out;
3260 }
3261
3262 store_statfs_t st;
3263 bool per_pool_omap_stats = false;
3264
3265 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3266 if (ret < 0) {
3267 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3268 goto out;
3269 } else {
3270 ss << "dumping pool statfs...";
3271 f->open_object_section("pool_statfs");
3272 f->dump_int("poolid", p);
3273 st.dump(f);
3274 f->close_section();
3275 }
3276 } else {
3277 ceph_abort_msg("broken asok registration");
3278 }
3279
3280 out:
3281 on_finish(ret, ss.str(), outbl);
3282 }
3283
// Run a synthetic write benchmark against the local object store
// (backend for "ceph tell osd.N bench").
//
// count   - total number of bytes to write.
// bsize   - per-write block size in bytes.
// osize   - if non-zero (with onum), size of each pre-created target object;
//           writes then go at random offsets into those objects.
// onum    - if non-zero (with osize), number of pre-created target objects.
// elapsed - out: wall-clock seconds the timed write phase took.
// ss      - stream for human-readable error text on rejection.
//
// Returns 0 on success, -EINVAL if the requested sizes/counts exceed the
// configured safety caps (osd_bench_*).
int OSD::run_osd_bench_test(
  int64_t count,
  int64_t bsize,
  int64_t osize,
  int64_t onum,
  double *elapsed,
  ostream &ss)
{
  int ret = 0;
  // Seed for the random payload bytes / object+offset selection below.
  srand(time(NULL) % (unsigned long) -1);
  uint32_t duration = cct->_conf->osd_bench_duration;

  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
    // let us limit the block size because the next checks rely on it
    // having a sane value. If we allow any block size to be set things
    // can still go sideways.
    ss << "block 'size' values are capped at "
       << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
       << " a higher value, please adjust 'osd_bench_max_block_size'";
    ret = -EINVAL;
    return ret;
  } else if (bsize < (int64_t) (1 << 20)) {
    // entering the realm of small block sizes.
    // limit the count to a sane value, assuming a configurable amount of
    // IOPS and duration, so that the OSD doesn't get hung up on this,
    // preventing timeouts from going off
    int64_t max_count =
      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
    if (count > max_count) {
      ss << "'count' values greater than " << max_count
         << " for a block size of " << byte_u_t(bsize) << ", assuming "
         << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
         << " for " << duration << " seconds,"
         << " can cause ill effects on osd. "
         << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
         << " value if you wish to use a higher 'count'.";
      ret = -EINVAL;
      return ret;
    }
  } else {
    // 1MB block sizes are big enough so that we get more stuff done.
    // However, to avoid the osd from getting hung on this and having
    // timers being triggered, we are going to limit the count assuming
    // a configurable throughput and duration.
    // NOTE: max_count is the total amount of bytes that we believe we
    //       will be able to write during 'duration' for the given
    //       throughput.  The block size hardly impacts this unless it's
    //       way too big.  Given we already check how big the block size
    //       is, it's safe to assume everything will check out.
    int64_t max_count =
      cct->_conf->osd_bench_large_size_max_throughput * duration;
    if (count > max_count) {
      ss << "'count' values greater than " << max_count
         << " for a block size of " << byte_u_t(bsize) << ", assuming "
         << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
         << " for " << duration << " seconds,"
         << " can cause ill effects on osd. "
         << " Please adjust 'osd_bench_large_size_max_throughput'"
         << " with a higher value if you wish to use a higher 'count'.";
      ret = -EINVAL;
      return ret;
    }
  }

  // Clamp the block size to the object size so the random-offset math
  // below (osize / bsize) is always >= 1, avoiding a divide-by-zero.
  if (osize && bsize > osize) {
    bsize = osize;
  }

  dout(1) << " bench count " << count
          << " bsize " << byte_u_t(bsize) << dendl;

  // Accumulates removals of everything we create; queued once at the end.
  ObjectStore::Transaction cleanupt;

  if (osize && onum) {
    // Pre-create 'onum' objects of 'osize' bytes; the timed loop then
    // overwrites random offsets within them instead of creating objects.
    bufferlist bl;
    bufferptr bp(osize);
    memset(bp.c_str(), 'a', bp.length());
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();
    for (int i=0; i<onum; ++i) {
      char nm[30];
      snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      // NOTE(review): coll_t() here vs coll_t::meta() in the timed loop —
      // these appear to address the same meta collection; confirm.
      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      cleanupt.remove(coll_t(), ghobject_t(soid));
    }
  }

  // Barrier: make sure the prefill writes are committed before timing starts.
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }

  bufferlist bl;
  utime_t start = ceph_clock_now();
  for (int64_t pos = 0; pos < count; pos += bsize) {
    char nm[34];
    unsigned offset = 0;
    bufferptr bp(bsize);
    memset(bp.c_str(), rand() & 0xff, bp.length());
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();
    if (onum && osize) {
      // Overwrite a random bsize-aligned offset of a random prefilled object.
      snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
      offset = rand() % (osize / bsize) * bsize;
    } else {
      // One fresh object per write; cleaned up below.
      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
    }
    object_t oid(nm);
    hobject_t soid(sobject_t(oid, 0));
    ObjectStore::Transaction t;
    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
    store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    if (!onum || !osize) {
      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }
    bl.clear();
  }

  // Barrier: wait for all timed writes to commit so 'elapsed' covers them.
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }
  utime_t end = ceph_clock_now();
  *elapsed = end - start;

  // clean up
  store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }

  return ret;
}
3428
3429 class TestOpsSocketHook : public AdminSocketHook {
3430 OSDService *service;
3431 ObjectStore *store;
3432 public:
3433 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3434 int call(std::string_view command, const cmdmap_t& cmdmap,
3435 const bufferlist&,
3436 Formatter *f,
3437 std::ostream& errss,
3438 bufferlist& out) override {
3439 int r = 0;
3440 stringstream outss;
3441 try {
3442 test_ops(service, store, command, cmdmap, outss);
3443 out.append(outss);
3444 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3445 errss << e.what();
3446 r = -EINVAL;
3447 }
3448 return r;
3449 }
3450 void test_ops(OSDService *service, ObjectStore *store,
3451 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3452
3453 };
3454
3455 class OSD::C_Tick : public Context {
3456 OSD *osd;
3457 public:
3458 explicit C_Tick(OSD *o) : osd(o) {}
3459 void finish(int r) override {
3460 osd->tick();
3461 }
3462 };
3463
3464 class OSD::C_Tick_WithoutOSDLock : public Context {
3465 OSD *osd;
3466 public:
3467 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3468 void finish(int r) override {
3469 osd->tick_without_osd_lock();
3470 }
3471 };
3472
3473 int OSD::enable_disable_fuse(bool stop)
3474 {
3475 #ifdef HAVE_LIBFUSE
3476 int r;
3477 string mntpath = cct->_conf->osd_data + "/fuse";
3478 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3479 dout(1) << __func__ << " disabling" << dendl;
3480 fuse_store->stop();
3481 delete fuse_store;
3482 fuse_store = NULL;
3483 r = ::rmdir(mntpath.c_str());
3484 if (r < 0) {
3485 r = -errno;
3486 derr << __func__ << " failed to rmdir " << mntpath << ": "
3487 << cpp_strerror(r) << dendl;
3488 return r;
3489 }
3490 return 0;
3491 }
3492 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3493 dout(1) << __func__ << " enabling" << dendl;
3494 r = ::mkdir(mntpath.c_str(), 0700);
3495 if (r < 0)
3496 r = -errno;
3497 if (r < 0 && r != -EEXIST) {
3498 derr << __func__ << " unable to create " << mntpath << ": "
3499 << cpp_strerror(r) << dendl;
3500 return r;
3501 }
3502 fuse_store = new FuseStore(store.get(), mntpath);
3503 r = fuse_store->start();
3504 if (r < 0) {
3505 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3506 delete fuse_store;
3507 fuse_store = NULL;
3508 return r;
3509 }
3510 }
3511 #endif // HAVE_LIBFUSE
3512 return 0;
3513 }
3514
3515 size_t OSD::get_num_cache_shards()
3516 {
3517 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3518 }
3519
3520 int OSD::get_num_op_shards()
3521 {
3522 if (cct->_conf->osd_op_num_shards)
3523 return cct->_conf->osd_op_num_shards;
3524 if (store_is_rotational)
3525 return cct->_conf->osd_op_num_shards_hdd;
3526 else
3527 return cct->_conf->osd_op_num_shards_ssd;
3528 }
3529
3530 int OSD::get_num_op_threads()
3531 {
3532 if (cct->_conf->osd_op_num_threads_per_shard)
3533 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3534 if (store_is_rotational)
3535 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3536 else
3537 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3538 }
3539
3540 float OSD::get_osd_recovery_sleep()
3541 {
3542 if (cct->_conf->osd_recovery_sleep)
3543 return cct->_conf->osd_recovery_sleep;
3544 if (!store_is_rotational && !journal_is_rotational)
3545 return cct->_conf->osd_recovery_sleep_ssd;
3546 else if (store_is_rotational && !journal_is_rotational)
3547 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3548 else
3549 return cct->_conf->osd_recovery_sleep_hdd;
3550 }
3551
3552 float OSD::get_osd_delete_sleep()
3553 {
3554 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3555 if (osd_delete_sleep > 0)
3556 return osd_delete_sleep;
3557 if (!store_is_rotational && !journal_is_rotational)
3558 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3559 if (store_is_rotational && !journal_is_rotational)
3560 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3561 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3562 }
3563
3564 int OSD::get_recovery_max_active()
3565 {
3566 if (cct->_conf->osd_recovery_max_active)
3567 return cct->_conf->osd_recovery_max_active;
3568 if (store_is_rotational)
3569 return cct->_conf->osd_recovery_max_active_hdd;
3570 else
3571 return cct->_conf->osd_recovery_max_active_ssd;
3572 }
3573
3574 float OSD::get_osd_snap_trim_sleep()
3575 {
3576 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3577 if (osd_snap_trim_sleep > 0)
3578 return osd_snap_trim_sleep;
3579 if (!store_is_rotational && !journal_is_rotational)
3580 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3581 if (store_is_rotational && !journal_is_rotational)
3582 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3583 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3584 }
3585
// One-time startup initialization of the OSD daemon.
//
// Mounts the objectstore, validates the superblock and compat features,
// loads the current OSDMap and PGs, wires up the messengers and mon/mgr
// clients, starts timers and worker threads, authenticates with the
// monitors, and finally kicks off the boot process.
//
// Returns 0 on success (including an early return if stopping); on error
// returns a negative errno after unwinding via the 'out' label (which
// disables fuse and unmounts/releases the store).  On authentication or
// crush-update failure the process exits directly.
int OSD::init()
{
  // Declared up front because the error path below uses goto, which may
  // not jump over initializations.
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;
  tracing::osd::tracer.init("osd");
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  // Remember which release this store last required, from store metadata.
  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());
  if (!service.meta_ch) {
    derr << "OSD:init: unable to open meta collection"
         << dendl;
    r = -ENOENT;
    goto out;
  }
  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      // Only fatal when the operator asked for the startup check.
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // Refuse to run against an on-disk format newer than this binary supports.
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      // A PG collection for a pool the map no longer knows must have a
      // preserved pg_pool_t record; otherwise it is a pre-luminous
      // deletion that cannot be completed here.
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;

    if (!superblock.cluster_osdmap_trim_lower_bound) {
      superblock.cluster_osdmap_trim_lower_bound = superblock.oldest_map;
    }

    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
    dout(2) << "compacting object store's omap" << dendl;
    store->compact();
  }

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger's auth_client will be set up by monc->init() later.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
        set_perf_queries(config_payload);
      },
      [this] {
        return get_perf_reports();
      });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);

  // Prime pending split/merge state for any PGs whose maps lag the
  // current osdmap.
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // Drop osd_lock while talking to the monitors; re-acquired below.
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out"
	   <<" -- maybe I have a clock skew against the monitors?" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
         << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  // Override a few options if mclock scheduler is enabled.
  maybe_override_sleep_options_for_qos();
  maybe_override_cost_for_qos();
  maybe_override_options_for_qos();
  maybe_override_max_osd_capacity_for_qos();

  return 0;

out:
  // Error path: tear down anything mounted so far and release the store.
  enable_disable_fuse(true);
  store->umount();
  store.reset();
  return r;
}
4003
4004 void OSD::final_init()
4005 {
4006 AdminSocket *admin_socket = cct->get_admin_socket();
4007 asok_hook = new OSDSocketHook(this);
4008 int r = admin_socket->register_command("status", asok_hook,
4009 "high-level status of OSD");
4010 ceph_assert(r == 0);
4011 r = admin_socket->register_command("flush_journal",
4012 asok_hook,
4013 "flush the journal to permanent store");
4014 ceph_assert(r == 0);
4015 r = admin_socket->register_command("dump_ops_in_flight " \
4016 "name=filterstr,type=CephString,n=N,req=false",
4017 asok_hook,
4018 "show the ops currently in flight");
4019 ceph_assert(r == 0);
4020 r = admin_socket->register_command("ops " \
4021 "name=filterstr,type=CephString,n=N,req=false",
4022 asok_hook,
4023 "show the ops currently in flight");
4024 ceph_assert(r == 0);
4025 r = admin_socket->register_command("dump_blocked_ops " \
4026 "name=filterstr,type=CephString,n=N,req=false",
4027 asok_hook,
4028 "show the blocked ops currently in flight");
4029 ceph_assert(r == 0);
4030 r = admin_socket->register_command("dump_blocked_ops_count " \
4031 "name=filterstr,type=CephString,n=N,req=false",
4032 asok_hook,
4033 "show the count of blocked ops currently in flight");
4034 ceph_assert(r == 0);
4035 r = admin_socket->register_command("dump_historic_ops " \
4036 "name=filterstr,type=CephString,n=N,req=false",
4037 asok_hook,
4038 "show recent ops");
4039 ceph_assert(r == 0);
4040 r = admin_socket->register_command("dump_historic_slow_ops " \
4041 "name=filterstr,type=CephString,n=N,req=false",
4042 asok_hook,
4043 "show slowest recent ops");
4044 ceph_assert(r == 0);
4045 r = admin_socket->register_command("dump_historic_ops_by_duration " \
4046 "name=filterstr,type=CephString,n=N,req=false",
4047 asok_hook,
4048 "show slowest recent ops, sorted by duration");
4049 ceph_assert(r == 0);
4050 r = admin_socket->register_command("dump_op_pq_state",
4051 asok_hook,
4052 "dump op queue state");
4053 ceph_assert(r == 0);
4054 r = admin_socket->register_command("dump_blocklist",
4055 asok_hook,
4056 "dump blocklisted clients and times");
4057 ceph_assert(r == 0);
4058 r = admin_socket->register_command("dump_watchers",
4059 asok_hook,
4060 "show clients which have active watches,"
4061 " and on which objects");
4062 ceph_assert(r == 0);
4063 r = admin_socket->register_command("dump_recovery_reservations",
4064 asok_hook,
4065 "show recovery reservations");
4066 ceph_assert(r == 0);
4067 r = admin_socket->register_command("dump_scrub_reservations",
4068 asok_hook,
4069 "show scrub reservations");
4070 ceph_assert(r == 0);
4071 r = admin_socket->register_command("get_latest_osdmap",
4072 asok_hook,
4073 "force osd to update the latest map from "
4074 "the mon");
4075 ceph_assert(r == 0);
4076
4077 r = admin_socket->register_command("set_heap_property " \
4078 "name=property,type=CephString " \
4079 "name=value,type=CephInt",
4080 asok_hook,
4081 "update malloc extension heap property");
4082 ceph_assert(r == 0);
4083
4084 r = admin_socket->register_command("get_heap_property " \
4085 "name=property,type=CephString",
4086 asok_hook,
4087 "get malloc extension heap property");
4088 ceph_assert(r == 0);
4089
4090 r = admin_socket->register_command("dump_objectstore_kv_stats",
4091 asok_hook,
4092 "print statistics of kvdb which used by bluestore");
4093 ceph_assert(r == 0);
4094
4095 r = admin_socket->register_command("dump_scrubs",
4096 asok_hook,
4097 "print scheduled scrubs");
4098 ceph_assert(r == 0);
4099
4100 r = admin_socket->register_command("calc_objectstore_db_histogram",
4101 asok_hook,
4102 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
4103 ceph_assert(r == 0);
4104
4105 r = admin_socket->register_command("flush_store_cache",
4106 asok_hook,
4107 "Flush bluestore internal cache");
4108 ceph_assert(r == 0);
4109 r = admin_socket->register_command("rotate-stored-key",
4110 asok_hook,
4111 "Update the stored osd_key");
4112 ceph_assert(r == 0);
4113 r = admin_socket->register_command("dump_pgstate_history",
4114 asok_hook,
4115 "show recent state history");
4116 ceph_assert(r == 0);
4117
4118 r = admin_socket->register_command("compact",
4119 asok_hook,
4120 "Commpact object store's omap."
4121 " WARNING: Compaction probably slows your requests");
4122 ceph_assert(r == 0);
4123
4124 r = admin_socket->register_command("get_mapped_pools",
4125 asok_hook,
4126 "dump pools whose PG(s) are mapped to this OSD.");
4127
4128 ceph_assert(r == 0);
4129
4130 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
4131 asok_hook,
4132 "probe OSD devices for SMART data.");
4133
4134 ceph_assert(r == 0);
4135
4136 r = admin_socket->register_command("list_devices",
4137 asok_hook,
4138 "list OSD devices.");
4139 r = admin_socket->register_command("send_beacon",
4140 asok_hook,
4141 "send OSD beacon to mon immediately");
4142
4143 r = admin_socket->register_command(
4144 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
4145 "Dump osd heartbeat network ping times");
4146 ceph_assert(r == 0);
4147
4148 r = admin_socket->register_command(
4149 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
4150 "Dump store's statistics for the given pool");
4151 ceph_assert(r == 0);
4152
4153 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
4154 // Note: pools are CephString instead of CephPoolname because
4155 // these commands traditionally support both pool names and numbers
4156 r = admin_socket->register_command(
4157 "setomapval " \
4158 "name=pool,type=CephString " \
4159 "name=objname,type=CephObjectname " \
4160 "name=key,type=CephString "\
4161 "name=val,type=CephString",
4162 test_ops_hook,
4163 "set omap key");
4164 ceph_assert(r == 0);
4165 r = admin_socket->register_command(
4166 "rmomapkey " \
4167 "name=pool,type=CephString " \
4168 "name=objname,type=CephObjectname " \
4169 "name=key,type=CephString",
4170 test_ops_hook,
4171 "remove omap key");
4172 ceph_assert(r == 0);
4173 r = admin_socket->register_command(
4174 "setomapheader " \
4175 "name=pool,type=CephString " \
4176 "name=objname,type=CephObjectname " \
4177 "name=header,type=CephString",
4178 test_ops_hook,
4179 "set omap header");
4180 ceph_assert(r == 0);
4181
4182 r = admin_socket->register_command(
4183 "getomap " \
4184 "name=pool,type=CephString " \
4185 "name=objname,type=CephObjectname",
4186 test_ops_hook,
4187 "output entire object map");
4188 ceph_assert(r == 0);
4189
4190 r = admin_socket->register_command(
4191 "truncobj " \
4192 "name=pool,type=CephString " \
4193 "name=objname,type=CephObjectname " \
4194 "name=len,type=CephInt",
4195 test_ops_hook,
4196 "truncate object to length");
4197 ceph_assert(r == 0);
4198
4199 r = admin_socket->register_command(
4200 "injectdataerr " \
4201 "name=pool,type=CephString " \
4202 "name=objname,type=CephObjectname " \
4203 "name=shardid,type=CephInt,req=false,range=0|255",
4204 test_ops_hook,
4205 "inject data error to an object");
4206 ceph_assert(r == 0);
4207
4208 r = admin_socket->register_command(
4209 "injectmdataerr " \
4210 "name=pool,type=CephString " \
4211 "name=objname,type=CephObjectname " \
4212 "name=shardid,type=CephInt,req=false,range=0|255",
4213 test_ops_hook,
4214 "inject metadata error to an object");
4215 ceph_assert(r == 0);
4216 r = admin_socket->register_command(
4217 "set_recovery_delay " \
4218 "name=utime,type=CephInt,req=false",
4219 test_ops_hook,
4220 "Delay osd recovery by specified seconds");
4221 ceph_assert(r == 0);
4222 r = admin_socket->register_command(
4223 "injectfull " \
4224 "name=type,type=CephString,req=false " \
4225 "name=count,type=CephInt,req=false ",
4226 test_ops_hook,
4227 "Inject a full disk (optional count times)");
4228 ceph_assert(r == 0);
4229 r = admin_socket->register_command(
4230 "bench " \
4231 "name=count,type=CephInt,req=false " \
4232 "name=size,type=CephInt,req=false " \
4233 "name=object_size,type=CephInt,req=false " \
4234 "name=object_num,type=CephInt,req=false ",
4235 asok_hook,
4236 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4237 "(default count=1G default size=4MB). Results in log.");
4238 ceph_assert(r == 0);
4239 r = admin_socket->register_command(
4240 "cluster_log " \
4241 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4242 "name=message,type=CephString,n=N",
4243 asok_hook,
4244 "log a message to the cluster log");
4245 ceph_assert(r == 0);
4246 r = admin_socket->register_command(
4247 "flush_pg_stats",
4248 asok_hook,
4249 "flush pg stats");
4250 ceph_assert(r == 0);
4251 r = admin_socket->register_command(
4252 "heap " \
4253 "name=heapcmd,type=CephChoices,strings=" \
4254 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4255 "name=value,type=CephString,req=false",
4256 asok_hook,
4257 "show heap usage info (available only if compiled with tcmalloc)");
4258 ceph_assert(r == 0);
4259 r = admin_socket->register_command(
4260 "debug dump_missing " \
4261 "name=filename,type=CephFilepath",
4262 asok_hook,
4263 "dump missing objects to a named file");
4264 ceph_assert(r == 0);
4265 r = admin_socket->register_command(
4266 "debug kick_recovery_wq " \
4267 "name=delay,type=CephInt,range=0",
4268 asok_hook,
4269 "set osd_recovery_delay_start to <val>");
4270 ceph_assert(r == 0);
4271 r = admin_socket->register_command(
4272 "cpu_profiler " \
4273 "name=arg,type=CephChoices,strings=status|flush",
4274 asok_hook,
4275 "run cpu profiling on daemon");
4276 ceph_assert(r == 0);
4277 r = admin_socket->register_command(
4278 "dump_pg_recovery_stats",
4279 asok_hook,
4280 "dump pg recovery statistics");
4281 ceph_assert(r == 0);
4282 r = admin_socket->register_command(
4283 "reset_pg_recovery_stats",
4284 asok_hook,
4285 "reset pg recovery statistics");
4286 ceph_assert(r == 0);
4287 r = admin_socket->register_command(
4288 "cache drop",
4289 asok_hook,
4290 "Drop all OSD caches");
4291 ceph_assert(r == 0);
4292 r = admin_socket->register_command(
4293 "cache status",
4294 asok_hook,
4295 "Get OSD caches statistics");
4296 ceph_assert(r == 0);
4297 r = admin_socket->register_command(
4298 "scrub_purged_snaps",
4299 asok_hook,
4300 "Scrub purged_snaps vs snapmapper index");
4301 ceph_assert(r == 0);
4302 r = admin_socket->register_command(
4303 "scrubdebug " \
4304 "name=pgid,type=CephPgid " \
4305 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4306 "name=value,type=CephString,req=false",
4307 asok_hook,
4308 "debug the scrubber");
4309 ceph_assert(r == 0);
4310
4311 // -- pg commands --
4312 // old form: ceph pg <pgid> command ...
4313 r = admin_socket->register_command(
4314 "pg " \
4315 "name=pgid,type=CephPgid " \
4316 "name=cmd,type=CephChoices,strings=query",
4317 asok_hook,
4318 "");
4319 ceph_assert(r == 0);
4320 r = admin_socket->register_command(
4321 "pg " \
4322 "name=pgid,type=CephPgid " \
4323 "name=cmd,type=CephChoices,strings=log",
4324 asok_hook,
4325 "");
4326 ceph_assert(r == 0);
4327 r = admin_socket->register_command(
4328 "pg " \
4329 "name=pgid,type=CephPgid " \
4330 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4331 "name=mulcmd,type=CephChoices,strings=revert|delete",
4332 asok_hook,
4333 "");
4334 ceph_assert(r == 0);
4335 r = admin_socket->register_command(
4336 "pg " \
4337 "name=pgid,type=CephPgid " \
4338 "name=cmd,type=CephChoices,strings=list_unfound " \
4339 "name=offset,type=CephString,req=false",
4340 asok_hook,
4341 "");
4342 ceph_assert(r == 0);
4343 r = admin_socket->register_command(
4344 "pg " \
4345 "name=pgid,type=CephPgid " \
4346 "name=cmd,type=CephChoices,strings=scrub " \
4347 "name=time,type=CephInt,req=false",
4348 asok_hook,
4349 "");
4350 ceph_assert(r == 0);
4351 r = admin_socket->register_command(
4352 "pg " \
4353 "name=pgid,type=CephPgid " \
4354 "name=cmd,type=CephChoices,strings=deep_scrub " \
4355 "name=time,type=CephInt,req=false",
4356 asok_hook,
4357 "");
4358 ceph_assert(r == 0);
4359 // new form: tell <pgid> <cmd> for both cli and rest
4360 r = admin_socket->register_command(
4361 "query",
4362 asok_hook,
4363 "show details of a specific pg");
4364 ceph_assert(r == 0);
4365 r = admin_socket->register_command(
4366 "log",
4367 asok_hook,
4368 "dump pg_log of a specific pg");
4369 ceph_assert(r == 0);
4370 r = admin_socket->register_command(
4371 "mark_unfound_lost " \
4372 "name=pgid,type=CephPgid,req=false " \
4373 "name=mulcmd,type=CephChoices,strings=revert|delete",
4374 asok_hook,
4375 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4376 ceph_assert(r == 0);
4377 r = admin_socket->register_command(
4378 "list_unfound " \
4379 "name=pgid,type=CephPgid,req=false " \
4380 "name=offset,type=CephString,req=false",
4381 asok_hook,
4382 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4383 ceph_assert(r == 0);
4384 r = admin_socket->register_command(
4385 "scrub " \
4386 "name=pgid,type=CephPgid,req=false " \
4387 "name=time,type=CephInt,req=false",
4388 asok_hook,
4389 "Trigger a scheduled scrub ");
4390 ceph_assert(r == 0);
4391 r = admin_socket->register_command(
4392 "deep_scrub " \
4393 "name=pgid,type=CephPgid,req=false " \
4394 "name=time,type=CephInt,req=false",
4395 asok_hook,
4396 "Trigger a scheduled deep scrub ");
4397 ceph_assert(r == 0);
4398 }
4399
4400 PerfCounters* OSD::create_logger()
4401 {
4402 PerfCounters* logger = build_osd_logger(cct);
4403 cct->get_perfcounters_collection()->add(logger);
4404 return logger;
4405 }
4406
4407 PerfCounters* OSD::create_recoverystate_perf()
4408 {
4409 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4410 cct->get_perfcounters_collection()->add(recoverystate_perf);
4411 return recoverystate_perf;
4412 }
4413
// Shut the OSD down.
//
// Two paths:
//  * fast shutdown (osd_fast_shutdown=true): stop the op queue, drain the
//    op threadpool, unmount the store, then _exit(0) without further
//    cleanup — the process never returns from this function on this path
//    (unless the store has a null freelist manager, in which case we
//    _exit(0) even earlier, before unmounting).
//  * graceful shutdown: tear everything down in dependency order
//    (PGs, hooks, heartbeats, timers, superblock, monc, store, messengers).
//
// NOTE(review): the teardown order below is deliberate and fragile;
// see e.g. https://tracker.ceph.com/issues/56101 for why mgrc is stopped
// before the store is unmounted.
int OSD::shutdown()
{
  // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
  //cct->_conf->osd_fast_shutdown = true;

  dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
	  << cct->_conf->osd_fast_shutdown
	  << ", null-fm = " << store->has_null_manager() << dendl;

  utime_t start_time_func = ceph_clock_now();

  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();

    // There is no state we need to keep when running in NULL-FM mode:
    // just flush the log and exit immediately, skipping even the umount.
    if (!store->has_null_manager()) {
      cct->_log->flush();
      _exit(0);
    }
  } else if (!service.prepare_to_stop()) {
    return 0; // already shutting down
  }

  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    return 0;
  }

  if (!cct->_conf->osd_fast_shutdown) {
    dout(0) << "shutdown" << dendl;
  }

  // don't accept new task for this OSD
  set_state(STATE_STOPPING);

  // Crank up debug levels for the rest of the (slow) shutdown, if
  // requested; debugging stays disabled during fast-shutdown.
  if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  //
  // should occur before unmounting the database in fast-shutdown to avoid
  // a race condition (see https://tracker.ceph.com/issues/56101)
  mgrc.shutdown();

  if (cct->_conf->osd_fast_shutdown) {
    // first, stop new task from being taken from op_shardedwq
    // and clear all pending tasks
    op_shardedwq.stop_for_fast_shutdown();

    utime_t start_time_timer = ceph_clock_now();
    tick_timer.shutdown();
    {
      std::lock_guard l(tick_timer_lock);
      tick_timer_without_osd_lock.shutdown();
    }

    osd_lock.unlock();
    utime_t start_time_osd_drain = ceph_clock_now();

    // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
    osd_op_tp.drain();
    osd_op_tp.stop();

    utime_t start_time_umount = ceph_clock_now();
    store->prepare_for_fast_shutdown();
    std::lock_guard lock(osd_lock);
    // TBD: assert in allocator that nothing is being add
    store->umount();

    utime_t end_time = ceph_clock_now();
    // enforce the configured upper bound on fast-shutdown duration
    if (cct->_conf->osd_fast_shutdown_timeout) {
      ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
    }
    dout(0) <<"Fast Shutdown duration total     :" << end_time              - start_time_func       << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount     - start_time_osd_drain  << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration umount    :" << end_time              - start_time_umount     << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration timer     :" << start_time_osd_drain  - start_time_timer      << " seconds" << dendl;
    cct->_log->flush();

    // now it is safe to exit
    _exit(0);
  }

  // ---- graceful (slow) shutdown path from here on ----

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // stop the heartbeat thread, then sever all heartbeat connections
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  // NOTE: r is returned from this function below; a superblock write
  // failure is logged but does not abort the rest of the teardown.
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      // at this point we expect to hold the last reference to each PG
      if (pg->get_num_ref() != 1) {
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may block on config callbacks; drop osd_lock around it
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // release all OSDMap references before the store goes away
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  store.reset();
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  utime_t duration = ceph_clock_now() - start_time_func;
  dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;


  // propagate the superblock-write result (0 on success)
  return r;
}
4667
4668 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4669 {
4670 bool created = false;
4671 while (true) {
4672 dout(10) << __func__ << " cmd: " << cmd << dendl;
4673 vector<string> vcmd{cmd};
4674 bufferlist inbl;
4675 C_SaferCond w;
4676 string outs;
4677 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4678 int r = w.wait();
4679 if (r < 0) {
4680 if (r == -ENOENT && !created) {
4681 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4682 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4683 vector<string> vnewcmd{newcmd};
4684 bufferlist inbl;
4685 C_SaferCond w;
4686 string outs;
4687 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4688 int r = w.wait();
4689 if (r < 0) {
4690 derr << __func__ << " fail: osd does not exist and created failed: "
4691 << cpp_strerror(r) << dendl;
4692 return r;
4693 }
4694 created = true;
4695 continue;
4696 }
4697 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4698 return r;
4699 }
4700 break;
4701 }
4702
4703 return 0;
4704 }
4705
4706 int OSD::update_crush_location()
4707 {
4708 if (!cct->_conf->osd_crush_update_on_start) {
4709 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4710 return 0;
4711 }
4712
4713 char weight[32];
4714 if (cct->_conf->osd_crush_initial_weight >= 0) {
4715 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4716 } else {
4717 struct store_statfs_t st;
4718 osd_alert_list_t alerts;
4719 int r = store->statfs(&st, &alerts);
4720 if (r < 0) {
4721 derr << "statfs: " << cpp_strerror(r) << dendl;
4722 return r;
4723 }
4724 snprintf(weight, sizeof(weight), "%.4lf",
4725 std::max(.00001,
4726 double(st.total) /
4727 double(1ull << 40 /* TB */)));
4728 }
4729
4730 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4731
4732 string cmd =
4733 string("{\"prefix\": \"osd crush create-or-move\", ") +
4734 string("\"id\": ") + stringify(whoami) + ", " +
4735 string("\"weight\":") + weight + ", " +
4736 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4737 return mon_cmd_maybe_osd_create(cmd);
4738 }
4739
4740 int OSD::update_crush_device_class()
4741 {
4742 if (!cct->_conf->osd_class_update_on_start) {
4743 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4744 return 0;
4745 }
4746
4747 string device_class;
4748 int r = store->read_meta("crush_device_class", &device_class);
4749 if (r < 0 || device_class.empty()) {
4750 device_class = store->get_default_device_class();
4751 }
4752
4753 if (device_class.empty()) {
4754 dout(20) << __func__ << " no device class stored locally" << dendl;
4755 return 0;
4756 }
4757
4758 string cmd =
4759 string("{\"prefix\": \"osd crush set-device-class\", ") +
4760 string("\"class\": \"") + device_class + string("\", ") +
4761 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4762
4763 r = mon_cmd_maybe_osd_create(cmd);
4764 if (r == -EBUSY) {
4765 // good, already bound to a device-class
4766 return 0;
4767 } else {
4768 return r;
4769 }
4770 }
4771
4772 void OSD::write_superblock(ObjectStore::Transaction& t)
4773 {
4774 dout(10) << "write_superblock " << superblock << dendl;
4775
4776 //hack: at minimum it's using the baseline feature set
4777 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4778 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4779
4780 bufferlist bl;
4781 encode(superblock, bl);
4782 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4783 }
4784
4785 int OSD::read_superblock()
4786 {
4787 bufferlist bl;
4788 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4789 if (r < 0)
4790 return r;
4791
4792 auto p = bl.cbegin();
4793 decode(superblock, p);
4794
4795 dout(10) << "read_superblock " << superblock << dendl;
4796
4797 return 0;
4798 }
4799
4800 void OSD::clear_temp_objects()
4801 {
4802 dout(10) << __func__ << dendl;
4803 vector<coll_t> ls;
4804 store->list_collections(ls);
4805 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4806 spg_t pgid;
4807 if (!p->is_pg(&pgid))
4808 continue;
4809
4810 // list temp objects
4811 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4812
4813 vector<ghobject_t> temps;
4814 ghobject_t next;
4815 while (1) {
4816 vector<ghobject_t> objects;
4817 auto ch = store->open_collection(*p);
4818 ceph_assert(ch);
4819 store->collection_list(ch, next, ghobject_t::get_max(),
4820 store->get_ideal_list_max(),
4821 &objects, &next);
4822 if (objects.empty())
4823 break;
4824 vector<ghobject_t>::iterator q;
4825 for (q = objects.begin(); q != objects.end(); ++q) {
4826 // Hammer set pool for temps to -1, so check for clean-up
4827 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4828 temps.push_back(*q);
4829 } else {
4830 break;
4831 }
4832 }
4833 // If we saw a non-temp object and hit the break above we can
4834 // break out of the while loop too.
4835 if (q != objects.end())
4836 break;
4837 }
4838 if (!temps.empty()) {
4839 ObjectStore::Transaction t;
4840 int removed = 0;
4841 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4842 dout(20) << " removing " << *p << " object " << *q << dendl;
4843 t.remove(*p, *q);
4844 if (++removed > cct->_conf->osd_target_transaction_size) {
4845 store->queue_transaction(service.meta_ch, std::move(t));
4846 t = ObjectStore::Transaction();
4847 removed = 0;
4848 }
4849 }
4850 if (removed) {
4851 store->queue_transaction(service.meta_ch, std::move(t));
4852 }
4853 }
4854 }
4855 }
4856
// Delete every object in collection tmp (belonging to pg pgid) and then the
// collection itself, also purging each object's entry from the snap mapper.
// Work is batched into transactions of osd_target_transaction_size objects;
// the function blocks until the final transaction has committed.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // driver targeting the global snapmapper object in the meta collection
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // list the next batch of up to max objects, resuming at 'next'
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the snap-mapper entry first, then the object itself;
      // -ENOENT is fine (object had no snap mapping)
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // all objects gone; remove the now-empty collection
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the final transaction to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4901
4902
4903 // ======================================================
4904 // PG's
4905
// Construct an (unregistered) PG object for pgid using pool metadata from
// createmap, or — if the pool has since been deleted — from the "final pool
// info" tombstone object stored in the meta collection.  Returns nullptr if
// the tombstone is missing or predates the ec_profile field.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    // pool still exists in this map: read its info directly
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
	   << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    // tombstone layout: pg_pool_t, pool name, then (since v13.0.2) ec_profile
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
	   << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(createmap, pgid.pool(), pi, name);
  PG *pg;
  // both replicated and EC pools are backed by PrimaryLogPG
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4950
4951 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4952 {
4953 v->clear();
4954 v->reserve(get_num_pgs());
4955 for (auto& s : shards) {
4956 std::lock_guard l(s->shard_lock);
4957 for (auto& j : s->pg_slots) {
4958 if (j.second->pg &&
4959 !j.second->pg->is_deleted()) {
4960 v->push_back(j.second->pg);
4961 if (clear_too) {
4962 s->_detach_pg(j.second.get());
4963 }
4964 }
4965 }
4966 }
4967 }
4968
4969 void OSD::_get_pgids(vector<spg_t> *v)
4970 {
4971 v->clear();
4972 v->reserve(get_num_pgs());
4973 for (auto& s : shards) {
4974 std::lock_guard l(s->shard_lock);
4975 for (auto& j : s->pg_slots) {
4976 if (j.second->pg &&
4977 !j.second->pg->is_deleted()) {
4978 v->push_back(j.first);
4979 }
4980 }
4981 }
4982 }
4983
4984 void OSD::register_pg(PGRef pg)
4985 {
4986 spg_t pgid = pg->get_pgid();
4987 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4988 auto sdata = shards[shard_index];
4989 std::lock_guard l(sdata->shard_lock);
4990 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4991 ceph_assert(r.second);
4992 auto *slot = r.first->second.get();
4993 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4994 sdata->_attach_pg(slot, pg.get());
4995 }
4996
// Final step of PG deletion: detach the PG from its shard slot, unprime any
// pending split children, and decrement the matching pg-count perf counter.
// Returns false (deletion must be retried later) if the slot is already
// gone or the PG is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // make sure no shard still expects this PG to split
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // counter name is a misnomer: covers all non-primary
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
5031
5032 PGRef OSD::_lookup_pg(spg_t pgid)
5033 {
5034 uint32_t shard_index = pgid.hash_to_shard(num_shards);
5035 auto sdata = shards[shard_index];
5036 std::lock_guard l(sdata->shard_lock);
5037 auto p = sdata->pg_slots.find(pgid);
5038 if (p == sdata->pg_slots.end()) {
5039 return nullptr;
5040 }
5041 return p->second->pg;
5042 }
5043
5044 PGRef OSD::_lookup_lock_pg(spg_t pgid)
5045 {
5046 PGRef pg = _lookup_pg(pgid);
5047 if (!pg) {
5048 return nullptr;
5049 }
5050 pg->lock();
5051 if (!pg->is_deleted()) {
5052 return pg;
5053 }
5054 pg->unlock();
5055 return nullptr;
5056 }
5057
// Public wrapper around _lookup_lock_pg(): returns the PG with its lock
// held, or null if it does not exist or is deleted.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
5062
// Scan the ObjectStore at startup and register every surviving PG with
// its shard.  Caller must hold osd_lock.
//
// Temp collections and PGs flagged for removal are deleted outright;
// unrecognized collections are ignored; everything else is instantiated
// via _make_pg() against the map recorded in its metadata, has its state
// read from disk, and is registered (unless it turns out not to exist,
// in which case its collection is removed).
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  {
    // recover the pg_num change history; needed to reason about splits
    // and merges that happened while this OSD was down
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;  // count of PGs successfully opened
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    if (it->is_temp(&pgid) ||
        (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
      dout(10) << "load_pgs " << *it
               << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
           << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      // instantiate against the map the PG was last written with
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
        if (!get_osdmap()->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617. "
               << "Skipping the pg for now, you can use ceph-objectstore-tool "
               << "to clean it up later." << dendl;
          continue;
        } else {
          derr << __func__ << ": have pgid " << pgid << " at epoch "
               << map_epoch << ", but missing map. Crashing."
               << dendl;
          ceph_abort_msg("Missing map in load_pgs");
        }
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store.get());

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }
    {
      // route this collection's on-commit contexts to the owning shard
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
5168
5169
5170 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5171 const PGCreateInfo *info)
5172 {
5173 spg_t pgid = info->pgid;
5174
5175 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5176 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5177 return nullptr;
5178 }
5179
5180 OSDMapRef startmap = get_map(info->epoch);
5181
5182 if (info->by_mon) {
5183 int64_t pool_id = pgid.pgid.pool();
5184 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5185 if (!pool) {
5186 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5187 return nullptr;
5188 }
5189 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
5190 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5191 // this ensures we do not process old creating messages after the
5192 // pool's initial pgs have been created (and pg are subsequently
5193 // allowed to split or merge).
5194 dout(20) << __func__ << " dropping " << pgid
5195 << "create, pool does not have CREATING flag set" << dendl;
5196 return nullptr;
5197 }
5198 }
5199
5200 int up_primary, acting_primary;
5201 vector<int> up, acting;
5202 startmap->pg_to_up_acting_osds(
5203 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5204
5205 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5206 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5207 store->get_type() != "bluestore") {
5208 clog->warn() << "pg " << pgid
5209 << " is at risk of silent data corruption: "
5210 << "the pool allows ec overwrites but is not stored in "
5211 << "bluestore, so deep scrubbing will not detect bitrot";
5212 }
5213 PeeringCtx rctx;
5214 create_pg_collection(
5215 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5216 init_pg_ondisk(rctx.transaction, pgid, pp);
5217
5218 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
5219
5220 PGRef pg = _make_pg(startmap, pgid);
5221 pg->ch = store->create_new_collection(pg->coll);
5222
5223 {
5224 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5225 assert(NULL != shards[shard_index]);
5226 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5227 }
5228
5229 pg->lock(true);
5230
5231 // we are holding the shard lock
5232 ceph_assert(!pg->is_deleted());
5233
5234 pg->init(
5235 role,
5236 up,
5237 up_primary,
5238 acting,
5239 acting_primary,
5240 info->history,
5241 info->past_intervals,
5242 rctx.transaction);
5243
5244 pg->init_collection_pool_opts();
5245
5246 if (pg->is_primary()) {
5247 std::lock_guard locker{m_perf_queries_lock};
5248 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5249 }
5250
5251 pg->handle_initialize(rctx);
5252 pg->handle_activate_map(rctx);
5253
5254 dispatch_context(rctx, pg.get(), osdmap, nullptr);
5255
5256 dout(10) << __func__ << " new pg " << *pg << dendl;
5257 return pg;
5258 }
5259
// Check whether instantiating pgid would push this OSD past its hard
// per-OSD PG limit (mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio).
//
// Returns false if there is room.  Otherwise records the deferred
// create — bumping the count of pending mon-initiated creates, or
// remembering the pg (and whether we would be its primary) in
// pending_creates_from_osd — so resume_creating_pg() can retry later,
// and returns true.
bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
                                spg_t pgid,
                                bool is_mon_create)
{
  const auto max_pgs_per_osd =
    (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
     cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));

  if (num_pgs < max_pgs_per_osd) {
    return false;
  }

  std::lock_guard l(pending_creates_lock);
  if (is_mon_create) {
    pending_creates_from_mon++;
  } else {
    bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
    pending_creates_from_osd.emplace(pgid, is_primary);
  }
  dout(1) << __func__ << " withhold creation of pg " << pgid
          << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
  return true;
}
5283
// To re-trigger peering we must perturb the pg mapping a little; see
// PG::should_restart_peering().  OSDMap::pg_to_up_acting_osds() falls
// back to the up set when pg_temp is empty, so an empty pg_temp won't
// work: either shrink a multi-OSD acting set down to its first member,
// or pad a short one with a trailing -1.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    twiddled.push_back(acting[0]);
  } else {
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
5296
// Retry PG creations that maybe_wait_for_max_pg() withheld, now that
// PG count may have dropped below the hard limit.
//
// Spends any "spare" PG capacity first on mon-initiated creates (by
// re-soliciting pg-create messages from the monitor) and then on
// osd-initiated creates (by twiddling pg_temp to force re-peering).
// Adjusts osdmap subscriptions so we keep receiving maps while creates
// remain pending, and finally flushes queued pg_temp requests.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
               << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
        spare_pgs = pending_creates_from_mon = 0;
      } else {
        spare_pgs -= pending_creates_from_mon;
        pending_creates_from_mon = 0;
      }
    }
    // spend remaining capacity on osd-initiated creates: twiddle
    // pg_temp so the mapping change re-triggers peering for each pg
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
                            !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
              << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
              << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
              << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
5368
// Ensure osd.p is in our heartbeat peer set: create back and front
// heartbeat connections and sessions if it is new, and refresh the
// entry's map epoch either way.
//
// No-op for ourselves, or when no heartbeat connection can be obtained
// from the current map.
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    // new peer: establish back and front heartbeat connections
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
    if (!cons.first)
      return;
    assert(cons.second);

    hi = &heartbeat_peers[p];
    hi->peer = p;

    // both sessions share the same clock-delta stamp tracker for p
    auto stamps = service.get_hb_stamps(p);

    auto sb = ceph::make_ref<Session>(cct, cons.first.get());
    sb->peer = p;
    sb->stamps = stamps;
    hi->hb_interval_start = ceph_clock_now();
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(sb);

    auto sf = ceph::make_ref<Session>(cct, cons.second.get());
    sf->peer = p;
    sf->stamps = stamps;
    hi->con_front = cons.second.get();
    hi->con_front->set_priv(sf);

    dout(10) << "_add_heartbeat_peer: new peer osd." << p
             << " " << hi->con_back->get_peer_addr()
             << " " << hi->con_front->get_peer_addr()
             << dendl;
  } else {
    hi = &i->second;
  }
  // refresh epoch so maybe_update_heartbeat_peers() does not treat this
  // peer as a stale extra
  hi->epoch = get_osdmap_epoch();
}
5409
5410 void OSD::_remove_heartbeat_peer(int n)
5411 {
5412 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5413 ceph_assert(q != heartbeat_peers.end());
5414 dout(20) << " removing heartbeat peer osd." << n
5415 << " " << q->second.con_back->get_peer_addr()
5416 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5417 << dendl;
5418 q->second.clear_mark_down();
5419 heartbeat_peers.erase(q);
5420 }
5421
// Called when the desired heartbeat peer set may have changed; flags
// the set for a rebuild by maybe_update_heartbeat_peers().  Ignored
// while shutting down.
void OSD::need_heartbeat_peer_update()
{
  if (is_stopping())
    return;
  dout(20) << "need_heartbeat_peer_update" << dendl;
  heartbeat_set_peers_need_update();
}
5429
// Rebuild the heartbeat peer set when flagged (or force a periodic
// resample).  Caller must hold osd_lock.
//
// Peers come from three sources: OSDs sharing a PG with us, our
// immediate up-set neighbors, and enough random up OSDs from distinct
// failure-domain subtrees to satisfy mon_osd_min_down_reporters.  Down
// peers are removed, the set is padded/trimmed toward
// osd_heartbeat_min_peers, and failure reports for peers we no longer
// track are cancelled.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        // automatically clean up any stale heartbeat peers
        // if we are unhealthy, then clean all
        reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
        if (get_osdmap()->is_up(peer)) {
          _add_heartbeat_peer(peer);
        }
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // epoch not refreshed this round: candidate for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5547
// Drop heartbeat peers — every peer when 'all' is set, otherwise only
// those gone stale (no traffic within osd_heartbeat_stale seconds) —
// and clear any queued/pending failure reports for the removed peers.
// Caller must hold osd_lock.
void OSD::reset_heartbeat_peers(bool all)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "reset_heartbeat_peers" << dendl;
  utime_t stale = ceph_clock_now();
  stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  std::lock_guard l(heartbeat_lock);
  for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
    auto& [peer, hi] = *it;
    if (all || hi.is_stale(stale)) {
      hi.clear_mark_down();
      // stop sending failure_report to mon too
      failure_queue.erase(peer);
      failure_pending.erase(peer);
      it = heartbeat_peers.erase(it);
    } else {
      ++it;
    }
  }
}
5568
// Handle an incoming message on the heartbeat messengers.  Consumes m.
//
// PING: record the sender's clock-delta stamps and answer with a
//   PING_REPLY (or YOU_DIED if our map says the sender is down/gone).
// PING_REPLY: update per-peer rx timestamps and ping-time statistics,
//   trim acknowledged entries from ping_history, and cancel any failure
//   report queued/sent for a peer that now looks healthy.
// YOU_DIED: a peer says we are marked down; subscribe to a newer map.
//
// Drops the message early on fsid mismatch, while stopping, or when no
// osdmap/session is available.
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
             << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
             << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    // first traffic on this inbound connection; hook up stamp tracking
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: optionally drop a run of pings from this peer
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
        auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
        if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
          if (heartbeat_drop->second == 0) {
            debug_heartbeat_drops_remaining.erase(heartbeat_drop);
          } else {
            --heartbeat_drop->second;
            dout(5) << "Dropping heartbeat from " << from
                    << ", " << heartbeat_drop->second
                    << " remaining to drop" << dendl;
            break;
          }
        } else if (cct->_conf->osd_debug_drop_ping_probability >
                   ((((double)(rand()%100))/100.0))) {
          heartbeat_drop =
            debug_heartbeat_drops_remaining.insert(std::make_pair(from,
                             cct->_conf->osd_debug_drop_ping_duration)).first;
          dout(5) << "Dropping heartbeat from " << from
                  << ", " << heartbeat_drop->second
                  << " remaining to drop" << dendl;
          break;
        }
      }

      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
        m->up_from,
        mnow,
        m->mono_send_stamp,
        m->delta_ub,
        &sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      if (!cct->get_heartbeat_map()->is_healthy()) {
        // don't vouch for ourselves while our own internal heartbeats fail
        dout(10) << "internal heartbeat not healthy, dropping ping request"
                 << dendl;
        break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
                                MOSDPing::PING_REPLY,
                                m->ping_stamp,
                                m->mono_ping_stamp,
                                mnow,
                                service.get_up_epoch(),
                                cct->_conf->osd_heartbeat_min_size,
                                sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
        if (is_active()) {
          ConnectionRef cluster_con = service.get_con_osd_cluster(
            from, curmap->get_epoch());
          if (cluster_con) {
            service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
          }
        }
      } else if (!curmap->exists(from) ||
                 curmap->get_down_at(from) > m->map_epoch) {
        // tell them they have died
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
                                  m->ping_stamp,
                                  m->mono_ping_stamp,
                                  mnow,
                                  service.get_up_epoch(),
                                  cct->_conf->osd_heartbeat_min_size);
        con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
        auto acked = i->second.ping_history.find(m->ping_stamp);
        if (acked != i->second.ping_history.end()) {
          // replies (front+back) still outstanding for this ping
          int &unacknowledged = acked->second.second;
          if (con == i->second.con_back) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " -> " << now
                     << " last_rx_front " << i->second.last_rx_front
                     << dendl;
            i->second.last_rx_back = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
            // if there is no front con, set both stamps.
            if (i->second.con_front == NULL) {
              i->second.last_rx_front = now;
              ceph_assert(unacknowledged > 0);
              --unacknowledged;
            }
          } else if (con == i->second.con_front) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " last_rx_front " << i->second.last_rx_front
                     << " -> " << now
                     << dendl;
            i->second.last_rx_front = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
          }

          if (unacknowledged == 0) {
            // succeeded in getting all replies
            dout(25) << "handle_osd_ping got all replies from osd." << from
                     << " , erase pending ping(sent at " << m->ping_stamp << ")"
                     << " and older pending ping(s)"
                     << dendl;

            // accumulate min/max/total ping times for the current
            // averaging interval
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
            ++i->second.hb_average_count;
            uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
            i->second.hb_total_back += back_pingtime;
            if (back_pingtime < i->second.hb_min_back)
              i->second.hb_min_back = back_pingtime;
            if (back_pingtime > i->second.hb_max_back)
              i->second.hb_max_back = back_pingtime;
            uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
            i->second.hb_total_front += front_pingtime;
            if (front_pingtime < i->second.hb_min_front)
              i->second.hb_min_front = front_pingtime;
            if (front_pingtime > i->second.hb_max_front)
              i->second.hb_max_front = front_pingtime;

            // NOTE(review): the 'if' below is dead whenever asserts are
            // compiled in — the assert guarantees its condition is false.
            ceph_assert(i->second.hb_interval_start != utime_t());
            if (i->second.hb_interval_start == utime_t())
              i->second.hb_interval_start = now;
            int64_t hb_avg_time_period = 60;
            if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
              hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
            }
            if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
              uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
              uint32_t back_min = i->second.hb_min_back;
              uint32_t back_max = i->second.hb_max_back;
              uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
              uint32_t front_min = i->second.hb_min_front;
              uint32_t front_max = i->second.hb_max_front;

              // Reset for new interval
              i->second.hb_average_count = 0;
              i->second.hb_interval_start = now;
              i->second.hb_total_back = i->second.hb_max_back = 0;
              i->second.hb_min_back = UINT_MAX;
              i->second.hb_total_front = i->second.hb_max_front = 0;
              i->second.hb_min_front = UINT_MAX;

              // Record per osd interface ping times
              // Based on osd_heartbeat_interval ignoring that it is
              // randomly shorter than this interval
              if (i->second.hb_back_pingtime.size() == 0) {
                // first completed interval: seed the ring buffers
                ceph_assert(i->second.hb_front_pingtime.size() == 0);
                for (unsigned k = 0 ; k < hb_vector_size; ++k) {
                  i->second.hb_back_pingtime.push_back(back_avg);
                  i->second.hb_back_min.push_back(back_min);
                  i->second.hb_back_max.push_back(back_max);
                  i->second.hb_front_pingtime.push_back(front_avg);
                  i->second.hb_front_min.push_back(front_min);
                  i->second.hb_front_max.push_back(front_max);
                  ++i->second.hb_index;
                }
              } else {
                int index = i->second.hb_index & (hb_vector_size - 1);
                i->second.hb_back_pingtime[index] = back_avg;
                i->second.hb_back_min[index] = back_min;
                i->second.hb_back_max[index] = back_max;
                i->second.hb_front_pingtime[index] = front_avg;
                i->second.hb_front_min[index] = front_min;
                i->second.hb_front_max[index] = front_max;
                ++i->second.hb_index;
              }

              {
                // publish 1/5/15-interval aggregates into osd_stat
                std::lock_guard l(service.stat_lock);
                service.osd_stat.hb_pingtime[from].last_update = now.sec();
                service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

                uint32_t total = 0;
                uint32_t min = UINT_MAX;
                uint32_t max = 0;
                uint32_t count = 0;
                uint32_t which = 0;
                uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
                for (int32_t k = size - 1 ; k >= 0; --k) {
                  ++count;
                  int index = (i->second.hb_index + k) % size;
                  total += i->second.hb_back_pingtime[index];
                  if (i->second.hb_back_min[index] < min)
                    min = i->second.hb_back_min[index];
                  if (i->second.hb_back_max[index] > max)
                    max = i->second.hb_back_max[index];
                  if (count == 1 || count == 5 || count == 15) {
                    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
                    service.osd_stat.hb_pingtime[from].back_min[which] = min;
                    service.osd_stat.hb_pingtime[from].back_max[which] = max;
                    which++;
                    if (count == 15)
                      break;
                  }
                }

                if (i->second.con_front != NULL) {
                  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

                  total = 0;
                  min = UINT_MAX;
                  max = 0;
                  count = 0;
                  which = 0;
                  for (int32_t k = size - 1 ; k >= 0; --k) {
                    ++count;
                    int index = (i->second.hb_index + k) % size;
                    total += i->second.hb_front_pingtime[index];
                    if (i->second.hb_front_min[index] < min)
                      min = i->second.hb_front_min[index];
                    if (i->second.hb_front_max[index] > max)
                      max = i->second.hb_front_max[index];
                    if (count == 1 || count == 5 || count == 15) {
                      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
                      service.osd_stat.hb_pingtime[from].front_min[which] = min;
                      service.osd_stat.hb_pingtime[from].front_max[which] = max;
                      which++;
                      if (count == 15)
                        break;
                    }
                  }
                }
              }
            } else {
              // interval not elapsed: just publish the latest samples
              std::lock_guard l(service.stat_lock);
              service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
              if (i->second.con_front != NULL)
                service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
            }
            // this ping and every older one is now fully acknowledged
            i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
          }

          if (i->second.is_healthy(now)) {
            // Cancel false reports
            auto failure_queue_entry = failure_queue.find(from);
            if (failure_queue_entry != failure_queue.end()) {
              dout(10) << "handle_osd_ping canceling queued "
                       << "failure report for osd." << from << dendl;
              failure_queue.erase(failure_queue_entry);
            }

            auto failure_pending_entry = failure_pending.find(from);
            if (failure_pending_entry != failure_pending.end()) {
              dout(10) << "handle_osd_ping canceling in-flight "
                       << "failure report for osd." << from << dendl;
              send_still_alive(curmap->get_epoch(),
                               from,
                               failure_pending_entry->second.second);
              failure_pending.erase(failure_pending_entry);
            }
          }
        } else {
          // old replies, deprecated by newly sent pings.
          dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
                   << ") is found, treat as covered by newly sent pings "
                   << "and ignore"
                   << dendl;
        }
      }

      if (m->map_epoch &&
          curmap->is_up(from)) {
        if (is_active()) {
          ConnectionRef cluster_con = service.get_con_osd_cluster(
            from, curmap->get_epoch());
          if (cluster_con) {
            service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
          }
        }
      }

      s->stamps->got_ping_reply(
        mnow,
        m->mono_send_stamp,
        m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    dout(10) << "handle_osd_ping " << m->get_source_inst()
             << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5916
// Body of the heartbeat thread: send a round of pings, then sleep for
// roughly osd_heartbeat_interval — randomized so OSDs do not synchronize,
// unless debug_disable_randomized_ping is set — until asked to stop.
void OSD::heartbeat_entry()
{
  std::unique_lock l(heartbeat_lock);
  if (is_stopping())
    return;
  while (!heartbeat_stop) {
    heartbeat();

    double wait;
    if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
      wait = (float)cct->_conf->osd_heartbeat_interval;
    } else {
      wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
    }
    auto w = ceph::make_timespan(wait);
    dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
    // releases heartbeat_lock while sleeping; wakes early on notify
    heartbeat_cond.wait_for(l, w);
    if (is_stopping())
      return;
    dout(30) << "heartbeat_entry woke up" << dendl;
  }
}
5939
// Scan heartbeat peers for overdue ping replies and queue a failure
// report for each unhealthy peer.  Caller must hold heartbeat_lock.
//
// The failure time recorded in failure_queue is the first ping time if
// we never heard from the peer at all, otherwise the older of its last
// back/front reply times.
void OSD::heartbeat_check()
{
  ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
  utime_t now = ceph_clock_now();

  // check for incoming heartbeats (move me elsewhere?)
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p) {

    if (p->second.first_tx == utime_t()) {
      dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
               << " yet, skipping" << dendl;
      continue;
    }

    dout(25) << "heartbeat_check osd." << p->first
             << " first_tx " << p->second.first_tx
             << " last_tx " << p->second.last_tx
             << " last_rx_back " << p->second.last_rx_back
             << " last_rx_front " << p->second.last_rx_front
             << dendl;
    if (p->second.is_unhealthy(now)) {
      // NOTE(review): the log lines below dereference con_front without
      // a null check; _add_heartbeat_peer() asserts a front con when
      // creating entries, so this looks safe — confirm no path leaves
      // con_front null while the peer remains tracked.
      utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
      if (p->second.last_rx_back == utime_t() ||
          p->second.last_rx_front == utime_t()) {
        derr << "heartbeat_check: no reply from "
             << p->second.con_front->get_peer_addr().get_sockaddr()
             << " osd." << p->first
             << " ever on either front or back, first ping sent "
             << p->second.first_tx
             << " (oldest deadline " << oldest_deadline << ")"
             << dendl;
        // fail
        failure_queue[p->first] = p->second.first_tx;
      } else {
        derr << "heartbeat_check: no reply from "
             << p->second.con_front->get_peer_addr().get_sockaddr()
             << " osd." << p->first << " since back " << p->second.last_rx_back
             << " front " << p->second.last_rx_front
             << " (oldest deadline " << oldest_deadline << ")"
             << dendl;
        // fail
        failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
      }
    }
  }
}
5988
// Send one round of pings to all heartbeat peers.  Caller must hold
// heartbeat_lock (normally invoked from heartbeat_entry()).
//
// Also refreshes the published osd_stat (peer list, fullness ratios),
// records each ping in ping_history with its ack deadline, and — when
// we have no peers at all — periodically asks the monitor for a newer
// map so an isolated OSD does not get stuck.
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  auto load_for_logger = service.get_scrub_services().update_load_average();
  if (load_for_logger) {
    logger->set(l_osd_loadavg, load_for_logger.value());
  }
  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect HEARTBEAT_MAX_CONN acks (front + back) before the deadline
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
                   service.get_osdmap_epoch(),
                   MOSDPing::PING,
                   now,
                   mnow,
                   mnow,
                   service.get_up_epoch(),
                   cct->_conf->osd_heartbeat_min_size,
                   delta_ub));

    if (i->second.con_front)
      i->second.con_front->send_message(
        new MOSDPing(monc->get_fsid(),
                     service.get_osdmap_epoch(),
                     MOSDPing::PING,
                     now,
                     mnow,
                     mnow,
                     service.get_up_epoch(),
                     cct->_conf->osd_heartbeat_min_size,
                     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
6082
// Messenger callback: a heartbeat connection was reset.  If the
// connection still belongs to a known heartbeat peer, reopen fresh
// front/back connections for that peer (or drop the peer entirely if
// it has raced out of the osdmap); otherwise just discard the stale
// session reference.  Always returns true (event handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      // shutting down; no point reopening anything
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    if (p != heartbeat_peers.end() &&
        (p->second.con_back == con ||
         p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
               << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      // try to establish a new (back, front) connection pair
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
        p->second.con_back = newcon.first.get();
        p->second.con_back->set_priv(s);
        if (newcon.second) {
          p->second.con_front = newcon.second.get();
          p->second.con_front->set_priv(s);
        }
        // pings outstanding on the old connections can never complete
        p->second.ping_history.clear();
      } else {
        // peer no longer reachable per the current osdmap; forget it
        dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
                 << ", raced with osdmap update, closing out peer" << dendl;
        heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
6121
6122
6123
6124 // =========================================
6125
// Main periodic tick, run with osd_lock held: trims the markdown log,
// refreshes heartbeat peers, retries boot while waiting to become
// healthy, polls the mon for new maps when booting, and kicks off a
// purged_snaps scrub when one is due.  Re-arms itself via tick_timer.
void OSD::tick()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "tick" << dendl;

  utime_t now = ceph_clock_now();
  // throw out any obsolete markdown log
  utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
  while (!osd_markdown_log.empty() &&
         osd_markdown_log.front() + grace < now)
    osd_markdown_log.pop_front();

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    // re-evaluate health and retry the boot sequence
    start_boot();
  }

  if (is_waiting_for_healthy() || is_booting()) {
    std::lock_guard l(heartbeat_lock);
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
      last_mon_heartbeat = now;
      dout(1) << __func__ << " checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  // scrub purged_snaps every deep scrub interval
  {
    const utime_t last = superblock.last_purged_snaps_scrub;
    utime_t next = last;
    next += cct->_conf->osd_scrub_min_interval;
    std::mt19937 rng;
    // use a seed that is stable for each scrub interval, but varies
    // by OSD to avoid any herds.
    rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
    // random jitter in [0, 1) scaled by the randomize ratio
    double r = (rng() % 1024) / 1024.0;
    next +=
      cct->_conf->osd_scrub_min_interval *
      cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (next < ceph_clock_now()) {
      dout(20) << __func__ << " last_purged_snaps_scrub " << last
               << " next " << next << " ... now" << dendl;
      scrub_purged_snaps();
    } else {
      dout(20) << __func__ << " last_purged_snaps_scrub " << last
               << " next " << next << dendl;
    }
  }

  // re-arm for the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
6180
// Periodic tick that deliberately avoids osd_lock: updates CRC-cache
// perf counters, refreshes store statfs, runs heartbeat checks and
// mon reports, requests newer maps when shards are waiting on them,
// schedules scrubs, and sends the OSD beacon when due.  Re-arms
// itself via tick_timer_without_osd_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
        now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard is queued waiting for a map newer than what we
    // have, ask the mon for it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
                                   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
               << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
        cct->_conf->osd_beacon_report_interval) {
        need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // re-arm for the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                              new C_Tick_WithoutOSDLock(this));
}
6256
6257 // Usage:
6258 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6259 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6260 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6261 // getomap <pool> [namespace/]<obj-name>
6262 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6263 // injectmdataerr [namespace/]<obj-name> [shardid]
6264 // injectdataerr [namespace/]<obj-name> [shardid]
6265 //
6266 // set_recovery_delay [utime]
6267 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6268 std::string_view command,
6269 const cmdmap_t& cmdmap, ostream &ss)
6270 {
6271 //Test support
6272 //Support changing the omap on a single osd by using the Admin Socket to
6273 //directly request the osd make a change.
6274 if (command == "setomapval" || command == "rmomapkey" ||
6275 command == "setomapheader" || command == "getomap" ||
6276 command == "truncobj" || command == "injectmdataerr" ||
6277 command == "injectdataerr"
6278 ) {
6279 pg_t rawpg;
6280 int64_t pool;
6281 OSDMapRef curmap = service->get_osdmap();
6282 int r = -1;
6283
6284 string poolstr;
6285
6286 cmd_getval(cmdmap, "pool", poolstr);
6287 pool = curmap->lookup_pg_pool_name(poolstr);
6288 //If we can't find it by name then maybe id specified
6289 if (pool < 0 && isdigit(poolstr[0]))
6290 pool = atoll(poolstr.c_str());
6291 if (pool < 0) {
6292 ss << "Invalid pool '" << poolstr << "''";
6293 return;
6294 }
6295
6296 string objname, nspace;
6297 cmd_getval(cmdmap, "objname", objname);
6298 std::size_t found = objname.find_first_of('/');
6299 if (found != string::npos) {
6300 nspace = objname.substr(0, found);
6301 objname = objname.substr(found+1);
6302 }
6303 object_locator_t oloc(pool, nspace);
6304 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6305
6306 if (r < 0) {
6307 ss << "Invalid namespace/objname";
6308 return;
6309 }
6310
6311 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
6312 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6313 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6314 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6315 if (curmap->pg_is_ec(rawpg)) {
6316 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6317 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6318 return;
6319 }
6320 }
6321
6322 ObjectStore::Transaction t;
6323
6324 if (command == "setomapval") {
6325 map<string, bufferlist> newattrs;
6326 bufferlist val;
6327 string key, valstr;
6328 cmd_getval(cmdmap, "key", key);
6329 cmd_getval(cmdmap, "val", valstr);
6330
6331 val.append(valstr);
6332 newattrs[key] = val;
6333 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6334 r = store->queue_transaction(service->meta_ch, std::move(t));
6335 if (r < 0)
6336 ss << "error=" << r;
6337 else
6338 ss << "ok";
6339 } else if (command == "rmomapkey") {
6340 string key;
6341 cmd_getval(cmdmap, "key", key);
6342
6343 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6344 r = store->queue_transaction(service->meta_ch, std::move(t));
6345 if (r < 0)
6346 ss << "error=" << r;
6347 else
6348 ss << "ok";
6349 } else if (command == "setomapheader") {
6350 bufferlist newheader;
6351 string headerstr;
6352
6353 cmd_getval(cmdmap, "header", headerstr);
6354 newheader.append(headerstr);
6355 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6356 r = store->queue_transaction(service->meta_ch, std::move(t));
6357 if (r < 0)
6358 ss << "error=" << r;
6359 else
6360 ss << "ok";
6361 } else if (command == "getomap") {
6362 //Debug: Output entire omap
6363 bufferlist hdrbl;
6364 map<string, bufferlist> keyvals;
6365 auto ch = store->open_collection(coll_t(pgid));
6366 if (!ch) {
6367 ss << "unable to open collection for " << pgid;
6368 r = -ENOENT;
6369 } else {
6370 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6371 if (r >= 0) {
6372 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6373 for (map<string, bufferlist>::iterator it = keyvals.begin();
6374 it != keyvals.end(); ++it)
6375 ss << " key=" << (*it).first << " val="
6376 << string((*it).second.c_str(), (*it).second.length());
6377 } else {
6378 ss << "error=" << r;
6379 }
6380 }
6381 } else if (command == "truncobj") {
6382 int64_t trunclen;
6383 cmd_getval(cmdmap, "len", trunclen);
6384 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6385 r = store->queue_transaction(service->meta_ch, std::move(t));
6386 if (r < 0)
6387 ss << "error=" << r;
6388 else
6389 ss << "ok";
6390 } else if (command == "injectdataerr") {
6391 store->inject_data_error(gobj);
6392 ss << "ok";
6393 } else if (command == "injectmdataerr") {
6394 store->inject_mdata_error(gobj);
6395 ss << "ok";
6396 }
6397 return;
6398 }
6399 if (command == "set_recovery_delay") {
6400 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
6401 ostringstream oss;
6402 oss << delay;
6403 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6404 oss.str().c_str());
6405 if (r != 0) {
6406 ss << "set_recovery_delay: error setting "
6407 << "osd_recovery_delay_start to '" << delay << "': error "
6408 << r;
6409 return;
6410 }
6411 service->cct->_conf.apply_changes(nullptr);
6412 ss << "set_recovery_delay: set osd_recovery_delay_start "
6413 << "to " << service->cct->_conf->osd_recovery_delay_start;
6414 return;
6415 }
6416 if (command == "injectfull") {
6417 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6418 string type = cmd_getval_or<string>(cmdmap, "type", "full");
6419 OSDService::s_names state;
6420
6421 if (type == "none" || count == 0) {
6422 type = "none";
6423 count = 0;
6424 }
6425 state = service->get_full_state(type);
6426 if (state == OSDService::s_names::INVALID) {
6427 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6428 return;
6429 }
6430 service->set_injectfull(state, count);
6431 return;
6432 }
6433 ss << "Internal error - command=" << command;
6434 }
6435
6436 // =========================================
6437
// Messenger callback: a connection was (re)established.  Only the mon
// connection matters here: depending on boot state we either continue
// the boot sequence or resend all mon-bound state, since a new mon
// session starts with a clean slate on the mon side.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // active (or later) state: replay everything the mon may have
      // lost with the old session
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6481
// Fast-dispatch callback for outgoing connections: attach a fresh
// Session to a new OSD-to-OSD connection if it does not have one.
// Mon/mgr connections are skipped; we never initiate connections to
// clients (asserted below).
void OSD::ms_handle_fast_connect(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
      s = ceph::make_ref<Session>(cct, con);
      con->set_priv(s);
      dout(10) << " new session (outgoing) " << s << " con=" << s->con
               << " addr=" << s->con->get_peer_addr() << dendl;
      // we don't connect to clients
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6497
// Fast-dispatch callback for incoming connections: attach a fresh
// Session if the connection does not already have one (it normally
// will, unless we raced with our own outgoing connect to the same
// peer).  Mon/mgr connections are skipped.
void OSD::ms_handle_fast_accept(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
      s = ceph::make_ref<Session>(cct, con);
      con->set_priv(s);
      dout(10) << "new session (incoming)" << s << " con=" << con
               << " addr=" << con->get_peer_addr()
               << " must have raced with connect" << dendl;
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6513
// Messenger callback: a connection was reset by the peer.  Tear down
// the associated Session (watch state, backoffs) if one exists.
// Returns true iff a session was found and cleaned up.
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset(); // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6529
// Messenger callback: connection attempt was actively refused
// (ECONNREFUSED).  When osd_fast_fail_on_connection_refused is set,
// immediately report the refusing peer OSD as failed to the mon
// instead of waiting out the heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // match the refused address against every channel of every OSD
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6562
6563 struct CB_OSD_GetVersion {
6564 OSD *osd;
6565 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6566 void operator ()(boost::system::error_code ec, version_t newest,
6567 version_t oldest) {
6568 if (!ec)
6569 osd->_got_mon_epochs(oldest, newest);
6570 }
6571 };
6572
// Begin (or retry) the boot sequence.  If we do not currently look
// healthy, stay down and wait; otherwise enter PREBOOT and ask the
// mon for its osdmap epoch range (reply arrives asynchronously via
// CB_OSD_GetVersion -> _got_mon_epochs -> _preboot).
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  monc->get_version("osdmap", CB_OSD_GetVersion(this));
}
6590
6591 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6592 {
6593 std::lock_guard l(osd_lock);
6594 if (is_preboot()) {
6595 _preboot(oldest, newest);
6596 }
6597 }
6598
// Pre-boot gating: given the mon's oldest/newest osdmap epochs,
// decide whether we may send MOSDBoot yet.  Each branch below is a
// reason to keep waiting (or, for a destroyed OSD, to exit); if none
// applies and our map is recent enough, _send_boot() is queued on the
// boot finisher.  Otherwise we subscribe for newer maps and retry on
// a later tick.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
             superblock.purged_snaps_last < superblock.current_epoch) {
    // catch up on purged-snap records before booting (octopus+ mons)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
             << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
        [this](int r) {
          std::unique_lock l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            l.unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(get_osdmap_epoch());
            }
            l.lock();
          }
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6668
6669 void OSD::_get_purged_snaps()
6670 {
6671 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6672 // overlapping requests to the mon, which will be somewhat inefficient, but
6673 // it should be reliable.
6674 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6675 << ", newest_map " << superblock.current_epoch << dendl;
6676 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6677 superblock.purged_snaps_last + 1,
6678 superblock.current_epoch + 1);
6679 monc->send_mon_message(m);
6680 }
6681
// Handle the mon's reply to _get_purged_snaps(): persist the reported
// purged snaps through the SnapMapper, advance purged_snaps_last in
// the superblock, then either request the next epoch range or resume
// the boot sequence.  Stale replies are dropped.
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    // stale reply: we left preboot, or already recorded past this point
    goto out;
  } else {
    OSDriver osdriver{store.get(), service.meta_ch, make_purged_snaps_oid()};
    SnapMapper::record_purged_snaps(
      cct,
      osdriver,
      osdriver.get_transaction(&t),
      m->purged_snaps);
  }
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    // more epochs remain; ask for the next chunk
    _get_purged_snaps();
  } else {
    start_boot();
  }
 out:
  m->put();
}
6711
6712 void OSD::send_full_update()
6713 {
6714 if (!service.need_fullness_update())
6715 return;
6716 unsigned state = 0;
6717 if (service.is_full()) {
6718 state = CEPH_OSD_FULL;
6719 } else if (service.is_backfillfull()) {
6720 state = CEPH_OSD_BACKFILLFULL;
6721 } else if (service.is_nearfull()) {
6722 state = CEPH_OSD_NEARFULL;
6723 }
6724 set<string> s;
6725 OSDMap::calc_state_set(state, s);
6726 dout(10) << __func__ << " want state " << s << dendl;
6727 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6728 }
6729
// Enter WAITING_FOR_HEALTHY: we will not try to mark ourselves up
// until our heartbeat peers look sufficiently healthy again (see
// _is_healthy()).
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  // force heartbeat peers to be re-sampled at the next opportunity
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6739
6740 bool OSD::_is_healthy()
6741 {
6742 if (!cct->get_heartbeat_map()->is_healthy()) {
6743 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6744 return false;
6745 }
6746
6747 if (is_waiting_for_healthy()) {
6748 utime_t now = ceph_clock_now();
6749 if (osd_markdown_log.empty()) {
6750 dout(5) << __func__ << " force returning true since last markdown"
6751 << " was " << cct->_conf->osd_max_markdown_period
6752 << "s ago" << dendl;
6753 return true;
6754 }
6755 std::lock_guard l(heartbeat_lock);
6756 int num = 0, up = 0;
6757 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6758 p != heartbeat_peers.end();
6759 ++p) {
6760 if (p->second.is_healthy(now))
6761 ++up;
6762 ++num;
6763 }
6764 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6765 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6766 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6767 return false;
6768 }
6769 }
6770
6771 return true;
6772 }
6773
// Compose and send MOSDBoot to the mon.  Resolves any still-unknown
// addresses on the cluster and heartbeat messengers (deriving them
// from the client/cluster addresses), ensures each loopback
// connection has a session, records NUMA affinity in the boot
// metadata, and finally transitions to BOOTING.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  // fill in unbound cluster addrs from the client-side addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // same dance for the back heartbeat messenger (based on cluster addrs)
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // ... and the front heartbeat messenger (based on client addrs)
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6836
// Gather this OSD's metadata key/value map (sent to the mon in
// MOSDBoot): config paths, messenger addresses, objectstore and
// device properties, system info, and NUMA topology.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  // optional metadata recorded at OSD creation time; report empty
  // strings rather than omitting the keys on read failure
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  string ceph_version_when_created;
  r = store->read_meta("ceph_version_when_created", &ceph_version_when_created);
  if (r <0 || ceph_version_when_created.empty()) {
    ceph_version_when_created = "";
  }
  (*pm)["ceph_version_when_created"] = ceph_version_when_created;
  string created_at;
  r = store->read_meta("created_at", &created_at);
  if (r < 0 || created_at.empty()) {
    created_at = "";
  }
  (*pm)["created_at"] = created_at;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // network interfaces for the public (front) and cluster (back) nets
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single authoritative node when both interfaces
    // are known and agree
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // per-device metadata; failures are logged but non-fatal
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6931
// Record that peering wants the mon to advance our up_thru to at
// least 'want', and nudge the mon (via send_alive) when this raises
// the previously wanted epoch.  Acquires map_lock (shared) and then
// mon_report_lock, in that order.
void OSD::queue_want_up_thru(epoch_t want)
{
  std::shared_lock map_locker{map_lock};
  epoch_t cur = get_osdmap()->get_up_thru(whoami);
  std::lock_guard report_locker(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
             << ", currently " << cur
             << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asking for an epoch at least this new; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
             << ", currently " << cur
             << dendl;
  }
}
6949
6950 void OSD::send_alive()
6951 {
6952 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6953 const auto osdmap = get_osdmap();
6954 if (!osdmap->exists(whoami))
6955 return;
6956 epoch_t up_thru = osdmap->get_up_thru(whoami);
6957 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6958 if (up_thru_wanted > up_thru) {
6959 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6960 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6961 }
6962 }
6963
// Ask the mon for full (non-incremental) osdmaps in [first, last],
// coalescing with any outstanding request:
//  - nothing outstanding: request the whole range
//  - range already covered: drop the duplicate
//  - range extends past what we asked for: request only the new tail
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
           << ", previously requested "
           << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6989
// Note receipt of full map epoch `e` and advance (or reset) the outstanding
// requested_full_{first,last} range accordingly. Caller must hold osd_lock.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no request is outstanding; this full map arrived unsolicited
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale: older than anything we are waiting for
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // the whole requested range is satisfied; clear it
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partially satisfied: still waiting on (e, requested_full_last]
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
7017
// Move every in-flight (pending) failure report back onto the failure queue
// so it will be re-sent to the monitor (e.g. after a mon session reset).
void OSD::requeue_failures()
{
  std::lock_guard l(heartbeat_lock);
  unsigned old_queue = failure_queue.size();
  unsigned old_pending = failure_pending.size();
  for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
    // pending value is pair<failed-since-time, addrs>; the queue keeps the time
    failure_queue[p->first] = p->second.first;
    failure_pending.erase(p++);
  }
  dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
	   << failure_queue.size() << dendl;
}
7030
// Drain the failure queue: report each failed peer to the monitor (unless a
// report for it is already pending) and remember the report in
// failure_pending so it can later be cancelled with send_still_alive().
// Preconditions: caller holds map_lock and mon_report_lock; heartbeat_lock
// is taken here.
void OSD::send_failures()
{
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // queued value is the time we first saw the peer fail
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      // remember time + addrs so the report can be cancelled later
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
7055
7056 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7057 {
7058 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
7059 MOSDFailure::FLAG_ALIVE);
7060 monc->send_mon_message(m);
7061 }
7062
// Cancel every in-flight failure report by sending a FLAG_ALIVE message for
// each pending peer, then clear the pending set.
void OSD::cancel_pending_failures()
{
  std::lock_guard l(heartbeat_lock);
  auto it = failure_pending.begin();
  while (it != failure_pending.end()) {
    dout(10) << __func__ << " canceling in-flight failure report for osd."
	     << it->first << dendl;
    // second.second holds the addrs captured when the report was sent
    send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
    failure_pending.erase(it++);
  }
}
7074
// Send an MOSDBeacon to the monitor carrying our min_last_epoch_clean and
// last purged-snaps scrub stamp. Skipped if the monmap is not yet known or
// the mons do not all support LUMINOUS features.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot beacon payload under min_last_epoch_clean_lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
			      min_last_epoch_clean,
			      superblock.last_purged_snaps_scrub,
			      cct->_conf->osd_beacon_report_interval);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    // send outside the lock
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
7099
7100 void OSD::handle_command(MCommand *m)
7101 {
7102 ConnectionRef con = m->get_connection();
7103 auto session = ceph::ref_cast<Session>(con->get_priv());
7104 if (!session) {
7105 con->send_message(new MCommandReply(m, -EACCES));
7106 m->put();
7107 return;
7108 }
7109 if (!session->caps.allow_all()) {
7110 con->send_message(new MCommandReply(m, -EACCES));
7111 m->put();
7112 return;
7113 }
7114 cct->get_admin_socket()->queue_tell_command(m);
7115 m->put();
7116 }
7117
7118 namespace {
7119 class unlock_guard {
7120 ceph::mutex& m;
7121 public:
7122 explicit unlock_guard(ceph::mutex& mutex)
7123 : m(mutex)
7124 {
7125 m.unlock();
7126 }
7127 unlock_guard(unlock_guard&) = delete;
7128 ~unlock_guard() {
7129 m.lock();
7130 }
7131 };
7132 }
7133
// Scrub the purged-snaps record against the snap mapper, requeue snap trims
// for any stray entries, and stamp the superblock with the scrub time.
// Note the lock choreography: osd_lock is held on entry, dropped around the
// (long-running) scrubber and PG requeueing, then re-taken before touching
// the superblock.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
			 make_snapmapper_oid(),
			 make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // avoid requeueing the same (pg, snap) pair more than once
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    // map the stray's hash back to the owning pg / shard
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
	       << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
	     << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  osd_lock.lock();
  if (is_stopping()) {
    return;  // shutting down; skip the superblock update
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
7190
7191 void OSD::probe_smart(const string& only_devid, ostream& ss)
7192 {
7193 set<string> devnames;
7194 store->get_devices(&devnames);
7195 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7196 "osd_smart_report_timeout");
7197
7198 // == typedef std::map<std::string, mValue> mObject;
7199 json_spirit::mObject json_map;
7200
7201 for (auto dev : devnames) {
7202 // smartctl works only on physical devices; filter out any logical device
7203 if (dev.find("dm-") == 0) {
7204 continue;
7205 }
7206
7207 string err;
7208 string devid = get_device_id(dev, &err);
7209 if (devid.size() == 0) {
7210 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7211 << err << "), skipping" << dendl;
7212 continue;
7213 }
7214 if (only_devid.size() && devid != only_devid) {
7215 continue;
7216 }
7217
7218 json_spirit::mValue smart_json;
7219 if (block_device_get_metrics(dev, smart_timeout,
7220 &smart_json)) {
7221 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7222 continue;
7223 }
7224 json_map[devid] = smart_json;
7225 }
7226 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7227 }
7228
// Dispatch entry point for the heartbeat messengers. Only pings are
// expected; every branch consumes the message reference (handle_osd_ping
// is assumed to take ownership — TODO confirm against its definition).
bool OSD::heartbeat_dispatch(Message *m)
{
  dout(30) << "heartbeat_dispatch " << m << dendl;
  switch (m->get_type()) {

  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source_inst() << dendl;
    m->put();
    break;

  case MSG_OSD_PING:
    handle_osd_ping(static_cast<MOSDPing*>(m));
    break;

  default:
    // anything else on the heartbeat channel is unexpected; drop it
    dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
    m->put();
  }

  return true;
}
7250
// Slow-path dispatch: handles MARK_ME_DOWN acks directly, otherwise takes
// osd_lock and forwards to _dispatch(). Always claims the message
// (returns true).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    // ack for our mark-me-down request; no lock needed
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    m->put();
    return true;
  }

  _dispatch(m);

  osd_lock.unlock();

  return true;
}
7275
// Share our osdmap with a peer if we believe it is behind. `peer_epoch_lb`
// is a lower bound on the epoch the peer already has (e.g. an op's
// sent_epoch); the per-session last_sent_epoch is raised to it before
// deciding whether an incremental map needs to be sent. The epoch is
// re-checked under sent_epoch_lock after sending, since a concurrent caller
// may have shared a newer map in the meantime.
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    return;  // peer is already up to date
  }

  // send outside the lock; it may block
  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  // only advance the recorded epoch; a racing sharer may have gone further
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
7318
// Drain a session's waiting_on_map queue: enqueue every op whose min epoch
// is satisfied by `osdmap`, stopping at the first op that still needs a
// newer map (queue order must be preserved). Caller must hold the session's
// session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);  // take a local ref to the intrusive-list entry
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      break;  // this (and everything after it) needs a newer map
    }
    session->waiting_on_map.erase(i++);
    op->put();  // drop the reference the waiting list held

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries no spg_t; resolve it against the map
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	continue;  // no primary shard in this map; drop the op
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // (de)register so we are notified when new maps arrive, iff ops remain
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7353
// Fast-dispatch entry point: routes peering events and per-PG messages
// directly to their queues without taking osd_lock. Legacy MOSDOps from
// clients lacking RESEND_ON_SPLIT are staged on the session so they can be
// mapped to an spg_t in delivery order.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }
  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;
  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }
  op->osd_parent_span = tracing::osd::tracer.start_trace("op-request-created");

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      op->get();  // extra ref held by the session's waiting list
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7446
// Messenger auth callback: create (or reuse) a Session for the connection
// and install its caps from the peer's AuthCapsInfo. Returns 1 when caps
// were parsed, 0 when nothing needed parsing, -EACCES on decode/parse
// failure.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto s = ceph::ref_cast<Session>(con->get_priv());
  if (!s) {
    // first time we see this connection: attach a new session
    s = ceph::make_ref<Session>(cct, con);
    con->set_priv(s);
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all) {
    s->caps.set_allow_all();
  } else if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string; decode then parse into OSDCap
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (ceph::buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EACCES;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EACCES;
      }
    }
  }
  return ret;
}
7494
// Slow-path message handler; runs under osd_lock (see ms_dispatch).
// Handlers consume the message reference. Note MSG_COMMAND uses `return`
// rather than `break` — behavior is the same here since nothing follows
// the switch.
void OSD::_dispatch(Message *m)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;
  }
}
7517
// Handle a scrub request (MOSDScrub2) from a mon or mgr: queue a
// RequestScrub peering event for every listed PG. Drops the message if the
// sender is not a mon/mgr peer or the fsid does not match.
void OSD::handle_fast_scrub(MOSDScrub2 *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }
  for (auto pgid : m->scrub_pgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->epoch,
	  m->epoch,
	  PeeringState::RequestScrub(m->deep, m->repair))));
  }
  m->put();
}
7542
7543 bool OSD::scrub_random_backoff()
7544 {
7545 bool coin_flip = (rand() / (double)RAND_MAX >=
7546 cct->_conf->osd_scrub_backoff_ratio);
7547 if (!coin_flip) {
7548 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off (ratio: "
7549 << cct->_conf->osd_scrub_backoff_ratio << ")" << dendl;
7550 return true;
7551 }
7552 return false;
7553 }
7554
7555
// Periodic scrub scheduling tick: bail out early if no scrub slots are
// free or a replica-reservation is already in progress, then let the scrub
// scheduler pick a PG and start it (subject to recovery-related
// preconditions).
void OSD::sched_scrub()
{
  auto& scrub_scheduler = service.get_scrub_services();

  if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count();
      blocked_pgs > 0) {
    // some PGs managed by this OSD were blocked by a locked object during
    // scrub. This means we might not have the resources needed to scrub now.
    dout(10)
      << fmt::format(
	   "{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
	   __func__,
	   blocked_pgs)
      << dendl;
  }

  // fail fast if no resources are available
  if (!scrub_scheduler.can_inc_scrubs()) {
    dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
    return;
  }

  // if there is a PG that is just now trying to reserve scrub replica resources -
  // we should wait and not initiate a new scrub
  if (scrub_scheduler.is_reserving_now()) {
    dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
    return;
  }

  Scrub::ScrubPreconds env_conditions;

  if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
    if (!cct->_conf->osd_repair_during_recovery) {
      dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
	       << dendl;
      return;
    }
    // recovery is active but explicit repairs are still allowed
    dout(10) << __func__
	     << " will only schedule explicitly requested repair due to active recovery"
	     << dendl;
    env_conditions.allow_requested_repair_only = true;
  }

  // at high debug levels, dump the whole scrub queue for diagnostics
  if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
    dout(20) << __func__ << " sched_scrub starts" << dendl;
    auto all_jobs = scrub_scheduler.list_registered_jobs();
    for (const auto& sj : all_jobs) {
      dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
    }
  }

  auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
  dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
	   << ")" << dendl;
}
7611
// Attempt to start a scrub on the candidate PG chosen by the scheduler.
// Returns the scheduling outcome so the caller can move on to the next
// candidate: no_such_pg if the PG vanished, already_started if it is
// queued/active, preconditions if only explicit repairs are allowed and
// this PG has none requested, otherwise the PG's own sched_scrub() result.
Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
						      bool allow_requested_repair_only)
{
  dout(20) << __func__ << " trying " << pgid << dendl;

  // we have a candidate to scrub. We need some PG information to know if scrubbing is
  // allowed

  PGRef pg = osd->lookup_lock_pg(pgid);
  if (!pg) {
    // the PG was dequeued in the short timespan between creating the candidates list
    // (collect_ripe_jobs()) and here
    dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
    return Scrub::schedule_result_t::no_such_pg;
  }

  // This has already started, so go on to the next scrub job
  if (pg->is_scrub_queued_or_active()) {
    pg->unlock();
    dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
    return Scrub::schedule_result_t::already_started;
  }
  // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
  if (allow_requested_repair_only && !pg->get_planned_scrub().must_repair) {
    pg->unlock();
    dout(10) << __func__ << " skip " << pgid
	     << " because repairing is not explicitly requested on it" << dendl;
    return Scrub::schedule_result_t::preconditions;
  }

  auto scrub_attempt = pg->sched_scrub();
  pg->unlock();
  return scrub_attempt;
}
7646
// Recompute the scrub schedule for every registered scrub job, skipping PGs
// with an operator-requested (must_scrub) or auto-repair (need_auto) scrub
// pending, since those should keep their current slot.
void OSD::resched_all_scrubs()
{
  dout(10) << __func__ << ": start" << dendl;
  auto all_jobs = service.get_scrub_services().list_registered_jobs();
  for (auto& e : all_jobs) {

    auto& job = *e;
    dout(20) << __func__ << ": examine " << job.pgid << dendl;

    PGRef pg = _lookup_lock_pg(job.pgid);
    if (!pg)
      continue;  // PG went away since the job list was built

    if (!pg->get_planned_scrub().must_scrub && !pg->get_planned_scrub().need_auto) {
      dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
      pg->reschedule_scrub();
    }
    pg->unlock();
  }
  dout(10) << __func__ << ": done" << dendl;
}
7668
// Build an MPGStats message for the mgr containing this OSD's stats, the
// stats of every primary PG, and (when the store supports it) per-pool
// statfs. Also recomputes min_last_epoch_clean from the reporting PGs.
// Caller owns the returned message.
MPGStats* OSD::collect_pg_stats()
{
  dout(15) << __func__ << dendl;
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called. This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  std::shared_lock l{map_lock};

  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();

  auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
  m->osd_stat = cur_stat;

  // held for the rest of the function: min_last_epoch_clean{,_pgs} are
  // rebuilt below and read by send_beacon()
  std::lock_guard lec{min_last_epoch_clean_lock};
  min_last_epoch_clean = get_osdmap_epoch();
  min_last_epoch_clean_pgs.clear();

  auto now_is = ceph::coarse_real_clock::now();

  std::set<int64_t> pool_set;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    auto pool = pg->pg_id.pgid.pool();
    pool_set.emplace((int64_t)pool);
    if (!pg->is_primary()) {
      continue;  // replicas don't report stats
    }
    pg->with_pg_stats(now_is, [&](const pg_stat_t& s, epoch_t lec) {
      m->pg_stat[pg->pg_id.pgid] = s;
      min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
      min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
    });
  }
  store_statfs_t st;
  bool per_pool_stats = true;
  bool per_pool_omap_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
    if (r == -ENOTSUP) {
      // backend can't report per-pool stats; fall back to aggregate only
      per_pool_stats = false;
      break;
    } else {
      assert(r >= 0);
      m->pool_stat[p] = st;
    }
  }

  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
  m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;

  return m;
}
7726
7727 vector<DaemonHealthMetric> OSD::get_health_metrics()
7728 {
7729 vector<DaemonHealthMetric> metrics;
7730 {
7731 utime_t oldest_secs;
7732 const utime_t now = ceph_clock_now();
7733 auto too_old = now;
7734 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7735 int slow = 0;
7736 TrackedOpRef oldest_op;
7737 OSDMapRef osdmap = get_osdmap();
7738 // map of slow op counts by slow op event type for an aggregated logging to
7739 // the cluster log.
7740 map<uint8_t, int> slow_op_types;
7741 // map of slow op counts by pool for reporting a pool name with highest
7742 // slow ops.
7743 map<uint64_t, int> slow_op_pools;
7744 bool log_aggregated_slow_op =
7745 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
7746 auto count_slow_ops = [&](TrackedOp& op) {
7747 if (op.get_initiated() < too_old) {
7748 stringstream ss;
7749 ss << "slow request " << op.get_desc()
7750 << " initiated "
7751 << op.get_initiated()
7752 << " currently "
7753 << op.state_string();
7754 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7755 if (log_aggregated_slow_op) {
7756 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7757 uint8_t op_type = req->state_flag();
7758 auto m = req->get_req<MOSDFastDispatchOp>();
7759 uint64_t poolid = m->get_spg().pgid.m_pool;
7760 slow_op_types[op_type]++;
7761 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7762 slow_op_pools[poolid]++;
7763 }
7764 }
7765 } else {
7766 clog->warn() << ss.str();
7767 }
7768 slow++;
7769 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7770 oldest_op = &op;
7771 }
7772 return true;
7773 } else {
7774 return false;
7775 }
7776 };
7777 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7778 if (slow) {
7779 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7780 << oldest_op->get_desc() << dendl;
7781 if (log_aggregated_slow_op &&
7782 slow_op_types.size() > 0) {
7783 stringstream ss;
7784 ss << slow << " slow requests (by type [ ";
7785 for (const auto& [op_type, count] : slow_op_types) {
7786 ss << "'" << OpRequest::get_state_string(op_type)
7787 << "' : " << count
7788 << " ";
7789 }
7790 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7791 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7792 return p1.second < p2.second;
7793 });
7794 if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7795 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7796 ss << "] most affected pool [ '"
7797 << pool_name
7798 << "' : "
7799 << slow_pool_it->second
7800 << " ])";
7801 } else {
7802 ss << "])";
7803 }
7804 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7805 clog->warn() << ss.str();
7806 }
7807 }
7808 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7809 } else {
7810 // no news is not good news.
7811 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7812 }
7813 }
7814 {
7815 std::lock_guard l(pending_creates_lock);
7816 auto n_primaries = pending_creates_from_mon;
7817 for (const auto& create : pending_creates_from_osd) {
7818 if (create.second) {
7819 n_primaries++;
7820 }
7821 }
7822 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7823 }
7824 return metrics;
7825 }
7826
7827 // =====================================================
7828 // MAP
7829 /** update_map
7830 * assimilate new OSDMap(s). scan pgs, etc.
7831 */
7832
// React to a peer OSD going down in the map: close its cluster connections
// and purge it from the failure queues and heartbeat peer set. Caller must
// hold osd_lock.
void OSD::note_down_osd(int peer)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));

  std::lock_guard l{heartbeat_lock};
  failure_queue.erase(peer);
  failure_pending.erase(peer);
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
  if (p != heartbeat_peers.end()) {
    p->second.clear_mark_down();
    heartbeat_peers.erase(p);
  }
}
7847
// React to a peer OSD coming up: flag the heartbeat peer set for a refresh.
void OSD::note_up_osd(int peer)
{
  heartbeat_set_peers_need_update();
}
7852
// Completion fired once osdmaps [first,last] are durably committed to the
// store; forwards to OSD::_committed_osd_maps and then drops the message
// reference captured at construction time.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was committed
  MOSDMap *msg;         // the originating map message (ref owned until finish)
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7864
// Subscribe to osdmaps starting at `epoch` (one-shot). Requests are
// deduplicated against latest_subscribed_epoch unless force_request is set.
void OSD::osdmap_subscribe(version_t epoch, bool force_request)
{
  std::lock_guard l(osdmap_subscribe_lock);
  if (latest_subscribed_epoch >= epoch && !force_request)
    return;  // an equal-or-newer subscription is already outstanding

  latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);

  if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
      force_request) {
    monc->renew_subs();
  }
}
7878
// Remove stored osdmaps older than `oldest` (bounded by what the map cache
// still pins), advancing superblock.oldest_map as we go. Deletions are
// batched into transactions of at most osd_target_transaction_size; unless
// skip_maps is set, we stop after one batch and let later calls continue.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim past what the map cache still references
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch of deletions along with the updated superblock
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // flush the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7918
7919 void OSD::handle_osd_map(MOSDMap *m)
7920 {
7921 // wait for pgs to catch up
7922 {
7923 // we extend the map cache pins to accomodate pgs slow to consume maps
7924 // for some period, until we hit the max_lag_factor bound, at which point
7925 // we block here to stop injesting more maps than they are able to keep
7926 // up with.
7927 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7928 m_osd_pg_epoch_max_lag_factor;
7929 ceph_assert(max_lag > 0);
7930 epoch_t osd_min = 0;
7931 for (auto shard : shards) {
7932 epoch_t min = shard->get_min_pg_epoch();
7933 if (osd_min == 0 || min < osd_min) {
7934 osd_min = min;
7935 }
7936 }
7937 epoch_t osdmap_epoch = get_osdmap_epoch();
7938 if (osd_min > 0 &&
7939 osdmap_epoch > max_lag &&
7940 osdmap_epoch - max_lag > osd_min) {
7941 epoch_t need = osdmap_epoch - max_lag;
7942 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7943 << " max_lag " << max_lag << ")" << dendl;
7944 for (auto shard : shards) {
7945 epoch_t min = shard->get_min_pg_epoch();
7946 if (need > min) {
7947 dout(10) << __func__ << " waiting for pgs to consume " << need
7948 << " (shard " << shard->shard_id << " min " << min
7949 << ", map cache is " << cct->_conf->osd_map_cache_size
7950 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7951 << ")" << dendl;
7952 unlock_guard unlock{osd_lock};
7953 shard->wait_min_pg_epoch(need);
7954 }
7955 }
7956 }
7957 }
7958
7959 ceph_assert(ceph_mutex_is_locked(osd_lock));
7960 map<epoch_t,OSDMapRef> added_maps;
7961 map<epoch_t,bufferlist> added_maps_bl;
7962 if (m->fsid != monc->get_fsid()) {
7963 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7964 << monc->get_fsid() << dendl;
7965 m->put();
7966 return;
7967 }
7968 if (is_initializing()) {
7969 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7970 m->put();
7971 return;
7972 }
7973
7974 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7975 if (session && !(session->entity_name.is_mon() ||
7976 session->entity_name.is_osd())) {
7977 //not enough perms!
7978 dout(10) << "got osd map from Session " << session
7979 << " which we can't take maps from (not a mon or osd)" << dendl;
7980 m->put();
7981 return;
7982 }
7983
7984 // share with the objecter
7985 if (!is_preboot())
7986 service.objecter->handle_osd_map(m);
7987
7988 epoch_t first = m->get_first();
7989 epoch_t last = m->get_last();
7990 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7991 << superblock.newest_map
7992 << ", src has [" << m->cluster_osdmap_trim_lower_bound
7993 << "," << m->newest_map << "]"
7994 << dendl;
7995
7996 logger->inc(l_osd_map);
7997 logger->inc(l_osd_mape, last - first + 1);
7998 if (first <= superblock.newest_map)
7999 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8000
8001 if (superblock.cluster_osdmap_trim_lower_bound <
8002 m->cluster_osdmap_trim_lower_bound) {
8003 superblock.cluster_osdmap_trim_lower_bound =
8004 m->cluster_osdmap_trim_lower_bound;
8005 dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: "
8006 << superblock.cluster_osdmap_trim_lower_bound << dendl;
8007 ceph_assert(
8008 superblock.cluster_osdmap_trim_lower_bound >= superblock.oldest_map);
8009 }
8010
8011 // make sure there is something new, here, before we bother flushing
8012 // the queues and such
8013 if (last <= superblock.newest_map) {
8014 dout(10) << " no new maps here, dropping" << dendl;
8015 m->put();
8016 return;
8017 }
8018
8019 // missing some?
8020 bool skip_maps = false;
8021 if (first > superblock.newest_map + 1) {
8022 dout(10) << "handle_osd_map message skips epochs "
8023 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8024 if (m->cluster_osdmap_trim_lower_bound <= superblock.newest_map + 1) {
8025 osdmap_subscribe(superblock.newest_map + 1, false);
8026 m->put();
8027 return;
8028 }
8029 // always try to get the full range of maps--as many as we can. this
8030 // 1- is good to have
8031 // 2- is at present the only way to ensure that we get a *full* map as
8032 // the first map!
8033 if (m->cluster_osdmap_trim_lower_bound < first) {
8034 osdmap_subscribe(m->cluster_osdmap_trim_lower_bound - 1, true);
8035 m->put();
8036 return;
8037 }
8038 skip_maps = true;
8039 }
8040
8041 ObjectStore::Transaction t;
8042 uint64_t txn_size = 0;
8043
8044 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8045
8046 // store new maps: queue for disk and put in the osdmap cache
8047 epoch_t start = std::max(superblock.newest_map + 1, first);
8048 for (epoch_t e = start; e <= last; e++) {
8049 if (txn_size >= t.get_num_bytes()) {
8050 derr << __func__ << " transaction size overflowed" << dendl;
8051 ceph_assert(txn_size < t.get_num_bytes());
8052 }
8053 txn_size = t.get_num_bytes();
8054 map<epoch_t,bufferlist>::iterator p;
8055 p = m->maps.find(e);
8056 if (p != m->maps.end()) {
8057 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8058 OSDMap *o = new OSDMap;
8059 bufferlist& bl = p->second;
8060
8061 o->decode(bl);
8062
8063 purged_snaps[e] = o->get_new_purged_snaps();
8064
8065 ghobject_t fulloid = get_osdmap_pobject_name(e);
8066 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8067 added_maps[e] = add_map(o);
8068 added_maps_bl[e] = bl;
8069 got_full_map(e);
8070 continue;
8071 }
8072
8073 p = m->incremental_maps.find(e);
8074 if (p != m->incremental_maps.end()) {
8075 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8076 bufferlist& bl = p->second;
8077 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8078 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8079
8080 OSDMap *o = new OSDMap;
8081 if (e > 1) {
8082 bufferlist obl;
8083 bool got = get_map_bl(e - 1, obl);
8084 if (!got) {
8085 auto p = added_maps_bl.find(e - 1);
8086 ceph_assert(p != added_maps_bl.end());
8087 obl = p->second;
8088 }
8089 o->decode(obl);
8090 }
8091
8092 OSDMap::Incremental inc;
8093 auto p = bl.cbegin();
8094 inc.decode(p);
8095
8096 if (o->apply_incremental(inc) < 0) {
8097 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8098 ceph_abort_msg("bad fsid");
8099 }
8100
8101 bufferlist fbl;
8102 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8103
8104 bool injected_failure = false;
8105 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8106 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8107 derr << __func__ << " injecting map crc failure" << dendl;
8108 injected_failure = true;
8109 }
8110
8111 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8112 dout(2) << "got incremental " << e
8113 << " but failed to encode full with correct crc; requesting"
8114 << dendl;
8115 clog->warn() << "failed to encode map e" << e << " with expected crc";
8116 dout(20) << "my encoded map was:\n";
8117 fbl.hexdump(*_dout);
8118 *_dout << dendl;
8119 delete o;
8120 request_full_map(e, last);
8121 last = e - 1;
8122
8123 // don't continue committing if we failed to enc the first inc map
8124 if (last < start) {
8125 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8126 m->put();
8127 return;
8128 }
8129 break;
8130 }
8131 got_full_map(e);
8132 purged_snaps[e] = o->get_new_purged_snaps();
8133
8134 ghobject_t fulloid = get_osdmap_pobject_name(e);
8135 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8136 added_maps[e] = add_map(o);
8137 added_maps_bl[e] = fbl;
8138 continue;
8139 }
8140
8141 ceph_abort_msg("MOSDMap lied about what maps it had?");
8142 }
8143
8144 // even if this map isn't from a mon, we may have satisfied our subscription
8145 monc->sub_got("osdmap", last);
8146
8147 if (!m->maps.empty() && requested_full_first) {
8148 dout(10) << __func__ << " still missing full maps " << requested_full_first
8149 << ".." << requested_full_last << dendl;
8150 rerequest_full_maps();
8151 }
8152
8153 if (superblock.oldest_map) {
8154 // make sure we at least keep pace with incoming maps
8155 trim_maps(m->cluster_osdmap_trim_lower_bound,
8156 last - first + 1, skip_maps);
8157 pg_num_history.prune(superblock.oldest_map);
8158 }
8159
8160 if (!superblock.oldest_map || skip_maps)
8161 superblock.oldest_map = first;
8162 superblock.newest_map = last;
8163 superblock.current_epoch = last;
8164
8165 // note in the superblock that we were clean thru the prior epoch
8166 epoch_t boot_epoch = service.get_boot_epoch();
8167 if (boot_epoch && boot_epoch >= superblock.mounted) {
8168 superblock.mounted = boot_epoch;
8169 superblock.clean_thru = last;
8170 }
8171
8172 // check for pg_num changes and deleted pools
8173 OSDMapRef lastmap;
8174 for (auto& i : added_maps) {
8175 if (!lastmap) {
8176 if (!(lastmap = service.try_get_map(i.first - 1))) {
8177 dout(10) << __func__ << " can't get previous map " << i.first - 1
8178 << " probably first start of this osd" << dendl;
8179 continue;
8180 }
8181 }
8182 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8183 for (auto& j : lastmap->get_pools()) {
8184 if (!i.second->have_pg_pool(j.first)) {
8185 pg_num_history.log_pool_delete(i.first, j.first);
8186 dout(10) << __func__ << " recording final pg_pool_t for pool "
8187 << j.first << dendl;
8188 // this information is needed by _make_pg() if have to restart before
8189 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8190 ghobject_t obj = make_final_pool_info_oid(j.first);
8191 bufferlist bl;
8192 encode(j.second, bl, CEPH_FEATURES_ALL);
8193 string name = lastmap->get_pool_name(j.first);
8194 encode(name, bl);
8195 map<string,string> profile;
8196 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8197 profile = lastmap->get_erasure_code_profile(
8198 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8199 }
8200 encode(profile, bl);
8201 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8202 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8203 new_pg_num != j.second.get_pg_num()) {
8204 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8205 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8206 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8207 }
8208 }
8209 for (auto& j : i.second->get_pools()) {
8210 if (!lastmap->have_pg_pool(j.first)) {
8211 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8212 << j.second.get_pg_num() << dendl;
8213 pg_num_history.log_pg_num_change(i.first, j.first,
8214 j.second.get_pg_num());
8215 }
8216 }
8217 lastmap = i.second;
8218 }
8219 pg_num_history.epoch = last;
8220 {
8221 bufferlist bl;
8222 ::encode(pg_num_history, bl);
8223 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8224 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8225 }
8226
8227 // record new purged_snaps
8228 if (superblock.purged_snaps_last == start - 1) {
8229 OSDriver osdriver{store.get(), service.meta_ch, make_purged_snaps_oid()};
8230 SnapMapper::record_purged_snaps(
8231 cct,
8232 osdriver,
8233 osdriver.get_transaction(&t),
8234 purged_snaps);
8235 superblock.purged_snaps_last = last;
8236 } else {
8237 dout(10) << __func__ << " superblock purged_snaps_last is "
8238 << superblock.purged_snaps_last
8239 << ", not recording new purged_snaps" << dendl;
8240 }
8241
8242 // superblock and commit
8243 write_superblock(t);
8244 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8245 store->queue_transaction(
8246 service.meta_ch,
8247 std::move(t));
8248 service.publish_superblock(superblock);
8249 }
8250
// Completion callback fired once a batch of new osdmaps (epochs
// [first,last]) has been durably committed by handle_osd_map().
//
// Under osd_lock + map_lock this walks the committed epochs in order,
// publishing each map, noting peer up/down transitions, and then reacts
// to our own status in the newest map:
//  - booting -> active if the map shows us up at our current address;
//  - restart (rebind + reboot) if the map marked us down while we are
//    in fact still running;
//  - full shutdown if the map says we no longer exist or were stopped
//    by an administrator.
//
// @param first  first newly committed epoch
// @param last   last newly committed epoch (first <= last)
// @param m      the MOSDMap message that carried these maps; only read
//               here (its newest_map / source are consulted at the end)
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with us while we
  // were waiting for the lock
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blocklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) && // in old map
          newmap->is_down(*p)) { // but not the new one
        // drain any in-flight map reservations once, before the first
        // note_down_osd() of this epoch
        if (!waited_for_reservations) {
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared. it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // record the epoch at which the map first shows us up at our
    // current address (and, if unset, the boot epoch too)
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  epoch_t _bind_epoch = service.get_bind_epoch();
  // the map shows us up at our address since a bind newer than our
  // last rebind: the boot succeeded
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // marked down, or one of our registered addresses no longer
      // matches what we are actually bound to
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
        if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
          // note that this is best-effort...
          monc->send_mon_message(
            new MOSDMarkMeDead(
              monc->get_fsid(),
              whoami,
              osdmap->get_epoch()));
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        // prepare to restart: reset up epoch, remember the bind epoch
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        // too many markdowns within the grace period -> give up and
        // shut down instead of flapping forever
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          derr << __func__ << " marked down "
               << osd_markdown_log.size()
               << " > osd_max_markdown_count "
               << cct->_conf->osd_max_markdown_count
               << " in last " << grace << " seconds, shutting down"
               << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_messenger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          derr << __func__ << " marked down:"
               << " rebind cluster_messenger failed" << dendl;
        }

        // drop all heartbeat connections; they will be re-established
        // after rebind
        hb_back_server_messenger->mark_down_all();
        hb_front_server_messenger->mark_down_all();
        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
    derr << "map says i am stopped by admin. shutting down." << dendl;
    do_shutdown = true;
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    // the sender has newer maps than what it gave us; ask for the rest
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->cluster_osdmap_trim_lower_bound, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8509
8510 void OSD::check_osdmap_features()
8511 {
8512 // adjust required feature bits?
8513
8514 // we have to be a bit careful here, because we are accessing the
8515 // Policy structures without taking any lock. in particular, only
8516 // modify integer values that can safely be read by a racing CPU.
8517 // since we are only accessing existing Policy structures a their
8518 // current memory location, and setting or clearing bits in integer
8519 // fields, and we are the only writer, this is not a problem.
8520
8521 const auto osdmap = get_osdmap();
8522 {
8523 Messenger::Policy p = client_messenger->get_default_policy();
8524 uint64_t mask;
8525 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8526 if ((p.features_required & mask) != features) {
8527 dout(0) << "crush map has features " << features
8528 << ", adjusting msgr requires for clients" << dendl;
8529 p.features_required = (p.features_required & ~mask) | features;
8530 client_messenger->set_default_policy(p);
8531 }
8532 }
8533 {
8534 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8535 uint64_t mask;
8536 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8537 if ((p.features_required & mask) != features) {
8538 dout(0) << "crush map has features " << features
8539 << " was " << p.features_required
8540 << ", adjusting msgr requires for mons" << dendl;
8541 p.features_required = (p.features_required & ~mask) | features;
8542 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8543 }
8544 }
8545 {
8546 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8547 uint64_t mask;
8548 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8549
8550 if ((p.features_required & mask) != features) {
8551 dout(0) << "crush map has features " << features
8552 << ", adjusting msgr requires for osds" << dendl;
8553 p.features_required = (p.features_required & ~mask) | features;
8554 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8555 }
8556
8557 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8558 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8559 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8560 ObjectStore::Transaction t;
8561 write_superblock(t);
8562 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8563 ceph_assert(err == 0);
8564 }
8565 }
8566
8567 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8568 hb_front_server_messenger->set_require_authorizer(false);
8569 hb_back_server_messenger->set_require_authorizer(false);
8570 } else {
8571 hb_front_server_messenger->set_require_authorizer(true);
8572 hb_back_server_messenger->set_require_authorizer(true);
8573 }
8574
8575 if (osdmap->require_osd_release != last_require_osd_release) {
8576 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8577 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8578 store->write_meta("require_osd_release",
8579 stringify((int)osdmap->require_osd_release));
8580 last_require_osd_release = osdmap->require_osd_release;
8581 }
8582 }
8583
8584 struct C_FinishSplits : public Context {
8585 OSD *osd;
8586 set<PGRef> pgs;
8587 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8588 : osd(osd), pgs(in) {}
8589 void finish(int r) override {
8590 osd->_finish_splits(pgs);
8591 }
8592 };
8593
8594 void OSD::_finish_splits(set<PGRef>& pgs)
8595 {
8596 dout(10) << __func__ << " " << pgs << dendl;
8597 if (is_stopping())
8598 return;
8599 for (set<PGRef>::iterator i = pgs.begin();
8600 i != pgs.end();
8601 ++i) {
8602 PG *pg = i->get();
8603
8604 PeeringCtx rctx;
8605 pg->lock();
8606 dout(10) << __func__ << " " << *pg << dendl;
8607 epoch_t e = pg->get_osdmap_epoch();
8608 pg->handle_initialize(rctx);
8609 pg->queue_null(e, e);
8610 dispatch_context(rctx, pg, service.get_osdmap());
8611 pg->unlock();
8612
8613 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8614 shards[shard_index]->register_and_wake_split_child(pg);
8615 }
8616 };
8617
8618 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8619 unsigned need)
8620 {
8621 std::lock_guard l(merge_lock);
8622 auto& p = merge_waiters[nextmap->get_epoch()][target];
8623 p[src->pg_id] = src;
8624 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8625 << " for " << target << ", have " << p.size() << "/" << need
8626 << dendl;
8627 return p.size() == need;
8628 }
8629
// Advance a PG's cached osdmap, one epoch at a time, up to osd_epoch.
//
// Along the way this detects pool pg_num changes and handles them:
//  - if the PG becomes a merge *source* it is torn down, parked in
//    merge_waiters for its target, and false is returned (the pg has
//    been unlocked);
//  - if it is a merge *target* it absorbs its sources once they have
//    all arrived; otherwise it kicks them with null events and returns
//    false (again unlocked);
//  - if it is split, the children are created via split_pgs() and
//    registered through C_FinishSplits once the transaction applies.
//
// Must be called with pg locked. Returns true if the PG advanced all
// the way to osd_epoch (pg remains locked); returns false if the PG
// was consumed by a merge or must wait (pg has been unlocked).
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  // nothing to do if the PG is already at (or past) the target epoch
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  set<PGRef> new_pgs; // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map already trimmed from the cache; skip ahead
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  // flush any pending split children before tearing down
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // hand ourselves to the merge target; if we are the last
	  // source to arrive, wake the target with a null event
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the sources only if they have all checked in
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    // all sources present: absorb them now
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // normal per-epoch advance: recompute mappings and feed the map to
    // the PG's peering state machine
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is changed from set to unset or vice versa the
      // actual config is different. Keep it simple even if it is possible to
      // call resched_all_scrub() unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // register any split children we created along the way
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8835
// Publish the current (already committed) osdmap to the rest of the
// OSD: prime pending splits and merges on the shards, push the new map
// down to every PG via null peering events, drop stale pending creates,
// and refresh the PG state perfcounters. Called with osd_lock held.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(20) << __func__ << " version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }
  // pre_publish before await/publish: ordering matters so that map
  // reservations taken against the previous map are drained first
  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);
  dout(20) << "consume_map " << osdmap->get_epoch() << " -- publish done" << dendl;
  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // each shard consumes the entries it owns; all must be claimed
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge. An OSD restart
  // would clear it up. This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing. do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending creates for PGs that no longer map to us
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8952
8953 void OSD::activate_map()
8954 {
8955 ceph_assert(ceph_mutex_is_locked(osd_lock));
8956 auto osdmap = get_osdmap();
8957
8958 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8959
8960 // norecover?
8961 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8962 if (!service.recovery_is_paused()) {
8963 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8964 service.pause_recovery();
8965 }
8966 } else {
8967 if (service.recovery_is_paused()) {
8968 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8969 service.unpause_recovery();
8970 }
8971 }
8972
8973 service.activate_map();
8974 }
8975
8976 bool OSD::require_mon_peer(const Message *m)
8977 {
8978 if (!m->get_connection()->peer_is_mon()) {
8979 dout(0) << "require_mon_peer received from non-mon "
8980 << m->get_connection()->get_peer_addr()
8981 << " " << *m << dendl;
8982 return false;
8983 }
8984 return true;
8985 }
8986
8987 bool OSD::require_mon_or_mgr_peer(const Message *m)
8988 {
8989 if (!m->get_connection()->peer_is_mon() &&
8990 !m->get_connection()->peer_is_mgr()) {
8991 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8992 << m->get_connection()->get_peer_addr()
8993 << " " << *m << dendl;
8994 return false;
8995 }
8996 return true;
8997 }
8998
8999 bool OSD::require_osd_peer(const Message *m)
9000 {
9001 if (!m->get_connection()->peer_is_osd()) {
9002 dout(0) << "require_osd_peer received from non-osd "
9003 << m->get_connection()->get_peer_addr()
9004 << " " << *m << dendl;
9005 return false;
9006 }
9007 return true;
9008 }
9009
9010 // ----------------------------------------
9011 // pg creation
9012
// Split 'parent' into the child PGs dictated by the pg_num increase in
// 'nextmap'.  Each child is created, locked, added to *out_pgs, and has its
// on-disk collection split off the parent's inside rctx.transaction.
// Callers hold the parent locked; children are unlocked before return.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // start_split_stats() apportions the parent's object stats across the
  // children; the final entry (consumed after the loop) is what remains
  // with the parent itself.
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // Route the child's commit completions to the shard that owns it.
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    // Move the child's objects out of the parent's on-disk collection ...
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pgpool().info,
      rctx.transaction);
    // ... and split the in-memory PG state to match.
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // The last stats entry belongs to the (now smaller) parent.
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
9066
9067 // ----------------------------------------
9068 // peering and recovery
9069
// Deliver the side effects accumulated in a PeeringCtx: send the queued
// peering messages to peer OSDs and queue the accumulated transaction on
// the PG's sequencer.  Messages are only sent when this OSD is up and
// active; the transaction is queued regardless (if non-empty and pg given).
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    // We are marked down; peers would reject or ignore our messages.
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // Make sure the peer has a map at least as new as ours before the
      // messages below arrive.
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      // Clear the per-peer list so messages are not re-sent if the ctx is
      // dispatched again.
      ls.clear();
    }
  }
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9105
// Fast-dispatch handler for monitor-initiated PG creations.  For each PG
// in the message we queue a null peering event carrying a PGCreateInfo,
// which instantiates the PG if it does not already exist.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      // history/past_intervals are mandatory in post-octopus messages;
      // log and skip rather than create a PG with no metadata.
      clog->error() << __func__ << " " << pgid << " e" << created
		    << "@" << created_stamp << " with no history or past_intervals"
		    << ", this should be impossible after octopus. Ignoring.";
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " history " << q->second.first
	       << " pi " << q->second.second << dendl;
      if (!q->second.second.empty() &&
	  m->epoch < q->second.second.get_bounds().second) {
	// The provided past_intervals extend beyond the message's epoch;
	// refuse rather than create a PG with inconsistent metadata.
	clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
		      << " and unmatched past_intervals " << q->second.second
		      << " (history " << q->second.first << ")";
      } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      m->epoch,
	      m->epoch,
	      NullEvt(),
	      true,
	      new PGCreateInfo(
		pgid,
		m->epoch,
		q->second.first,
		q->second.second,
		true)
	      )));
      }
    }
  }

  {
    // Record progress: all mon-driven creates up to m->epoch are handled,
    // unless osd-driven creates are still outstanding.
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9161
9162 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9163 {
9164 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9165 if (!require_osd_peer(m)) {
9166 m->put();
9167 return;
9168 }
9169 int from = m->get_source().num();
9170 for (auto& p : m->get_pg_list()) {
9171 spg_t pgid(p.info.pgid.pgid, p.to);
9172 enqueue_peering_evt(
9173 pgid,
9174 PGPeeringEventRef(
9175 std::make_shared<PGPeeringEvent>(
9176 p.epoch_sent,
9177 p.query_epoch,
9178 MNotifyRec(
9179 pgid, pg_shard_t(from, p.from),
9180 p,
9181 m->get_connection()->get_features()),
9182 true,
9183 new PGCreateInfo(
9184 pgid,
9185 p.query_epoch,
9186 p.info.history,
9187 p.past_intervals,
9188 false)
9189 )));
9190 }
9191 m->put();
9192 }
9193
9194 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9195 {
9196 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9197 if (!require_osd_peer(m)) {
9198 m->put();
9199 return;
9200 }
9201 int from = m->get_source().num();
9202 for (auto& p : m->pg_list) {
9203 enqueue_peering_evt(
9204 spg_t(p.info.pgid.pgid, p.to),
9205 PGPeeringEventRef(
9206 std::make_shared<PGPeeringEvent>(
9207 p.epoch_sent, p.query_epoch,
9208 MInfoRec(
9209 pg_shard_t(from, p.from),
9210 p.info,
9211 p.epoch_sent)))
9212 );
9213 }
9214 m->put();
9215 }
9216
9217 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9218 {
9219 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9220 if (!require_osd_peer(m)) {
9221 m->put();
9222 return;
9223 }
9224 for (auto& pgid : m->pg_list) {
9225 enqueue_peering_evt(
9226 pgid,
9227 PGPeeringEventRef(
9228 std::make_shared<PGPeeringEvent>(
9229 m->get_epoch(), m->get_epoch(),
9230 PeeringState::DeleteStart())));
9231 }
9232 m->put();
9233 }
9234
9235 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9236 {
9237 dout(10) << __func__ << " " << *m << dendl;
9238 if (!require_mon_or_mgr_peer(m)) {
9239 m->put();
9240 return;
9241 }
9242 epoch_t epoch = get_osdmap_epoch();
9243 for (auto pgid : m->forced_pgs) {
9244 if (m->options & OFR_BACKFILL) {
9245 if (m->options & OFR_CANCEL) {
9246 enqueue_peering_evt(
9247 pgid,
9248 PGPeeringEventRef(
9249 std::make_shared<PGPeeringEvent>(
9250 epoch, epoch,
9251 PeeringState::UnsetForceBackfill())));
9252 } else {
9253 enqueue_peering_evt(
9254 pgid,
9255 PGPeeringEventRef(
9256 std::make_shared<PGPeeringEvent>(
9257 epoch, epoch,
9258 PeeringState::SetForceBackfill())));
9259 }
9260 } else if (m->options & OFR_RECOVERY) {
9261 if (m->options & OFR_CANCEL) {
9262 enqueue_peering_evt(
9263 pgid,
9264 PGPeeringEventRef(
9265 std::make_shared<PGPeeringEvent>(
9266 epoch, epoch,
9267 PeeringState::UnsetForceRecovery())));
9268 } else {
9269 enqueue_peering_evt(
9270 pgid,
9271 PGPeeringEventRef(
9272 std::make_shared<PGPeeringEvent>(
9273 epoch, epoch,
9274 PeeringState::SetForceRecovery())));
9275 }
9276 }
9277 }
9278 m->put();
9279 }
9280
// Answer a peer's query about a PG we do not have instantiated: reply with
// an empty info (or empty log for LOG/FULLLOG queries) so the querying OSD
// can make progress with its peering.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;  // pool is gone; nothing sensible to reply

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // Log queries are answered with an (empty) MOSDPGLog.
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // All other query types are answered with an (empty) notify.
      pg_notify_t notify{q.query.from, q.query.to,
			 q.query.epoch_sent,
			 osdmap->get_epoch(),
			 empty,
			 PastIntervals()};
      m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
			    std::move(notify));
    }
    // Ensure the peer's map is fresh enough to interpret the reply.
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9314
9315 void OSDService::queue_check_readable(spg_t spgid,
9316 epoch_t lpr,
9317 ceph::signedspan delay)
9318 {
9319 if (delay == ceph::signedspan::zero()) {
9320 osd->enqueue_peering_evt(
9321 spgid,
9322 PGPeeringEventRef(
9323 std::make_shared<PGPeeringEvent>(
9324 lpr, lpr,
9325 PeeringState::CheckReadable())));
9326 } else {
9327 mono_timer.add_event(
9328 delay,
9329 [this, spgid, lpr]() {
9330 queue_check_readable(spgid, lpr);
9331 });
9332 }
9333 }
9334
9335
9336 // =========================================================
9337 // RECOVERY
9338
9339 void OSDService::_maybe_queue_recovery() {
9340 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9341 uint64_t available_pushes;
9342 while (!awaiting_throttle.empty() &&
9343 _recover_now(&available_pushes)) {
9344 uint64_t to_start = std::min(
9345 available_pushes,
9346 cct->_conf->osd_recovery_max_single_start);
9347 _queue_for_recovery(awaiting_throttle.front(), to_start);
9348 awaiting_throttle.pop_front();
9349 dout(10) << __func__ << " starting " << to_start
9350 << ", recovery_ops_reserved " << recovery_ops_reserved
9351 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9352 recovery_ops_reserved += to_start;
9353 }
9354 }
9355
9356 bool OSDService::_recover_now(uint64_t *available_pushes)
9357 {
9358 if (available_pushes)
9359 *available_pushes = 0;
9360
9361 if (ceph_clock_now() < defer_recovery_until) {
9362 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9363 return false;
9364 }
9365
9366 if (recovery_paused) {
9367 dout(15) << __func__ << " paused" << dendl;
9368 return false;
9369 }
9370
9371 uint64_t max = osd->get_recovery_max_active();
9372 if (max <= recovery_ops_active + recovery_ops_reserved) {
9373 dout(15) << __func__ << " active " << recovery_ops_active
9374 << " + reserved " << recovery_ops_reserved
9375 << " >= max " << max << dendl;
9376 return false;
9377 }
9378
9379 if (available_pushes)
9380 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9381
9382 return true;
9383 }
9384
9385 unsigned OSDService::get_target_pg_log_entries() const
9386 {
9387 auto num_pgs = osd->get_num_pgs();
9388 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9389 if (num_pgs > 0 && target > 0) {
9390 // target an even spread of our budgeted log entries across all
9391 // PGs. note that while we only get to control the entry count
9392 // for primary PGs, we'll normally be responsible for a mix of
9393 // primary and replica PGs (for the same pool(s) even), so this
9394 // will work out.
9395 return std::max<unsigned>(
9396 std::min<unsigned>(target / num_pgs,
9397 cct->_conf->osd_max_pg_log_entries),
9398 cct->_conf->osd_min_pg_log_entries);
9399 } else {
9400 // fall back to a per-pg value.
9401 return cct->_conf->osd_min_pg_log_entries;
9402 }
9403 }
9404
// Run up to 'reserved_pushes' recovery operations on 'pg' (queued at epoch
// 'queued').  May instead re-schedule itself after osd_recovery_sleep.
// Reserved pushes are always returned to the service before returning
// (except on the sleep path, which re-queues the same reservation).
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes, int priority,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // Callback fired by sleep_timer; re-queues this same work item.
      auto recovery_requeue_callback = new LambdaContext(
	[this, pgref, queued, reserved_pushes, priority](int r) {
	  dout(20) << "do_recovery wake up at "
		   << ceph_clock_now()
		   << ", re-queuing recovery" << dendl;
	  std::lock_guard l(service.sleep_lock);
	  service.recovery_needs_sleep = false;
	  service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes, priority);
	});

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
	  service.recovery_schedule_time < now) {
	service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
	       << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    {
      // The next do_recovery pass must sleep again (unless the timer
      // callback above clears the flag first).
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    if (pg->pg_has_reset_since(queued)) {
      // The PG went through an interval change after this work was queued;
      // the event is stale.  Still release our reservations below.
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    if (do_unfound) {
      // Recovery is blocked on unfound objects; kick the peering machinery
      // to search peers for them.
      PeeringCtx rctx;
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  // Return the push reservations whether or not any pushes started.
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9481
9482 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9483 {
9484 std::lock_guard l(recovery_lock);
9485 dout(10) << "start_recovery_op " << *pg << " " << soid
9486 << " (" << recovery_ops_active << "/"
9487 << osd->get_recovery_max_active() << " rops)"
9488 << dendl;
9489 recovery_ops_active++;
9490
9491 #ifdef DEBUG_RECOVERY_OIDS
9492 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9493 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9494 recovery_oids[pg->pg_id].insert(soid);
9495 #endif
9496 }
9497
9498 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9499 {
9500 std::lock_guard l(recovery_lock);
9501 dout(10) << "finish_recovery_op " << *pg << " " << soid
9502 << " dequeue=" << dequeue
9503 << " (" << recovery_ops_active << "/"
9504 << osd->get_recovery_max_active() << " rops)"
9505 << dendl;
9506
9507 // adjust count
9508 ceph_assert(recovery_ops_active > 0);
9509 recovery_ops_active--;
9510
9511 #ifdef DEBUG_RECOVERY_OIDS
9512 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9513 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9514 recovery_oids[pg->pg_id].erase(soid);
9515 #endif
9516
9517 _maybe_queue_recovery();
9518 }
9519
9520 bool OSDService::is_recovery_active()
9521 {
9522 if (cct->_conf->osd_debug_pretend_recovery_active) {
9523 return true;
9524 }
9525 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9526 }
9527
9528 void OSDService::release_reserved_pushes(uint64_t pushes)
9529 {
9530 std::lock_guard l(recovery_lock);
9531 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9532 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9533 << dendl;
9534 ceph_assert(recovery_ops_reserved >= pushes);
9535 recovery_ops_reserved -= pushes;
9536 _maybe_queue_recovery();
9537 }
9538
9539 // =========================================================
9540 // OPS
9541
9542 bool OSD::op_is_discardable(const MOSDOp *op)
9543 {
9544 // drop client request if they are not connected and can't get the
9545 // reply anyway.
9546 if (!op->get_connection()->is_connected()) {
9547 return true;
9548 }
9549 return false;
9550 }
9551
// Queue an incoming op on the sharded op work queue, tagged with the PG it
// targets and the epoch it was mapped in.  Recovery-related messages are
// wrapped in PGRecoveryMsg so the scheduler can treat them separately from
// client ops (PGOpItem).
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // Time from message receive to enqueue, fed to perf counters below.
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();
  const int type = op->get_req()->get_type();

  // NOTE(review): the request is streamed twice in this line (start and
  // end); likely redundant but harmless debug output.
  dout(15) << "enqueue_op " << *op->get_req() << " prio " << priority
	   << " type " << type
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);

  // Jaeger/OpenTelemetry span for the enqueue step.
  auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
  enqueue_span->AddEvent(__func__, {
    {"priority", priority},
    {"cost", cost},
    {"epoch", epoch},
    {"owner", owner},
    {"type", type}
  });

  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  if (PGRecoveryMsg::is_recovery_msg(op)) {
    op_shardedwq.queue(
      OpSchedulerItem(
	unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
	cost, priority, stamp, owner, epoch));
  } else {
    op_shardedwq.queue(
      OpSchedulerItem(
	unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
	cost, priority, stamp, owner, epoch));
  }
}
9594
9595 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9596 {
9597 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9598 op_shardedwq.queue(
9599 OpSchedulerItem(
9600 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9601 10,
9602 cct->_conf->osd_peering_op_priority,
9603 utime_t(),
9604 0,
9605 evt->get_epoch_sent()));
9606 }
9607
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Deliver a dequeued op to its PG: record timing, opportunistically share a
// newer osdmap with the sender, then run the request via PG::do_request.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  // Time spent between arrival and dequeue, for the perf counter below.
  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << *op->get_req()
	   << " prio " << m->get_priority()
	   << " cost " << m->get_cost()
	   << " latency " << latency
	   << " " << *m
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // Push a newer osdmap to the sender if it appears to be behind.
  service.maybe_share_map(m->get_connection().get(),
			  pg->get_osdmap(),
			  op->sent_epoch);

  if (pg->is_deleting())
    return;  // PG is being removed; drop the op

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << *op->get_req() << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9649
9650
// Process one peering event for a PG (or a pg-less query).  Runs in a shard
// worker thread; when 'pg' is non-null it arrives locked and is unlocked
// here before any follow-up work (up_thru request, pg_temp send).
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    // Only MQuery events are meaningful without an instantiated PG.
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (PeeringCtx rctx;
	     advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // Snapshot these while still holding the pg lock; they are consumed
    // after the unlock below.
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  // Flush any pg_temp requests the event may have generated.
  service.send_pg_temp();
}
9686
9687 void OSD::dequeue_delete(
9688 OSDShard *sdata,
9689 PG *pg,
9690 epoch_t e,
9691 ThreadPool::TPHandle& handle)
9692 {
9693 dequeue_peering_evt(
9694 sdata,
9695 pg,
9696 PGPeeringEventRef(
9697 std::make_shared<PGPeeringEvent>(
9698 e, e,
9699 PeeringState::DeleteSome())),
9700 handle);
9701 }
9702
9703
9704
9705 // --------------------------------
9706
// Config observer interface: the list of config keys whose changes this
// daemon wants delivered to handle_conf_change().  NULL-terminated.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    // recovery / backfill throttling
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    // op tracker
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    // osdmap caching
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // sleep knobs (overridable by mclock QoS)
    "osd_recovery_sleep",
    "osd_recovery_sleep_hdd",
    "osd_recovery_sleep_ssd",
    "osd_recovery_sleep_hybrid",
    "osd_delete_sleep",
    "osd_delete_sleep_hdd",
    "osd_delete_sleep_ssd",
    "osd_delete_sleep_hybrid",
    "osd_snap_trim_sleep",
    "osd_snap_trim_sleep_hdd",
    "osd_snap_trim_sleep_ssd",
    "osd_snap_trim_sleep_hybrid",
    "osd_scrub_sleep",
    "osd_recovery_max_active",
    "osd_recovery_max_active_hdd",
    "osd_recovery_max_active_ssd",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    // misc throttles and intervals
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9762
// Config observer callback: apply runtime changes for the keys declared in
// get_tracked_conf_keys().  Runs with osd_lock held for the duration.
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};

  if (changed.count("osd_max_backfills") ||
      changed.count("osd_recovery_max_active") ||
      changed.count("osd_recovery_max_active_hdd") ||
      changed.count("osd_recovery_max_active_ssd")) {
    if (!maybe_override_options_for_qos(&changed) &&
        changed.count("osd_max_backfills")) {
      // Scheduler is not "mclock". Fallback to earlier behavior
      service.local_reserver.set_max(cct->_conf->osd_max_backfills);
      service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
    }
  }
  if (changed.count("osd_delete_sleep") ||
      changed.count("osd_delete_sleep_hdd") ||
      changed.count("osd_delete_sleep_ssd") ||
      changed.count("osd_delete_sleep_hybrid") ||
      changed.count("osd_snap_trim_sleep") ||
      changed.count("osd_snap_trim_sleep_hdd") ||
      changed.count("osd_snap_trim_sleep_ssd") ||
      changed.count("osd_snap_trim_sleep_hybrid") ||
      changed.count("osd_scrub_sleep") ||
      changed.count("osd_recovery_sleep") ||
      changed.count("osd_recovery_sleep_hdd") ||
      changed.count("osd_recovery_sleep_ssd") ||
      changed.count("osd_recovery_sleep_hybrid")) {
    // Sleep knobs may be overridden when the mclock scheduler is active.
    maybe_override_sleep_options_for_qos();
  }
  if (changed.count("osd_pg_delete_cost")) {
    maybe_override_cost_for_qos();
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // Op-tracker tunables.
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // One size setting drives all three osdmap caches.
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // Client message throttles: adjust the existing policy throttlers in
  // place; newly accepted connections pick up the new values too.
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
  if (changed.count("osd_asio_thread_count")) {
    // Restart the asio pool with the new thread count.
    service.poolctx.stop();
    service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
  }
}
9885
9886 void OSD::maybe_override_max_osd_capacity_for_qos()
9887 {
9888 // If the scheduler enabled is mclock, override the default
9889 // osd capacity with the value obtained from running the
9890 // osd bench test. This is later used to setup mclock.
9891 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
9892 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
9893 (!unsupported_objstore_for_qos())) {
9894 std::string max_capacity_iops_config;
9895 bool force_run_benchmark =
9896 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
9897
9898 if (store_is_rotational) {
9899 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
9900 } else {
9901 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
9902 }
9903
9904 double default_iops = 0.0;
9905 double cur_iops = 0.0;
9906 if (!force_run_benchmark) {
9907 // Get the current osd iops capacity
9908 cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
9909
9910 // Get the default max iops capacity
9911 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
9912 if (!val.has_value()) {
9913 derr << __func__ << " Unable to determine default value of "
9914 << max_capacity_iops_config << dendl;
9915 // Cannot determine default iops. Force a run of the OSD benchmark.
9916 force_run_benchmark = true;
9917 } else {
9918 // Default iops
9919 default_iops = std::stod(val.value());
9920 }
9921
9922 // Determine if we really need to run the osd benchmark
9923 if (!force_run_benchmark && (default_iops != cur_iops)) {
9924 dout(1) << __func__ << std::fixed << std::setprecision(2)
9925 << " default_iops: " << default_iops
9926 << " cur_iops: " << cur_iops
9927 << ". Skip OSD benchmark test." << dendl;
9928 return;
9929 }
9930 }
9931
9932 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
9933 int64_t count = 12288000; // Count of bytes to write
9934 int64_t bsize = 4096; // Block size
9935 int64_t osize = 4194304; // Object size
9936 int64_t onum = 100; // Count of objects to write
9937 double elapsed = 0.0; // Time taken to complete the test
9938 double iops = 0.0;
9939 stringstream ss;
9940 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
9941 if (ret != 0) {
9942 derr << __func__
9943 << " osd bench err: " << ret
9944 << " osd bench errstr: " << ss.str()
9945 << dendl;
9946 return;
9947 }
9948
9949 double rate = count / elapsed;
9950 iops = rate / bsize;
9951 dout(1) << __func__
9952 << " osd bench result -"
9953 << std::fixed << std::setprecision(3)
9954 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
9955 << " iops: " << iops
9956 << " elapsed_sec: " << elapsed
9957 << dendl;
9958
9959 // Get the threshold IOPS set for the underlying hdd/ssd.
9960 double threshold_iops = 0.0;
9961 if (store_is_rotational) {
9962 threshold_iops = cct->_conf.get_val<double>(
9963 "osd_mclock_iops_capacity_threshold_hdd");
9964 } else {
9965 threshold_iops = cct->_conf.get_val<double>(
9966 "osd_mclock_iops_capacity_threshold_ssd");
9967 }
9968
9969 // Persist the iops value to the MON store or throw cluster warning
9970 // if the measured iops exceeds the set threshold. If the iops exceed
9971 // the threshold, the default value is used.
9972 if (iops > threshold_iops) {
9973 clog->warn() << "OSD bench result of " << std::to_string(iops)
9974 << " IOPS exceeded the threshold limit of "
9975 << std::to_string(threshold_iops) << " IOPS for osd."
9976 << std::to_string(whoami) << ". IOPS capacity is unchanged"
9977 << " at " << std::to_string(cur_iops) << " IOPS. The"
9978 << " recommendation is to establish the osd's IOPS capacity"
9979 << " using other benchmark tools (e.g. Fio) and then"
9980 << " override osd_mclock_max_capacity_iops_[hdd|ssd].";
9981 } else {
9982 mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
9983 }
9984 }
9985 }
9986
9987 bool OSD::maybe_override_options_for_qos(const std::set<std::string> *changed)
9988 {
9989 // Override options only if the scheduler enabled is mclock and the
9990 // underlying objectstore is supported by mclock
9991 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
9992 !unsupported_objstore_for_qos()) {
9993 static const std::map<std::string, uint64_t> recovery_qos_defaults {
9994 {"osd_recovery_max_active", 0},
9995 {"osd_recovery_max_active_hdd", 3},
9996 {"osd_recovery_max_active_ssd", 10},
9997 {"osd_max_backfills", 1},
9998 };
9999
10000 // Check if we were called because of a configuration change
10001 if (changed != nullptr) {
10002 if (cct->_conf.get_val<bool>("osd_mclock_override_recovery_settings")) {
10003 if (changed->count("osd_max_backfills")) {
10004 dout(1) << __func__ << " Set local and remote max backfills to "
10005 << cct->_conf->osd_max_backfills << dendl;
10006 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
10007 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
10008 }
10009 } else {
10010 // Recovery options change was attempted without setting
10011 // the 'osd_mclock_override_recovery_settings' option.
10012 // Find the key to remove from the configuration db.
10013 std::string key;
10014 if (changed->count("osd_max_backfills")) {
10015 key = "osd_max_backfills";
10016 } else if (changed->count("osd_recovery_max_active")) {
10017 key = "osd_recovery_max_active";
10018 } else if (changed->count("osd_recovery_max_active_hdd")) {
10019 key = "osd_recovery_max_active_hdd";
10020 } else if (changed->count("osd_recovery_max_active_ssd")) {
10021 key = "osd_recovery_max_active_ssd";
10022 } else {
10023 // No key that we are interested in. Return.
10024 return true;
10025 }
10026
10027 // Remove the current entry from the configuration if
10028 // different from its default value.
10029 auto val = recovery_qos_defaults.find(key);
10030 if (val != recovery_qos_defaults.end() &&
10031 cct->_conf.get_val<uint64_t>(key) != val->second) {
10032 static const std::vector<std::string> osds = {
10033 "osd",
10034 "osd." + std::to_string(whoami)
10035 };
10036
10037 for (auto osd : osds) {
10038 std::string cmd =
10039 "{"
10040 "\"prefix\": \"config rm\", "
10041 "\"who\": \"" + osd + "\", "
10042 "\"name\": \"" + key + "\""
10043 "}";
10044 vector<std::string> vcmd{cmd};
10045
10046 dout(1) << __func__ << " Removing Key: " << key
10047 << " for " << osd << " from Mon db" << dendl;
10048 monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
10049 }
10050
10051 // Raise a cluster warning indicating that the changes did not
10052 // take effect and indicate the reason why.
10053 clog->warn() << "Change to " << key << " on osd."
10054 << std::to_string(whoami) << " did not take effect."
10055 << " Enable osd_mclock_override_recovery_settings before"
10056 << " setting this option.";
10057 }
10058 }
10059 } else { // if (changed != nullptr) (osd boot-up)
10060 /**
10061 * This section is executed only during osd boot-up.
10062 * Override the default recovery max active (hdd & ssd) and max backfills
10063 * config options to either the mClock defaults or retain their respective
10064 * overridden values before the osd was restarted.
10065 */
10066 for (auto opt : recovery_qos_defaults) {
10067 /**
10068 * Note: set_val_default doesn't overwrite an option if it was earlier
10069 * set at a config level greater than CONF_DEFAULT. It doesn't return
10070 * a status. With get_val(), the config subsystem is guaranteed to
10071 * either return the overridden value (if any) or the default value.
10072 */
10073 cct->_conf.set_val_default(opt.first, std::to_string(opt.second));
10074 auto opt_val = cct->_conf.get_val<uint64_t>(opt.first);
10075 dout(1) << __func__ << " "
10076 << opt.first << " set to " << opt_val
10077 << dendl;
10078 if (opt.first == "osd_max_backfills") {
10079 service.local_reserver.set_max(opt_val);
10080 service.remote_reserver.set_max(opt_val);
10081 }
10082 }
10083 }
10084 return true;
10085 }
10086 return false;
10087 }
10088
10089 void OSD::maybe_override_sleep_options_for_qos()
10090 {
10091 // Override options only if the scheduler enabled is mclock and the
10092 // underlying objectstore is supported by mclock
10093 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10094 !unsupported_objstore_for_qos()) {
10095
10096 // Override the various sleep settings
10097 // Disable recovery sleep
10098 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10099 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10100 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10101 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10102
10103 // Disable delete sleep
10104 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10105 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10106 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10107 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10108
10109 // Disable snap trim sleep
10110 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10111 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10112 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10113 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10114
10115 // Disable scrub sleep
10116 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10117 }
10118 }
10119
10120 void OSD::maybe_override_cost_for_qos()
10121 {
10122 // If the scheduler enabled is mclock, override the default PG deletion cost
10123 // so that mclock can meet the QoS goals.
10124 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10125 !unsupported_objstore_for_qos()) {
10126 uint64_t pg_delete_cost = 15728640;
10127 cct->_conf.set_val("osd_pg_delete_cost", std::to_string(pg_delete_cost));
10128 }
10129 }
10130
10131 /**
10132 * A context for receiving status from a background mon command to set
10133 * a config option and optionally apply the changes on each op shard.
10134 */
10135 class MonCmdSetConfigOnFinish : public Context {
10136 OSD *osd;
10137 CephContext *cct;
10138 std::string key;
10139 std::string val;
10140 bool update_shard;
10141 public:
10142 explicit MonCmdSetConfigOnFinish(
10143 OSD *o,
10144 CephContext *cct,
10145 const std::string &k,
10146 const std::string &v,
10147 const bool s)
10148 : osd(o), cct(cct), key(k), val(v), update_shard(s) {}
10149 void finish(int r) override {
10150 if (r != 0) {
10151 // Fallback to setting the config within the in-memory "values" map.
10152 cct->_conf.set_val_default(key, val);
10153 }
10154
10155 // If requested, apply this option on the
10156 // active scheduler of each op shard.
10157 if (update_shard) {
10158 for (auto& shard : osd->shards) {
10159 shard->update_scheduler_config();
10160 }
10161 }
10162 }
10163 };
10164
10165 void OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10166 {
10167 std::string cmd =
10168 "{"
10169 "\"prefix\": \"config set\", "
10170 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10171 "\"name\": \"" + key + "\", "
10172 "\"value\": \"" + val + "\""
10173 "}";
10174 vector<std::string> vcmd{cmd};
10175
10176 // List of config options to be distributed across each op shard.
10177 // Currently limited to a couple of mClock options.
10178 static const std::vector<std::string> shard_option =
10179 { "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd" };
10180 const bool update_shard = std::find(shard_option.begin(),
10181 shard_option.end(),
10182 key) != shard_option.end();
10183
10184 auto on_finish = new MonCmdSetConfigOnFinish(this, cct, key,
10185 val, update_shard);
10186 dout(10) << __func__ << " Set " << key << " = " << val << dendl;
10187 monc->start_mon_command(vcmd, {}, nullptr, nullptr, on_finish);
10188 }
10189
10190 bool OSD::unsupported_objstore_for_qos()
10191 {
10192 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10193 return std::find(unsupported_objstores.begin(),
10194 unsupported_objstores.end(),
10195 store->get_type()) != unsupported_objstores.end();
10196 }
10197
// Re-parse the cluster-log client options from the current config and
// report the resulting log_to_monitors setting (derr so it is always
// visible regardless of debug level).
void OSD::update_log_config()
{
  auto parsed_options = clog->parse_client_options(cct);
  derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
}
10203
10204 void OSD::check_config()
10205 {
10206 // some sanity checks
10207 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10208 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10209 << " is not > osd_pg_epoch_persisted_max_stale ("
10210 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10211 }
10212 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10213 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10214 << cct->_conf->osd_object_clean_region_max_num_intervals
10215 << ") is < 0";
10216 }
10217 }
10218
10219 // --------------------------------
10220
// Block until the objecter has fetched the latest OSDMap from the
// monitors.  Uses the blocking async completion adapter; any error code
// is captured in 'ec' and intentionally ignored (best-effort refresh).
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  boost::system::error_code ec;
  service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);

  dout(10) << __func__ << " -- finish" << dendl;
}
10230
10231 // --------------------------------
10232
10233 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10234 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10235 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10236 dout(10) << "setting " << queries.size() << " queries" << dendl;
10237
10238 std::list<OSDPerfMetricQuery> supported_queries;
10239 for (auto &it : queries) {
10240 auto &query = it.first;
10241 if (!query.key_descriptor.empty()) {
10242 supported_queries.push_back(query);
10243 }
10244 }
10245 if (supported_queries.size() < queries.size()) {
10246 dout(1) << queries.size() - supported_queries.size()
10247 << " unsupported queries" << dendl;
10248 }
10249 {
10250 std::lock_guard locker{m_perf_queries_lock};
10251 m_perf_queries = supported_queries;
10252 m_perf_limits = queries;
10253 }
10254 std::vector<PGRef> pgs;
10255 _get_pgs(&pgs);
10256 for (auto& pg : pgs) {
10257 std::scoped_lock l{*pg};
10258 pg->set_dynamic_perf_stats_queries(supported_queries);
10259 }
10260 }
10261
10262 MetricPayload OSD::get_perf_reports() {
10263 OSDMetricPayload payload;
10264 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10265
10266 std::vector<PGRef> pgs;
10267 _get_pgs(&pgs);
10268 DynamicPerfStats dps;
10269 for (auto& pg : pgs) {
10270 // m_perf_queries can be modified only in set_perf_queries by mgr client
10271 // request, and it is protected by by mgr client's lock, which is held
10272 // when set_perf_queries/get_perf_reports are called, so we may not hold
10273 // m_perf_queries_lock here.
10274 DynamicPerfStats pg_dps(m_perf_queries);
10275 pg->lock();
10276 pg->get_dynamic_perf_stats(&pg_dps);
10277 pg->unlock();
10278 dps.merge(pg_dps);
10279 }
10280 dps.add_to_reports(m_perf_limits, &reports);
10281 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10282
10283 return payload;
10284 }
10285
10286 // =============================================================
10287
10288 #undef dout_context
10289 #define dout_context cct
10290 #undef dout_prefix
10291 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10292
// Bind 'pg' into 'slot' (callers in this file hold shard_lock): wire the
// slot<->pg back-pointers, bump the OSD-wide pg count, and index the slot
// by the PG's current map epoch so per-shard min-epoch queries see it.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10304
// Undo _attach_pg(): clear the slot<->pg links, drop the OSD-wide pg
// count, and remove the slot from the by-epoch index.  Waiters in
// wait_min_pg_epoch() are woken because the shard's minimum pg epoch may
// have changed (or the set may now be empty).
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10319
// Move 'slot' to epoch 'e' in the by-epoch index: the intrusive set keys
// on slot->epoch, so the slot must be erased before the epoch is changed
// and re-inserted afterwards.  Wakes wait_min_pg_epoch() waiters since
// the shard's minimum epoch may have advanced.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10335
10336 epoch_t OSDShard::get_min_pg_epoch()
10337 {
10338 std::lock_guard l(shard_lock);
10339 auto p = pg_slots_by_epoch.begin();
10340 if (p == pg_slots_by_epoch.end()) {
10341 return 0;
10342 }
10343 return p->epoch;
10344 }
10345
// Block until every PG on this shard has advanced to at least epoch
// 'need' (or the shard has no PGs).  waiting_for_min_pg_epoch counts
// blocked callers so that _detach_pg()/update_pg_epoch() know to signal
// min_pg_epoch_cond when the minimum may have changed.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // the set is epoch-ordered; the first entry is the minimum
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10363
10364 epoch_t OSDShard::get_max_waiting_epoch()
10365 {
10366 std::lock_guard l(shard_lock);
10367 epoch_t r = 0;
10368 for (auto& i : pg_slots) {
10369 if (!i.second->waiting_peering.empty()) {
10370 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10371 }
10372 }
10373 return r;
10374 }
10375
// Install 'new_osdmap' as this shard's map and sweep every pg slot:
//  - slots waiting on a split or on a future merge epoch are left alone;
//  - queued peering events whose required epoch is now covered are
//    requeued into the scheduler;
//  - ordinary waiting items for pgs that no longer map to this OSD are
//    dropped, returning their reserved recovery pushes via
//    *pushes_to_free;
//  - slots that end up completely idle are pruned.
// Worker threads are notified at the end if anything was requeued.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // shard_osdmap is read under osdmap_lock elsewhere, so swap it in
    // under that lock too.
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  int queued = 0;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // keep this slot untouched until the split child registers
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge target epoch not reached yet; keep waiting
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// at least one peering event can now run; requeue the slot's work
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	queued += _wake_pg_slot(pgid, slot);
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop any waiting items covered by the new
      // map and release the recovery pushes they had reserved.
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing pending and no pg attached: the slot can go
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    std::lock_guard l{sdata_wait_lock};
    if (queued == 1)
      sdata_cond.notify_one();
    else
      sdata_cond.notify_all();
  }
}
10463
// Requeue everything parked on 'slot' (to_process, waiting, and all
// waiting_peering buckets) back into the scheduler, and bump the slot's
// requeue_seq so in-flight dequeues notice the slot changed under them.
// Each list is walked in reverse because enqueue_front() is used: the
// last item pushed ends up first, preserving the original order.
// Returns the number of items requeued.
int OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  int count = 0;
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
    count++;
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
    count++;
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      scheduler->enqueue_front(std::move(*j));
      count++;
    }
  }
  slot->waiting_peering.clear();
  ++slot->requeue_seq;
  return count;
}
10502
// For every slot on this shard, compute the splits and merges implied by
// advancing from the shard's current osdmap to 'as_of_osdmap'.  Slots
// with an attached pg contribute to both sets; slots that are merely
// primed for a split contribute to splits only (a merge needs a real pg).
// Results are accumulated into *split_pgs / *merge_pgs.
void OSDShard::identify_splits_and_merges(
  const OSDMapRef& as_of_osdmap,
  set<pair<spg_t,epoch_t>> *split_pgs,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " " << pg_slots.size() << " slots" << dendl;
  if (shard_osdmap) {
    for (auto& i : pg_slots) {
      dout(20) << __func__ << " slot pgid:" << i.first << "slot:" << i.second.get() << dendl;
      const spg_t& pgid = i.first;
      auto *slot = i.second.get();
      if (slot->pg) {
	osd->service.identify_splits_and_merges(
	  shard_osdmap, as_of_osdmap, pgid,
	  split_pgs, merge_pgs);
      } else if (!slot->waiting_for_split.empty()) {
	// primed-for-split slot: only further splits are relevant
	osd->service.identify_splits_and_merges(
	  shard_osdmap, as_of_osdmap, pgid,
	  split_pgs, nullptr);
      } else {
	dout(20) << __func__ << " slot " << pgid
		 << " has no pg and waiting_for_split " << dendl;
      }
    }
  }
  dout(20) << __func__ << " " << split_pgs->size() << " splits, "
	   << merge_pgs->size() << " merges" << dendl;
}
10532
// Prime slots for upcoming split children in *pgids.  If this shard's
// osdmap is already newer than 'as_of_osdmap', also compute and prime any
// grandchildren implied by the additional epochs, so no child is missed.
// _prime_splits() consumes the entries that hash to this shard and leaves
// the rest in the set for other shards.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10560
// Consume from *pgids the (pgid, epoch) entries whose pgid hashes to this
// shard, creating (or reusing) a slot for each and recording the split
// epoch in waiting_for_split.  Entries for other shards are left in the
// set for their owners to process.  Caller holds shard_lock.
void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
{
  dout(10) << *pgids << dendl;
  auto p = pgids->begin();
  while (p != pgids->end()) {
    unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
    if (shard_index == shard_id) {
      auto r = pg_slots.emplace(p->first, nullptr);
      if (r.second) {
	dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
	r.first->second = make_unique<OSDShardPGSlot>();
	r.first->second->waiting_for_split.insert(p->second);
      } else {
	// slot already exists (e.g. already primed); just add the epoch
	auto q = r.first;
	ceph_assert(q != pg_slots.end());
	dout(10) << "priming (existing) slot " << p->first << " e" << p->second
		 << dendl;
	q->second->waiting_for_split.insert(p->second);
      }
      p = pgids->erase(p);
    } else {
      ++p;
    }
  }
}
10586
// Consume from *merge_pgs the (pgid, epoch) entries whose pgid hashes to
// this shard and mark their slots as merge participants (via
// waiting_for_merge_epoch).  A participant that has no pg and no pending
// earlier split gets an empty placeholder pg created so the merge has
// something to fold into; PG::merge_from() later fills in its history.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // belongs to another shard; leave it for its owner
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split must complete before this merge; don't create a pg now
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10634
10635 void OSDShard::register_and_wake_split_child(PG *pg)
10636 {
10637 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
10638 epoch_t epoch;
10639 {
10640 std::lock_guard l(shard_lock);
10641 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
10642 auto p = pg_slots.find(pg->pg_id);
10643 ceph_assert(p != pg_slots.end());
10644 auto *slot = p->second.get();
10645 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10646 << slot->waiting_for_split << dendl;
10647 ceph_assert(!slot->pg);
10648 ceph_assert(!slot->waiting_for_split.empty());
10649 _attach_pg(slot, pg);
10650
10651 epoch = pg->get_osdmap_epoch();
10652 ceph_assert(slot->waiting_for_split.count(epoch));
10653 slot->waiting_for_split.erase(epoch);
10654 if (slot->waiting_for_split.empty()) {
10655 _wake_pg_slot(pg->pg_id, slot);
10656 } else {
10657 dout(10) << __func__ << " still waiting for split on "
10658 << slot->waiting_for_split << dendl;
10659 }
10660 }
10661
10662 // kick child to ensure it pulls up to the latest osdmap
10663 osd->enqueue_peering_evt(
10664 pg->pg_id,
10665 PGPeeringEventRef(
10666 std::make_shared<PGPeeringEvent>(
10667 epoch,
10668 epoch,
10669 NullEvt())));
10670
10671 std::lock_guard l{sdata_wait_lock};
10672 sdata_cond.notify_one();
10673 }
10674
10675 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10676 {
10677 std::lock_guard l(shard_lock);
10678 vector<spg_t> to_delete;
10679 for (auto& i : pg_slots) {
10680 if (i.first != parent &&
10681 i.first.get_ancestor(old_pg_num) == parent) {
10682 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10683 << dendl;
10684 _wake_pg_slot(i.first, i.second.get());
10685 to_delete.push_back(i.first);
10686 }
10687 }
10688 for (auto pgid : to_delete) {
10689 pg_slots.erase(pgid);
10690 }
10691 }
10692
// Ask this shard's op scheduler to re-read its configuration (used after
// config options it depends on change, e.g. the mClock capacity values).
void OSDShard::update_scheduler_config()
{
  scheduler->update_configuration();
}
10697
10698 std::string OSDShard::get_scheduler_type()
10699 {
10700 std::ostringstream scheduler_type;
10701 scheduler_type << *scheduler;
10702 return scheduler_type.str();
10703 }
10704
// Construct one op-worker shard: per-shard locks/condvars named after the
// shard id, an op scheduler instance chosen from config (mclock vs wpq,
// keyed on media type and objectstore backend), and a context queue tied
// to this shard's wait lock/cond for oncommit callbacks.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(
      cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
      osd->store->get_type(), osd->monc)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10725
10726
10727 // =============================================================
10728
10729 #undef dout_context
10730 #define dout_context osd->cct
10731 #undef dout_prefix
10732 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10733
10734 void OSD::ShardedOpWQ::_add_slot_waiter(
10735 spg_t pgid,
10736 OSDShardPGSlot *slot,
10737 OpSchedulerItem&& qi)
10738 {
10739 if (qi.is_peering()) {
10740 dout(20) << __func__ << " " << pgid
10741 << " peering, item epoch is "
10742 << qi.get_map_epoch()
10743 << ", will wait on " << qi << dendl;
10744 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10745 } else {
10746 dout(20) << __func__ << " " << pgid
10747 << " item epoch is "
10748 << qi.get_map_epoch()
10749 << ", will wait on " << qi << dendl;
10750 slot->waiting.push_back(std::move(qi));
10751 }
10752 }
10753
10754 #undef dout_prefix
10755 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10756
10757 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10758 {
10759 uint32_t shard_index = thread_index % osd->num_shards;
10760 auto& sdata = osd->shards[shard_index];
10761 ceph_assert(sdata);
10762
10763 // If all threads of shards do oncommits, there is a out-of-order
10764 // problem. So we choose the thread which has the smallest
10765 // thread_index(thread_index < num_shards) of shard to do oncommit
10766 // callback.
10767 bool is_smallest_thread_index = thread_index < osd->num_shards;
10768
10769 // peek at spg_t
10770 sdata->shard_lock.lock();
10771 if (sdata->scheduler->empty() &&
10772 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10773 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10774 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10775 // we raced with a context_queue addition, don't wait
10776 wait_lock.unlock();
10777 } else if (!sdata->stop_waiting) {
10778 dout(20) << __func__ << " empty q, waiting" << dendl;
10779 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10780 sdata->shard_lock.unlock();
10781 sdata->sdata_cond.wait(wait_lock);
10782 wait_lock.unlock();
10783 sdata->shard_lock.lock();
10784 if (sdata->scheduler->empty() &&
10785 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10786 sdata->shard_lock.unlock();
10787 return;
10788 }
10789 // found a work item; reapply default wq timeouts
10790 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10791 timeout_interval, suicide_interval);
10792 } else {
10793 dout(20) << __func__ << " need return immediately" << dendl;
10794 wait_lock.unlock();
10795 sdata->shard_lock.unlock();
10796 return;
10797 }
10798 }
10799
10800 list<Context *> oncommits;
10801 if (is_smallest_thread_index) {
10802 sdata->context_queue.move_to(oncommits);
10803 }
10804
10805 WorkItem work_item;
10806 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10807 if (sdata->scheduler->empty()) {
10808 if (osd->is_stopping()) {
10809 sdata->shard_lock.unlock();
10810 for (auto c : oncommits) {
10811 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10812 delete c;
10813 }
10814 return; // OSD shutdown, discard.
10815 }
10816 sdata->shard_lock.unlock();
10817 handle_oncommits(oncommits);
10818 return;
10819 }
10820
10821 work_item = sdata->scheduler->dequeue();
10822 if (osd->is_stopping()) {
10823 sdata->shard_lock.unlock();
10824 for (auto c : oncommits) {
10825 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10826 delete c;
10827 }
10828 return; // OSD shutdown, discard.
10829 }
10830
10831 // If the work item is scheduled in the future, wait until
10832 // the time returned in the dequeue response before retrying.
10833 if (auto when_ready = std::get_if<double>(&work_item)) {
10834 if (is_smallest_thread_index) {
10835 sdata->shard_lock.unlock();
10836 handle_oncommits(oncommits);
10837 sdata->shard_lock.lock();
10838 }
10839 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10840 auto future_time = ceph::real_clock::from_double(*when_ready);
10841 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
10842 // Disable heartbeat timeout until we find a non-future work item to process.
10843 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10844 sdata->shard_lock.unlock();
10845 ++sdata->waiting_threads;
10846 sdata->sdata_cond.wait_until(wait_lock, future_time);
10847 --sdata->waiting_threads;
10848 wait_lock.unlock();
10849 sdata->shard_lock.lock();
10850 // Reapply default wq timeouts
10851 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10852 timeout_interval, suicide_interval);
10853 // Populate the oncommits list if there were any additions
10854 // to the context_queue while we were waiting
10855 if (is_smallest_thread_index) {
10856 sdata->context_queue.move_to(oncommits);
10857 }
10858 }
10859 } // while
10860
10861 // Access the stored item
10862 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10863 if (osd->is_stopping()) {
10864 sdata->shard_lock.unlock();
10865 for (auto c : oncommits) {
10866 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10867 delete c;
10868 }
10869 return; // OSD shutdown, discard.
10870 }
10871
10872 const auto token = item.get_ordering_token();
10873 auto r = sdata->pg_slots.emplace(token, nullptr);
10874 if (r.second) {
10875 r.first->second = make_unique<OSDShardPGSlot>();
10876 }
10877 OSDShardPGSlot *slot = r.first->second.get();
10878 dout(20) << __func__ << " " << token
10879 << (r.second ? " (new)" : "")
10880 << " to_process " << slot->to_process
10881 << " waiting " << slot->waiting
10882 << " waiting_peering " << slot->waiting_peering
10883 << dendl;
10884 slot->to_process.push_back(std::move(item));
10885 dout(20) << __func__ << " " << slot->to_process.back()
10886 << " queued" << dendl;
10887
10888 retry_pg:
10889 PGRef pg = slot->pg;
10890
10891 // lock pg (if we have it)
10892 if (pg) {
10893 // note the requeue seq now...
10894 uint64_t requeue_seq = slot->requeue_seq;
10895 ++slot->num_running;
10896
10897 sdata->shard_lock.unlock();
10898 osd->service.maybe_inject_dispatch_delay();
10899 pg->lock();
10900 osd->service.maybe_inject_dispatch_delay();
10901 sdata->shard_lock.lock();
10902
10903 auto q = sdata->pg_slots.find(token);
10904 if (q == sdata->pg_slots.end()) {
10905 // this can happen if we race with pg removal.
10906 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10907 pg->unlock();
10908 sdata->shard_lock.unlock();
10909 handle_oncommits(oncommits);
10910 return;
10911 }
10912 slot = q->second.get();
10913 --slot->num_running;
10914
10915 if (slot->to_process.empty()) {
10916 // raced with _wake_pg_slot or consume_map
10917 dout(20) << __func__ << " " << token
10918 << " nothing queued" << dendl;
10919 pg->unlock();
10920 sdata->shard_lock.unlock();
10921 handle_oncommits(oncommits);
10922 return;
10923 }
10924 if (requeue_seq != slot->requeue_seq) {
10925 dout(20) << __func__ << " " << token
10926 << " requeue_seq " << slot->requeue_seq << " > our "
10927 << requeue_seq << ", we raced with _wake_pg_slot"
10928 << dendl;
10929 pg->unlock();
10930 sdata->shard_lock.unlock();
10931 handle_oncommits(oncommits);
10932 return;
10933 }
10934 if (slot->pg != pg) {
10935 // this can happen if we race with pg removal.
10936 dout(20) << __func__ << " slot " << token << " no longer attached to "
10937 << pg << dendl;
10938 pg->unlock();
10939 goto retry_pg;
10940 }
10941 }
10942
10943 dout(20) << __func__ << " " << token
10944 << " to_process " << slot->to_process
10945 << " waiting " << slot->waiting
10946 << " waiting_peering " << slot->waiting_peering << dendl;
10947
10948 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10949 suicide_interval);
10950
10951 // take next item
10952 auto qi = std::move(slot->to_process.front());
10953 slot->to_process.pop_front();
10954 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10955 set<pair<spg_t,epoch_t>> new_children;
10956 OSDMapRef osdmap;
10957
10958 while (!pg) {
10959 // should this pg shard exist on this osd in this (or a later) epoch?
10960 osdmap = sdata->shard_osdmap;
10961 const PGCreateInfo *create_info = qi.creates_pg();
10962 if (!slot->waiting_for_split.empty()) {
10963 dout(20) << __func__ << " " << token
10964 << " splitting " << slot->waiting_for_split << dendl;
10965 _add_slot_waiter(token, slot, std::move(qi));
10966 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10967 dout(20) << __func__ << " " << token
10968 << " map " << qi.get_map_epoch() << " > "
10969 << osdmap->get_epoch() << dendl;
10970 _add_slot_waiter(token, slot, std::move(qi));
10971 } else if (qi.is_peering()) {
10972 if (!qi.peering_requires_pg()) {
10973 // for pg-less events, we run them under the ordering lock, since
10974 // we don't have the pg lock to keep them ordered.
10975 qi.run(osd, sdata, pg, tp_handle);
10976 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10977 if (create_info) {
10978 if (create_info->by_mon &&
10979 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10980 dout(20) << __func__ << " " << token
10981 << " no pg, no longer primary, ignoring mon create on "
10982 << qi << dendl;
10983 } else {
10984 dout(20) << __func__ << " " << token
10985 << " no pg, should create on " << qi << dendl;
10986 pg = osd->handle_pg_create_info(osdmap, create_info);
10987 if (pg) {
10988 // we created the pg! drop out and continue "normally"!
10989 sdata->_attach_pg(slot, pg.get());
10990 sdata->_wake_pg_slot(token, slot);
10991
10992 // identify split children between create epoch and shard epoch.
10993 osd->service.identify_splits_and_merges(
10994 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10995 sdata->_prime_splits(&new_children);
10996 // distribute remaining split children to other shards below!
10997 break;
10998 }
10999 dout(20) << __func__ << " ignored create on " << qi << dendl;
11000 }
11001 } else {
11002 dout(20) << __func__ << " " << token
11003 << " no pg, peering, !create, discarding " << qi << dendl;
11004 }
11005 } else {
11006 dout(20) << __func__ << " " << token
11007 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11008 << ", discarding " << qi
11009 << dendl;
11010 }
11011 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11012 dout(20) << __func__ << " " << token
11013 << " no pg, should exist e" << osdmap->get_epoch()
11014 << ", will wait on " << qi << dendl;
11015 _add_slot_waiter(token, slot, std::move(qi));
11016 } else {
11017 dout(20) << __func__ << " " << token
11018 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11019 << ", dropping " << qi << dendl;
11020 // share map with client?
11021 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11022 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11023 sdata->shard_osdmap,
11024 (*_op)->sent_epoch);
11025 }
11026 unsigned pushes_to_free = qi.get_reserved_pushes();
11027 if (pushes_to_free > 0) {
11028 sdata->shard_lock.unlock();
11029 osd->service.release_reserved_pushes(pushes_to_free);
11030 handle_oncommits(oncommits);
11031 return;
11032 }
11033 }
11034 sdata->shard_lock.unlock();
11035 handle_oncommits(oncommits);
11036 return;
11037 }
11038 if (qi.is_peering()) {
11039 OSDMapRef osdmap = sdata->shard_osdmap;
11040 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11041 _add_slot_waiter(token, slot, std::move(qi));
11042 sdata->shard_lock.unlock();
11043 pg->unlock();
11044 handle_oncommits(oncommits);
11045 return;
11046 }
11047 }
11048 sdata->shard_lock.unlock();
11049
11050 if (!new_children.empty()) {
11051 for (auto shard : osd->shards) {
11052 shard->prime_splits(osdmap, &new_children);
11053 }
11054 ceph_assert(new_children.empty());
11055 }
11056
11057 // osd_opwq_process marks the point at which an operation has been dequeued
11058 // and will begin to be handled by a worker thread.
11059 {
11060 #ifdef WITH_LTTNG
11061 osd_reqid_t reqid;
11062 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11063 reqid = (*_op)->get_reqid();
11064 }
11065 #endif
11066 tracepoint(osd, opwq_process_start, reqid.name._type,
11067 reqid.name._num, reqid.tid, reqid.inc);
11068 }
11069
11070 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11071 Formatter *f = Formatter::create("json");
11072 f->open_object_section("q");
11073 dump(f);
11074 f->close_section();
11075 f->flush(*_dout);
11076 delete f;
11077 *_dout << dendl;
11078
11079 qi.run(osd, sdata, pg, tp_handle);
11080
11081 {
11082 #ifdef WITH_LTTNG
11083 osd_reqid_t reqid;
11084 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11085 reqid = (*_op)->get_reqid();
11086 }
11087 #endif
11088 tracepoint(osd, opwq_process_finish, reqid.name._type,
11089 reqid.name._num, reqid.tid, reqid.inc);
11090 }
11091
11092 handle_oncommits(oncommits);
11093 }
11094
11095 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11096 if (unlikely(m_fast_shutdown) ) {
11097 // stop enqueing when we are in the middle of a fast shutdown
11098 return;
11099 }
11100
11101 uint32_t shard_index =
11102 item.get_ordering_token().hash_to_shard(osd->shards.size());
11103
11104 OSDShard* sdata = osd->shards[shard_index];
11105 assert (NULL != sdata);
11106
11107 dout(20) << __func__ << " " << item << dendl;
11108
11109 bool empty = true;
11110 {
11111 std::lock_guard l{sdata->shard_lock};
11112 empty = sdata->scheduler->empty();
11113 sdata->scheduler->enqueue(std::move(item));
11114 }
11115
11116 {
11117 std::lock_guard l{sdata->sdata_wait_lock};
11118 if (empty) {
11119 sdata->sdata_cond.notify_all();
11120 } else if (sdata->waiting_threads) {
11121 sdata->sdata_cond.notify_one();
11122 }
11123 }
11124 }
11125
// Re-insert a requeued item at the *front* of its shard's queue while
// preserving per-PG ordering relative to items that _process may have
// already moved onto the slot's to_process list.  Items are silently
// dropped during a fast shutdown.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  if (unlikely(m_fast_shutdown) ) {
    // stop enqueing when we are in the middle of a fast shutdown
    return;
  }

  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // Swap trick: push the old item onto the front of to_process so it
    // runs first, then steal the newest item from the back and put THAT
    // into the scheduler instead.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake a worker; shard_lock is intentionally released before taking
  // sdata_wait_lock to keep the established lock ordering
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11158
11159 void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11160 {
11161 uint32_t shard_index = 0;
11162 m_fast_shutdown = true;
11163
11164 for (; shard_index < osd->num_shards; shard_index++) {
11165 auto& sdata = osd->shards[shard_index];
11166 ceph_assert(sdata);
11167 sdata->shard_lock.lock();
11168 int work_count = 0;
11169 while(! sdata->scheduler->empty() ) {
11170 auto work_item = sdata->scheduler->dequeue();
11171 work_count++;
11172 }
11173 sdata->shard_lock.unlock();
11174 }
11175 }
11176
11177 namespace ceph::osd_cmds {
11178
11179 int heap(CephContext& cct,
11180 const cmdmap_t& cmdmap,
11181 std::ostream& outos,
11182 std::ostream& erros)
11183 {
11184 if (!ceph_using_tcmalloc()) {
11185 erros << "could not issue heap profiler command -- not using tcmalloc!";
11186 return -EOPNOTSUPP;
11187 }
11188
11189 string cmd;
11190 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11191 erros << "unable to get value for command \"" << cmd << "\"";
11192 return -EINVAL;
11193 }
11194
11195 std::vector<std::string> cmd_vec;
11196 get_str_vec(cmd, cmd_vec);
11197
11198 string val;
11199 if (cmd_getval(cmdmap, "value", val)) {
11200 cmd_vec.push_back(val);
11201 }
11202
11203 ceph_heap_profiler_handle_command(cmd_vec, outos);
11204
11205 return 0;
11206 }
11207
11208 } // namespace ceph::osd_cmds