1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
39 #include "osd/scrub_machine.h"
40 #include "osd/pg_scrubber.h"
42 #include "include/types.h"
43 #include "include/compat.h"
44 #include "include/random.h"
49 #include "osdc/Objecter.h"
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
61 #include "os/ObjectStore.h"
63 #include "os/FuseStore.h"
66 #include "PrimaryLogPG.h"
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
71 #include "mon/MonClient.h"
73 #include "messages/MLog.h"
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery.h"
96 #include "messages/MOSDPGQuery2.h"
97 #include "messages/MOSDPGLog.h"
98 #include "messages/MOSDPGRemove.h"
99 #include "messages/MOSDPGInfo.h"
100 #include "messages/MOSDPGInfo2.h"
101 #include "messages/MOSDPGCreate.h"
102 #include "messages/MOSDPGCreate2.h"
103 #include "messages/MBackfillReserve.h"
104 #include "messages/MRecoveryReserve.h"
105 #include "messages/MOSDForceRecovery.h"
106 #include "messages/MOSDECSubOpWrite.h"
107 #include "messages/MOSDECSubOpWriteReply.h"
108 #include "messages/MOSDECSubOpRead.h"
109 #include "messages/MOSDECSubOpReadReply.h"
110 #include "messages/MOSDPGCreated.h"
111 #include "messages/MOSDPGUpdateLogMissing.h"
112 #include "messages/MOSDPGUpdateLogMissingReply.h"
114 #include "messages/MOSDPeeringOp.h"
116 #include "messages/MOSDAlive.h"
118 #include "messages/MOSDScrub.h"
119 #include "messages/MOSDScrub2.h"
120 #include "messages/MOSDRepScrub.h"
122 #include "messages/MCommand.h"
123 #include "messages/MCommandReply.h"
125 #include "messages/MPGStats.h"
127 #include "messages/MWatchNotify.h"
128 #include "messages/MOSDPGPush.h"
129 #include "messages/MOSDPGPushReply.h"
130 #include "messages/MOSDPGPull.h"
132 #include "messages/MMonGetPurgedSnaps.h"
133 #include "messages/MMonGetPurgedSnapsReply.h"
135 #include "common/perf_counters.h"
136 #include "common/Timer.h"
137 #include "common/LogClient.h"
138 #include "common/AsyncReserver.h"
139 #include "common/HeartbeatMap.h"
140 #include "common/admin_socket.h"
141 #include "common/ceph_context.h"
143 #include "global/signal_handler.h"
144 #include "global/pidfile.h"
146 #include "include/color.h"
147 #include "perfglue/cpu_profiler.h"
148 #include "perfglue/heap_profiler.h"
150 #include "osd/ClassHandler.h"
151 #include "osd/OpRequest.h"
153 #include "auth/AuthAuthorizeHandler.h"
154 #include "auth/RotatingKeyRing.h"
156 #include "objclass/objclass.h"
158 #include "common/cmdparse.h"
159 #include "include/str_list.h"
160 #include "include/util.h"
162 #include "include/ceph_assert.h"
163 #include "common/config.h"
164 #include "common/EventTrace.h"
166 #include "json_spirit/json_spirit_reader.h"
167 #include "json_spirit/json_spirit_writer.h"
170 #define TRACEPOINT_DEFINE
171 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #include "tracing/osd.h"
173 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
174 #undef TRACEPOINT_DEFINE
176 #define tracepoint(...)
179 #include "common/tracer.h"
182 #define dout_context cct
183 #define dout_subsys ceph_subsys_osd
185 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
189 using std::lock_guard
;
190 using std::make_pair
;
191 using std::make_tuple
;
192 using std::make_unique
;
195 using std::ostringstream
;
199 using std::stringstream
;
200 using std::to_string
;
201 using std::unique_ptr
;
204 using ceph::bufferlist
;
205 using ceph::bufferptr
;
208 using ceph::fixed_u_to_string
;
209 using ceph::Formatter
;
210 using ceph::heartbeat_handle_d
;
211 using ceph::make_mutex
;
213 using namespace ceph::osd::scheduler
;
214 using TOPNSPC::common::cmd_getval
;
// Build the prefix used by the dout_prefix macro above: every debug
// line from this file is tagged "osd.<whoami> <epoch> ".
216 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
217 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
220 //Initial features in new superblock.
221 //Features here are also automatically upgraded
// Returns the CompatSet written into a freshly created OSD superblock.
// Only the "incompat" feature set is populated here; compat/ro_compat
// start empty. Compare with get_osd_compat_set() below, which adds
// features that may be enabled at runtime but not at mkfs time.
222 CompatSet
OSD::get_osd_initial_compat_set() {
223 CompatSet::FeatureSet ceph_osd_feature_compat
;
224 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
225 CompatSet::FeatureSet ceph_osd_feature_incompat
;
226 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
227 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
228 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
229 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
230 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
231 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
232 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
233 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
234 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
235 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
236 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
237 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
238 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
239 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
240 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
241 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
242 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
243 ceph_osd_feature_incompat
);
246 //Features are added here that this OSD supports.
// Returns the full set of features this OSD build supports: the
// initial superblock set plus features that can be turned on in code
// after mkfs (currently SHARDS).
247 CompatSet
OSD::get_osd_compat_set() {
248 CompatSet compat
= get_osd_initial_compat_set();
249 //Any features here can be set in code, but not in initial superblock
250 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
254 OSDService::OSDService(OSD
*osd
, ceph::async::io_context_pool
& poolctx
) :
257 whoami(osd
->whoami
), store(osd
->store
),
258 log_client(osd
->log_client
), clog(osd
->clog
),
259 pg_recovery_stats(osd
->pg_recovery_stats
),
260 cluster_messenger(osd
->cluster_messenger
),
261 client_messenger(osd
->client_messenger
),
263 recoverystate_perf(osd
->recoverystate_perf
),
265 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
266 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
267 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
268 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
272 agent_valid_iterator(false),
274 flush_mode_high_count(0),
277 agent_stop_flag(false),
278 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
279 last_recalibrate(ceph_clock_now()),
280 promote_max_objects(0),
281 promote_max_bytes(0),
283 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
284 osd
->objecter_messenger
,
285 osd
->monc
, poolctx
)),
286 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
287 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
289 recovery_request_timer(cct
, recovery_request_lock
, false),
290 sleep_timer(cct
, sleep_lock
, false),
291 reserver_finisher(cct
),
292 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
293 cct
->_conf
->osd_min_recovery_priority
),
294 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
295 cct
->_conf
->osd_min_recovery_priority
),
296 snap_reserver(cct
, &reserver_finisher
,
297 cct
->_conf
->osd_max_trimming_pgs
),
298 recovery_ops_active(0),
299 recovery_ops_reserved(0),
300 recovery_paused(false),
301 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
302 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
303 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
305 cur_ratio(0), physical_ratio(0),
306 boot_epoch(0), up_epoch(0), bind_epoch(0)
310 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
312 str
<< "objecter-finisher-" << i
;
313 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
314 objecter_finishers
.push_back(std::move(fin
));
// Debug-tracking helper: bump the per-pgid reference count in
// pgid_tracker under pgid_lock. Paired with remove_pgid() below.
319 void OSDService::add_pgid(spg_t pgid
, PG
*pg
) {
320 std::lock_guard
l(pgid_lock
);
321 if (!pgid_tracker
.count(pgid
)) {
324 pgid_tracker
[pgid
]++;
// Drop one reference for pgid from the debug tracker; when the count
// reaches zero, forget the pgid entirely (both pgid_tracker and
// live_pgs entries are erased). Asserts the pgid is actually tracked.
326 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
328 std::lock_guard
l(pgid_lock
);
329 ceph_assert(pgid_tracker
.count(pgid
));
330 ceph_assert(pgid_tracker
[pgid
] > 0);
331 pgid_tracker
[pgid
]--;
332 if (pgid_tracker
[pgid
] == 0) {
333 pgid_tracker
.erase(pgid
);
334 live_pgs
.erase(pgid
);
// Debugging aid: print every tracked pgid (with its refcount entry) to
// the error log, and ask each corresponding live PG to dump its own
// live reference ids. Takes pgid_lock for the duration.
337 void OSDService::dump_live_pgids()
339 std::lock_guard
l(pgid_lock
);
340 derr
<< "live pgids:" << dendl
;
341 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
342 i
!= pgid_tracker
.cend();
344 derr
<< "\t" << *i
<< dendl
;
345 live_pgs
[i
->first
]->dump_live_ids();
// Monotonic "now" expressed as time elapsed since this OSD process
// started (mono clock minus the recorded startup_time).
351 ceph::signedspan
OSDService::get_mnow()
353 return ceph::mono_clock::now() - osd
->startup_time
;
356 void OSDService::identify_splits_and_merges(
360 set
<pair
<spg_t
,epoch_t
>> *split_children
,
361 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
363 if (!old_map
->have_pg_pool(pgid
.pool())) {
366 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
367 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
368 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
371 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
372 << " to e" << new_map
->get_epoch()
373 << " pg_nums " << p
->second
<< dendl
;
375 queue
.push_back(pgid
);
377 while (!queue
.empty()) {
378 auto cur
= queue
.front();
381 unsigned pgnum
= old_pgnum
;
382 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
383 q
!= p
->second
.end() &&
384 q
->first
<= new_map
->get_epoch();
386 if (pgnum
< q
->second
) {
388 if (cur
.ps() < pgnum
) {
390 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
391 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
392 << " pg_num " << pgnum
<< " -> " << q
->second
393 << " children " << children
<< dendl
;
394 for (auto i
: children
) {
395 split_children
->insert(make_pair(i
, q
->first
));
400 } else if (cur
.ps() < q
->second
) {
401 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
402 << " pg_num " << pgnum
<< " -> " << q
->second
403 << " is a child" << dendl
;
404 // normally we'd capture this from the parent, but it's
405 // possible the parent doesn't exist yet (it will be
406 // fabricated to allow an intervening merge). note this PG
407 // as a split child here to be sure we catch it.
408 split_children
->insert(make_pair(cur
, q
->first
));
410 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
411 << " pg_num " << pgnum
<< " -> " << q
->second
412 << " is post-split, skipping" << dendl
;
414 } else if (merge_pgs
) {
416 if (cur
.ps() >= q
->second
) {
417 if (cur
.ps() < pgnum
) {
419 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
421 parent
.is_split(q
->second
, pgnum
, &children
);
422 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
423 << " pg_num " << pgnum
<< " -> " << q
->second
424 << " is merge source, target " << parent
425 << ", source(s) " << children
<< dendl
;
426 merge_pgs
->insert(make_pair(parent
, q
->first
));
427 if (!did
.count(parent
)) {
428 // queue (and re-scan) parent in case it might not exist yet
429 // and there are some future splits pending on it
430 queue
.push_back(parent
);
432 for (auto c
: children
) {
433 merge_pgs
->insert(make_pair(c
, q
->first
));
439 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
440 << " pg_num " << pgnum
<< " -> " << q
->second
441 << " is beyond old pgnum, skipping" << dendl
;
445 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
446 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
447 << " pg_num " << pgnum
<< " -> " << q
->second
448 << " is merge target, source " << children
<< dendl
;
449 for (auto c
: children
) {
450 merge_pgs
->insert(make_pair(c
, q
->first
));
454 merge_pgs
->insert(make_pair(cur
, q
->first
));
// Thin forwarder: delegate heartbeat-peer recalculation to the OSD.
463 void OSDService::need_heartbeat_peer_update()
465 osd
->need_heartbeat_peer_update();
// Return the HeartbeatStamps object for the given peer osd id,
// creating (and growing the hb_stamps vector) on first use.
// Guarded by hb_stamp_lock.
468 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
470 std::lock_guard
l(hb_stamp_lock
);
471 if (peer
>= hb_stamps
.size()) {
472 hb_stamps
.resize(peer
+ 1);
474 if (!hb_stamps
[peer
]) {
475 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
477 return hb_stamps
[peer
];
// Enqueue a peering event for spgid to renew its lease at the given
// epoch. NOTE(review): the PGPeeringEvent constructor arguments appear
// truncated in this extraction — confirm against the full source.
480 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
482 osd
->enqueue_peering_evt(
485 std::make_shared
<PGPeeringEvent
>(
// First phase of shutdown: stop the timers (agent, sleep, recovery
// request), each under its own lock, so no new delayed work is queued
// while the rest of shutdown proceeds.
490 void OSDService::start_shutdown()
493 std::lock_guard
l(agent_timer_lock
);
494 agent_timer
.shutdown();
498 std::lock_guard
l(sleep_lock
);
499 sleep_timer
.shutdown();
503 std::lock_guard
l(recovery_request_lock
);
504 recovery_request_timer
.shutdown();
// Drain and stop the finisher used by the AsyncReservers; called
// during shutdown after reservations can no longer be granted.
508 void OSDService::shutdown_reserver()
510 reserver_finisher
.wait_for_empty();
511 reserver_finisher
.stop();
// Main OSDService teardown: suspend the mono timer, shut down the
// watch timer (under watch_lock), shut down the objecter and its
// finishers, and drop the published/next osdmap references.
514 void OSDService::shutdown()
516 mono_timer
.suspend();
519 std::lock_guard
l(watch_lock
);
520 watch_timer
.shutdown();
523 objecter
->shutdown();
524 for (auto& f
: objecter_finishers
) {
529 publish_map(OSDMapRef());
530 next_osdmap
= OSDMapRef();
// Bring up OSDService background machinery: start the reserver
// finisher and objecter finishers, initialize the objecter, spawn the
// tiering-agent thread, and honor osd_recovery_delay_start by
// deferring recovery if configured.
533 void OSDService::init()
535 reserver_finisher
.start();
536 for (auto& f
: objecter_finishers
) {
539 objecter
->set_client_incarnation(0);
541 // deprioritize objecter in daemonperf output
542 objecter
->get_logger()->set_prio_adjust(-3);
548 agent_thread
.create("osd_srv_agent");
550 if (cct
->_conf
->osd_recovery_delay_start
)
551 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
// Late init step: start the objecter with the current osdmap.
554 void OSDService::final_init()
556 objecter
->start(osdmap
.get());
// Called when a new osdmap takes effect: under agent_lock, wake the
// tiering agent unless the map has NOTIERAGENT set (the full wake
// condition is partially lost in this extraction).
559 void OSDService::activate_map()
561 // wake/unwake the tiering agent
562 std::lock_guard l
{agent_lock
};
564 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
566 agent_cond
.notify_all();
// Ask the OSD to subscribe for osdmaps starting at epoch e
// (non-onetime subscription: second argument is false).
569 void OSDService::request_osdmap_update(epoch_t e
)
571 osd
->osdmap_subscribe(e
, false);
// Timer callback used by agent_entry(): when it fires, ask the PG to
// re-run agent_choose_mode so the tiering agent can retry work that
// was previously delayed.
575 class AgentTimeoutCB
: public Context
{
578 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
579 void finish(int) override
{
580 pg
->agent_choose_mode_restart();
584 void OSDService::agent_entry()
586 dout(10) << __func__
<< " start" << dendl
;
587 std::unique_lock agent_locker
{agent_lock
};
589 while (!agent_stop_flag
) {
590 if (agent_queue
.empty()) {
591 dout(20) << __func__
<< " empty queue" << dendl
;
592 agent_cond
.wait(agent_locker
);
595 uint64_t level
= agent_queue
.rbegin()->first
;
596 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
598 << " tiers " << agent_queue
.size()
599 << ", top is " << level
600 << " with pgs " << top
.size()
601 << ", ops " << agent_ops
<< "/"
602 << cct
->_conf
->osd_agent_max_ops
603 << (agent_active
? " active" : " NOT ACTIVE")
605 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
606 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
607 int agent_flush_quota
= max
;
608 if (!flush_mode_high_count
)
609 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
610 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
611 agent_cond
.wait(agent_locker
);
615 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
616 agent_queue_pos
= top
.begin();
617 agent_valid_iterator
= true;
619 PGRef pg
= *agent_queue_pos
;
620 dout(10) << "high_count " << flush_mode_high_count
621 << " agent_ops " << agent_ops
622 << " flush_quota " << agent_flush_quota
<< dendl
;
623 agent_locker
.unlock();
624 if (!pg
->agent_work(max
, agent_flush_quota
)) {
625 dout(10) << __func__
<< " " << pg
->pg_id
626 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
627 << " seconds" << dendl
;
629 logger
->inc(l_osd_tier_delay
);
630 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
631 std::lock_guard timer_locker
{agent_timer_lock
};
632 Context
*cb
= new AgentTimeoutCB(pg
);
633 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
637 dout(10) << __func__
<< " finish" << dendl
;
// Stop the tiering agent thread: verify no ops remain in flight and
// the agent queue is empty (aborts loudly otherwise), then set the
// stop flag and wake the agent so it can exit its loop.
640 void OSDService::agent_stop()
643 std::lock_guard
l(agent_lock
);
645 // By this time all ops should be cancelled
646 ceph_assert(agent_ops
== 0);
647 // By this time all PGs are shutdown and dequeued
648 if (!agent_queue
.empty()) {
649 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
650 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
651 ceph_abort_msg("agent queue not empty");
654 agent_stop_flag
= true;
655 agent_cond
.notify_all();
660 // -------------------------------------
662 void OSDService::promote_throttle_recalibrate()
664 utime_t now
= ceph_clock_now();
665 double dur
= now
- last_recalibrate
;
666 last_recalibrate
= now
;
667 unsigned prob
= promote_probability_millis
;
669 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
670 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
672 unsigned min_prob
= 1;
674 uint64_t attempts
, obj
, bytes
;
675 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
676 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
677 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
678 << target_obj_sec
<< " obj/sec or "
679 << byte_u_t(target_bytes_sec
) << "/sec"
682 // calculate what the probability *should* be, given the targets
684 if (attempts
&& dur
> 0) {
685 uint64_t avg_size
= 1;
687 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
688 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
689 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
691 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
692 << avg_size
<< dendl
;
693 if (target_obj_sec
&& target_bytes_sec
)
694 new_prob
= std::min(po
, pb
);
695 else if (target_obj_sec
)
697 else if (target_bytes_sec
)
704 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
706 // correct for persistent skew between target rate and actual rate, adjust
709 if (attempts
&& obj
) {
710 actual
= obj
* 1000 / attempts
;
711 ratio
= (double)actual
/ (double)prob
;
712 new_prob
= (double)new_prob
/ ratio
;
714 new_prob
= std::max(new_prob
, min_prob
);
715 new_prob
= std::min(new_prob
, 1000u);
718 prob
= (prob
+ new_prob
) / 2;
719 prob
= std::max(prob
, min_prob
);
720 prob
= std::min(prob
, 1000u);
721 dout(10) << __func__
<< " actual " << actual
722 << ", actual/prob ratio " << ratio
723 << ", adjusted new_prob " << new_prob
724 << ", prob " << promote_probability_millis
<< " -> " << prob
726 promote_probability_millis
= prob
;
728 // set hard limits for this interval to mitigate stampedes
729 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
730 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
733 // -------------------------------------
// Read osd_failsafe_full_ratio from config; values given as a
// percentage (> 1.0) are normalized to a 0..1 fraction.
735 float OSDService::get_failsafe_full_ratio()
737 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
738 if (full_ratio
> 1.0) full_ratio
/= 100.0;
742 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
744 // The OSDMap ratios take precendence. So if the failsafe is .95 and
745 // the admin sets the cluster full to .96, the failsafe moves up to .96
746 // too. (Not that having failsafe == full is ideal, but it's better than
747 // dropping writes before the clusters appears full.)
748 OSDMapRef osdmap
= get_osdmap();
749 if (!osdmap
|| osdmap
->get_epoch() == 0) {
752 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
753 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
754 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
755 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
757 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
758 // use the failsafe for nearfull and full; the mon isn't using the
759 // flags anyway because we're mid-upgrade.
760 full_ratio
= failsafe_ratio
;
761 backfillfull_ratio
= failsafe_ratio
;
762 nearfull_ratio
= failsafe_ratio
;
763 } else if (full_ratio
<= 0 ||
764 backfillfull_ratio
<= 0 ||
765 nearfull_ratio
<= 0) {
766 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
767 // use failsafe flag. ick. the monitor did something wrong or the user
768 // did something stupid.
769 full_ratio
= failsafe_ratio
;
770 backfillfull_ratio
= failsafe_ratio
;
771 nearfull_ratio
= failsafe_ratio
;
774 if (injectfull_state
> NONE
&& injectfull
) {
775 inject
= "(Injected)";
776 return injectfull_state
;
777 } else if (pratio
> failsafe_ratio
) {
779 } else if (ratio
> full_ratio
) {
781 } else if (ratio
> backfillfull_ratio
) {
783 } else if (pratio
> nearfull_ratio
) {
789 void OSDService::check_full_status(float ratio
, float pratio
)
791 std::lock_guard
l(full_status_lock
);
794 physical_ratio
= pratio
;
798 new_state
= recalc_full_state(ratio
, pratio
, inject
);
800 dout(20) << __func__
<< " cur ratio " << ratio
801 << ", physical ratio " << pratio
802 << ", new state " << get_full_state_name(new_state
)
807 if (cur_state
!= new_state
) {
808 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
809 << " -> " << get_full_state_name(new_state
) << dendl
;
810 if (new_state
== FAILSAFE
) {
811 clog
->error() << "full status failsafe engaged, dropping updates, now "
812 << (int)roundf(ratio
* 100) << "% full";
813 } else if (cur_state
== FAILSAFE
) {
814 clog
->error() << "full status failsafe disengaged, no longer dropping "
815 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
817 cur_state
= new_state
;
821 bool OSDService::need_fullness_update()
823 OSDMapRef osdmap
= get_osdmap();
825 if (osdmap
->exists(whoami
)) {
826 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
828 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
830 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
837 else if (is_backfillfull())
839 else if (is_nearfull())
// Test hook: when fullness injection is active and the injected state
// is at least `type`, report (injected) fullness. `injectfull` acts as
// a countdown of injections, or -1 for "always full".
844 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
846 if (injectfull
&& injectfull_state
>= type
) {
847 // injectfull is either a count of the number of times to return failsafe full
848 // or if -1 then always return full
851 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
852 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
// Core fullness predicate: under full_status_lock, honor any injected
// fullness first, log current usage when at/above `type`, and return
// whether the cached cur_state has reached `type`.
859 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
861 std::lock_guard
l(full_status_lock
);
863 if (_check_inject_full(dpp
, type
))
866 if (cur_state
>= type
)
867 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
868 << " physical " << physical_ratio
<< dendl
;
870 return cur_state
>= type
;
873 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
875 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
877 std::lock_guard
l(full_status_lock
);
878 if (_check_inject_full(dpp
, type
)) {
884 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
887 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
889 if (tentative_state
>= type
)
890 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
892 return tentative_state
>= type
;
// Convenience wrapper: is the OSD at/above the FAILSAFE threshold?
895 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
897 return _check_full(dpp
, FAILSAFE
);
// Convenience wrapper: is the OSD at/above the FULL threshold?
900 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
902 return _check_full(dpp
, FULL
);
// Would this OSD be BACKFILLFULL if `adjust_used` additional bytes
// were consumed? Used to decide whether to accept a backfill.
905 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
907 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
// Convenience wrapper: is the OSD at/above the BACKFILLFULL threshold?
910 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
912 return _check_full(dpp
, BACKFILLFULL
);
// Convenience wrapper: is the OSD at/above the NEARFULL threshold?
915 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
917 return _check_full(dpp
, NEARFULL
);
// Exact-state query: cached state is exactly FAILSAFE (note: ==, not
// >= like the other is_* helpers below).
920 bool OSDService::is_failsafe_full() const
922 std::lock_guard
l(full_status_lock
);
923 return cur_state
== FAILSAFE
;
// Cached-state query: at/above FULL (no logging, unlike check_full).
926 bool OSDService::is_full() const
928 std::lock_guard
l(full_status_lock
);
929 return cur_state
>= FULL
;
// Cached-state query: at/above BACKFILLFULL.
932 bool OSDService::is_backfillfull() const
934 std::lock_guard
l(full_status_lock
);
935 return cur_state
>= BACKFILLFULL
;
// Cached-state query: at/above NEARFULL.
938 bool OSDService::is_nearfull() const
940 std::lock_guard
l(full_status_lock
);
941 return cur_state
>= NEARFULL
;
// Test hook setter: record the fullness state (and count — the count
// assignment is lost in this extraction) to inject via
// _check_inject_full(). Guarded by full_status_lock.
944 void OSDService::set_injectfull(s_names type
, int64_t count
)
946 std::lock_guard
l(full_status_lock
);
947 injectfull_state
= type
;
951 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
952 osd_alert_list_t
& alerts
)
954 uint64_t bytes
= stbuf
.total
;
955 uint64_t avail
= stbuf
.available
;
956 uint64_t used
= stbuf
.get_used_raw();
958 // For testing fake statfs values so it doesn't matter if all
959 // OSDs are using the same partition.
960 if (cct
->_conf
->fake_statfs_for_testing
) {
961 uint64_t total_num_bytes
= 0;
965 total_num_bytes
+= p
->get_stats_num_bytes();
967 bytes
= cct
->_conf
->fake_statfs_for_testing
;
968 if (total_num_bytes
< bytes
)
969 avail
= bytes
- total_num_bytes
;
972 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
973 << " adjust available " << avail
975 used
= bytes
- avail
;
978 logger
->set(l_osd_stat_bytes
, bytes
);
979 logger
->set(l_osd_stat_bytes_used
, used
);
980 logger
->set(l_osd_stat_bytes_avail
, avail
);
982 std::lock_guard
l(stat_lock
);
983 osd_stat
.statfs
= stbuf
;
984 osd_stat
.os_alerts
.clear();
985 osd_stat
.os_alerts
[whoami
].swap(alerts
);
986 if (cct
->_conf
->fake_statfs_for_testing
) {
987 osd_stat
.statfs
.total
= bytes
;
988 osd_stat
.statfs
.available
= avail
;
989 // For testing don't want used to go negative, so clear reserved
990 osd_stat
.statfs
.internally_reserved
= 0;
994 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
997 utime_t now
= ceph_clock_now();
998 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
999 std::lock_guard
l(stat_lock
);
1000 osd_stat
.hb_peers
.swap(hb_peers
);
1001 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
1002 osd_stat
.num_pgs
= num_pgs
;
1003 // Clean entries that aren't updated
1004 // This is called often enough that we can just remove 1 at a time
1005 for (auto i
: osd_stat
.hb_pingtime
) {
1006 if (i
.second
.last_update
== 0)
1008 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
1009 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
1010 << " last_update " << i
.second
.last_update
<< dendl
;
1011 osd_stat
.hb_pingtime
.erase(i
.first
);
// Bump the repaired-shards counter in osd_stat, under stat_lock.
1018 void OSDService::inc_osd_stat_repaired()
1020 std::lock_guard
l(stat_lock
);
1021 osd_stat
.num_shards_repaired
++;
1025 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
1026 uint64_t adjust_used
)
1029 ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1032 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1033 if (new_stat
.statfs
.available
> adjust_used
)
1034 new_stat
.statfs
.available
-= adjust_used
;
1036 new_stat
.statfs
.available
= 0;
1037 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1040 // Check all pgs and adjust kb_used to include all pending backfill data
1041 int backfill_adjusted
= 0;
1043 osd
->_get_pgs(&pgs
);
1044 for (auto p
: pgs
) {
1045 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1047 if (backfill_adjusted
) {
1048 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1050 return ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1053 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1055 OSDMapRef next_map
= get_nextmap_reserved();
1056 // service map is always newer/newest
1057 ceph_assert(from_epoch
<= next_map
->get_epoch());
1059 if (next_map
->is_down(peer
) ||
1060 next_map
->get_info(peer
).up_from
> from_epoch
) {
1062 release_map(next_map
);
1065 ConnectionRef peer_con
;
1066 if (peer
== whoami
) {
1067 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1069 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1070 next_map
->get_cluster_addrs(peer
), false, true);
1072 maybe_share_map(peer_con
.get(), next_map
);
1073 peer_con
->send_message(m
);
1074 release_map(next_map
);
1077 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1079 OSDMapRef next_map
= get_nextmap_reserved();
1080 // service map is always newer/newest
1081 ceph_assert(from_epoch
<= next_map
->get_epoch());
1083 for (auto& iter
: messages
) {
1084 if (next_map
->is_down(iter
.first
) ||
1085 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1089 ConnectionRef peer_con
;
1090 if (iter
.first
== whoami
) {
1091 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1093 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1094 next_map
->get_cluster_addrs(iter
.first
), false, true);
1096 maybe_share_map(peer_con
.get(), next_map
);
1097 peer_con
->send_message(iter
.second
);
1099 release_map(next_map
);
1101 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1103 OSDMapRef next_map
= get_nextmap_reserved();
1104 // service map is always newer/newest
1105 ceph_assert(from_epoch
<= next_map
->get_epoch());
1107 if (next_map
->is_down(peer
) ||
1108 next_map
->get_info(peer
).up_from
> from_epoch
) {
1109 release_map(next_map
);
1113 if (peer
== whoami
) {
1114 con
= osd
->cluster_messenger
->get_loopback_connection();
1116 con
= osd
->cluster_messenger
->connect_to_osd(
1117 next_map
->get_cluster_addrs(peer
), false, true);
1119 release_map(next_map
);
1123 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1125 OSDMapRef next_map
= get_nextmap_reserved();
1126 // service map is always newer/newest
1127 ceph_assert(from_epoch
<= next_map
->get_epoch());
1129 pair
<ConnectionRef
,ConnectionRef
> ret
;
1130 if (next_map
->is_down(peer
) ||
1131 next_map
->get_info(peer
).up_from
> from_epoch
) {
1132 release_map(next_map
);
1135 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1136 next_map
->get_hb_back_addrs(peer
));
1137 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1138 next_map
->get_hb_front_addrs(peer
));
1139 release_map(next_map
);
// Entity name of this OSD on the cluster (backend) messenger.
1143 entity_name_t
OSDService::get_cluster_msgr_name() const
1145 return cluster_messenger
->get_myname();
1148 void OSDService::queue_want_pg_temp(pg_t pgid
,
1149 const vector
<int>& want
,
1152 std::lock_guard
l(pg_temp_lock
);
1153 auto p
= pg_temp_pending
.find(pgid
);
1154 if (p
== pg_temp_pending
.end() ||
1155 p
->second
.acting
!= want
||
1157 pg_temp_wanted
[pgid
] = {want
, forced
};
1161 void OSDService::remove_want_pg_temp(pg_t pgid
)
1163 std::lock_guard
l(pg_temp_lock
);
1164 pg_temp_wanted
.erase(pgid
);
1165 pg_temp_pending
.erase(pgid
);
1168 void OSDService::_sent_pg_temp()
1170 #ifdef HAVE_STDLIB_MAP_SPLICING
1171 pg_temp_pending
.merge(pg_temp_wanted
);
1173 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1174 make_move_iterator(end(pg_temp_wanted
)));
1176 pg_temp_wanted
.clear();
1179 void OSDService::requeue_pg_temp()
1181 std::lock_guard
l(pg_temp_lock
);
1182 // wanted overrides pending. note that remove_want_pg_temp
1183 // clears the item out of both.
1184 unsigned old_wanted
= pg_temp_wanted
.size();
1185 unsigned old_pending
= pg_temp_pending
.size();
1187 pg_temp_wanted
.swap(pg_temp_pending
);
1188 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1189 << pg_temp_wanted
.size() << dendl
;
1192 std::ostream
& operator<<(std::ostream
& out
,
1193 const OSDService::pg_temp_t
& pg_temp
)
1195 out
<< pg_temp
.acting
;
1196 if (pg_temp
.forced
) {
1202 void OSDService::send_pg_temp()
1204 std::lock_guard
l(pg_temp_lock
);
1205 if (pg_temp_wanted
.empty())
1207 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1208 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1209 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1210 auto& m
= ms
[pg_temp
.forced
];
1212 m
= new MOSDPGTemp(osdmap
->get_epoch());
1213 m
->forced
= pg_temp
.forced
;
1215 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1219 monc
->send_mon_message(m
);
1225 void OSDService::send_pg_created(pg_t pgid
)
1227 std::lock_guard
l(pg_created_lock
);
1228 dout(20) << __func__
<< dendl
;
1229 auto o
= get_osdmap();
1230 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1231 pg_created
.insert(pgid
);
1232 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1236 void OSDService::send_pg_created()
1238 std::lock_guard
l(pg_created_lock
);
1239 dout(20) << __func__
<< dendl
;
1240 auto o
= get_osdmap();
1241 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1242 for (auto pgid
: pg_created
) {
1243 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1248 void OSDService::prune_pg_created()
1250 std::lock_guard
l(pg_created_lock
);
1251 dout(20) << __func__
<< dendl
;
1252 auto o
= get_osdmap();
1253 auto i
= pg_created
.begin();
1254 while (i
!= pg_created
.end()) {
1255 auto p
= o
->get_pg_pool(i
->pool());
1256 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1257 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1258 i
= pg_created
.erase(i
);
1260 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1267 // --------------------------------------
1270 bool OSDService::can_inc_scrubs()
1272 bool can_inc
= false;
1273 std::lock_guard
l(sched_scrub_lock
);
1275 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1276 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1277 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1280 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1281 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1287 bool OSDService::inc_scrubs_local()
1289 bool result
= false;
1290 std::lock_guard l
{sched_scrub_lock
};
1291 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1292 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1293 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1297 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1302 void OSDService::dec_scrubs_local()
1304 std::lock_guard l
{sched_scrub_lock
};
1305 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1306 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1308 ceph_assert(scrubs_local
>= 0);
1311 bool OSDService::inc_scrubs_remote()
1313 bool result
= false;
1314 std::lock_guard l
{sched_scrub_lock
};
1315 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1316 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1317 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1321 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1326 void OSDService::dec_scrubs_remote()
1328 std::lock_guard l
{sched_scrub_lock
};
1329 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1330 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1332 ceph_assert(scrubs_remote
>= 0);
1335 void OSDService::dump_scrub_reservations(Formatter
*f
)
1337 std::lock_guard l
{sched_scrub_lock
};
1338 f
->dump_int("scrubs_local", scrubs_local
);
1339 f
->dump_int("scrubs_remote", scrubs_remote
);
1340 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1343 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1344 epoch_t
*_bind_epoch
) const
1346 std::lock_guard
l(epoch_lock
);
1348 *_boot_epoch
= boot_epoch
;
1350 *_up_epoch
= up_epoch
;
1352 *_bind_epoch
= bind_epoch
;
1355 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1356 const epoch_t
*_bind_epoch
)
1358 std::lock_guard
l(epoch_lock
);
1360 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1361 boot_epoch
= *_boot_epoch
;
1364 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1365 up_epoch
= *_up_epoch
;
1368 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1369 bind_epoch
= *_bind_epoch
;
1373 bool OSDService::prepare_to_stop()
1375 std::unique_lock
l(is_stopping_lock
);
1376 if (get_state() != NOT_STOPPING
)
1379 OSDMapRef osdmap
= get_osdmap();
1380 if (osdmap
&& osdmap
->is_up(whoami
)) {
1381 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1382 set_state(PREPARING_TO_STOP
);
1383 monc
->send_mon_message(
1387 osdmap
->get_addrs(whoami
),
1388 osdmap
->get_epoch(),
1391 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1392 is_stopping_cond
.wait_for(l
, timeout
,
1393 [this] { return get_state() == STOPPING
; });
1395 dout(0) << __func__
<< " starting shutdown" << dendl
;
1396 set_state(STOPPING
);
1400 void OSDService::got_stop_ack()
1402 std::scoped_lock
l(is_stopping_lock
);
1403 if (get_state() == PREPARING_TO_STOP
) {
1404 dout(0) << __func__
<< " starting shutdown" << dendl
;
1405 set_state(STOPPING
);
1406 is_stopping_cond
.notify_all();
1408 dout(10) << __func__
<< " ignoring msg" << dendl
;
1412 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1413 OSDSuperblock
& sblock
)
1415 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1416 osdmap
->get_encoding_features());
1417 m
->oldest_map
= max_oldest_map
;
1418 m
->newest_map
= sblock
.newest_map
;
1420 int max
= cct
->_conf
->osd_map_message_max
;
1421 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1423 if (since
< m
->oldest_map
) {
1424 // we don't have the next map the target wants, so start with a
1427 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1428 << since
<< ", starting with full map" << dendl
;
1429 since
= m
->oldest_map
;
1430 if (!get_map_bl(since
, bl
)) {
1431 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1435 max_bytes
-= bl
.length();
1436 m
->maps
[since
] = std::move(bl
);
1438 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1440 if (get_inc_map_bl(e
, bl
)) {
1441 m
->incremental_maps
[e
] = std::move(bl
);
1443 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1444 if (!get_map_bl(e
, bl
)) {
1445 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1448 m
->maps
[e
] = std::move(bl
);
1451 max_bytes
-= bl
.length();
1452 if (max
<= 0 || max_bytes
<= 0) {
1459 if (!m
->maps
.empty() ||
1460 !m
->incremental_maps
.empty()) {
1461 // send what we have so far
1466 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1467 m
->incremental_maps
[m
->newest_map
] = std::move(bl
);
1469 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1470 if (!get_map_bl(m
->newest_map
, bl
)) {
1471 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1475 m
->maps
[m
->newest_map
] = std::move(bl
);
1480 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1482 con
->send_message(m
);
1485 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1486 const OSDMapRef
& osdmap
)
1488 epoch_t to
= osdmap
->get_epoch();
1489 dout(10) << "send_incremental_map " << since
<< " -> " << to
1490 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1494 OSDSuperblock
sblock(get_superblock());
1495 if (since
< sblock
.oldest_map
) {
1496 // just send latest full map
1497 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1498 osdmap
->get_encoding_features());
1499 m
->oldest_map
= max_oldest_map
;
1500 m
->newest_map
= sblock
.newest_map
;
1501 get_map_bl(to
, m
->maps
[to
]);
1506 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1507 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1508 << ", only sending most recent" << dendl
;
1509 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1512 m
= build_incremental_map_msg(since
, to
, sblock
);
1517 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1519 bool found
= map_bl_cache
.lookup(e
, &bl
);
1521 logger
->inc(l_osd_map_bl_cache_hit
);
1524 logger
->inc(l_osd_map_bl_cache_miss
);
1525 found
= store
->read(meta_ch
,
1526 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1527 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1534 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1536 std::lock_guard
l(map_cache_lock
);
1537 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1539 logger
->inc(l_osd_map_bl_cache_hit
);
1542 logger
->inc(l_osd_map_bl_cache_miss
);
1543 found
= store
->read(meta_ch
,
1544 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1545 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1547 _add_map_inc_bl(e
, bl
);
1552 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1554 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1555 // cache a contiguous buffer
1556 if (bl
.get_num_buffers() > 1) {
1559 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1560 map_bl_cache
.add(e
, bl
);
1563 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1565 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1566 // cache a contiguous buffer
1567 if (bl
.get_num_buffers() > 1) {
1570 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1571 map_bl_inc_cache
.add(e
, bl
);
1574 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1576 epoch_t e
= o
->get_epoch();
1578 if (cct
->_conf
->osd_map_dedup
) {
1579 // Dedup against an existing map at a nearby epoch
1580 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1582 OSDMap::dedup(for_dedup
.get(), o
);
1586 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1593 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1595 std::lock_guard
l(map_cache_lock
);
1596 OSDMapRef retval
= map_cache
.lookup(epoch
);
1598 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1599 logger
->inc(l_osd_map_cache_hit
);
1603 logger
->inc(l_osd_map_cache_miss
);
1604 epoch_t lb
= map_cache
.cached_key_lower_bound();
1606 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1607 logger
->inc(l_osd_map_cache_miss_low
);
1608 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1612 OSDMap
*map
= new OSDMap
;
1614 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1616 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1617 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1623 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1625 return _add_map(map
);
1631 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1633 reply_op_error(op
, err
, eversion_t(), 0, {});
1636 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1638 vector
<pg_log_op_return_item_t
> op_returns
)
1640 auto m
= op
->get_req
<MOSDOp
>();
1641 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1643 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1645 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1646 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1647 reply
->set_reply_versions(v
, uv
);
1648 reply
->set_op_returns(op_returns
);
1649 m
->get_connection()->send_message(reply
);
1652 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1654 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1658 auto m
= op
->get_req
<MOSDOp
>();
1659 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1661 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1663 if (pg
->is_ec_pg()) {
1665 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1666 * can get this result:
1667 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1668 * [CRUSH_ITEM_NONE, 2, 3]/3
1669 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1671 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1673 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1676 * We can't compute the op target based on the sending map epoch due to
1677 * splitting. The simplest thing is to detect such cases here and drop
1678 * them without an error (the client will resend anyway).
1680 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1681 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1683 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1684 << m
->get_map_epoch() << ", dropping" << dendl
;
1687 pg_t _pgid
= m
->get_raw_pg();
1689 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1690 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1691 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1692 pgid
.shard
!= pg
->pg_id
.shard
) {
1693 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1694 << m
->get_map_epoch() << ", dropping" << dendl
;
1699 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1700 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1701 << " pg " << m
->get_raw_pg()
1702 << " to osd." << whoami
1703 << " not " << pg
->get_acting()
1704 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1707 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1709 osd
->op_shardedwq
.queue(std::move(qi
));
1712 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1714 osd
->op_shardedwq
.queue_front(std::move(qi
));
1717 void OSDService::queue_recovery_context(
1719 GenContext
<ThreadPool::TPHandle
&> *c
)
1721 epoch_t e
= get_osdmap_epoch();
1724 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1725 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1726 cct
->_conf
->osd_recovery_cost
,
1727 cct
->_conf
->osd_recovery_priority
,
1733 void OSDService::queue_for_snap_trim(PG
*pg
)
1735 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1738 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1739 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1740 cct
->_conf
->osd_snap_trim_cost
,
1741 cct
->_conf
->osd_snap_trim_priority
,
1744 pg
->get_osdmap_epoch()));
1747 template <class MSG_TYPE
>
1748 void OSDService::queue_scrub_event_msg(PG
* pg
,
1749 Scrub::scrub_prio_t with_priority
,
1750 unsigned int qu_priority
)
1752 const auto epoch
= pg
->get_osdmap_epoch();
1753 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1754 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1756 enqueue_back(OpSchedulerItem(
1757 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1758 pg
->scrub_requeue_priority(with_priority
, qu_priority
), ceph_clock_now(), 0, epoch
));
1761 template <class MSG_TYPE
>
1762 void OSDService::queue_scrub_event_msg(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1764 const auto epoch
= pg
->get_osdmap_epoch();
1765 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1766 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1768 enqueue_back(OpSchedulerItem(
1769 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1770 pg
->scrub_requeue_priority(with_priority
), ceph_clock_now(), 0, epoch
));
1773 void OSDService::queue_for_scrub(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1775 queue_scrub_event_msg
<PGScrub
>(pg
, with_priority
);
1778 void OSDService::queue_scrub_after_repair(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1780 queue_scrub_event_msg
<PGScrubAfterRepair
>(pg
, with_priority
);
1783 void OSDService::queue_for_rep_scrub(PG
* pg
,
1784 Scrub::scrub_prio_t with_priority
,
1785 unsigned int qu_priority
)
1787 queue_scrub_event_msg
<PGRepScrub
>(pg
, with_priority
, qu_priority
);
1790 void OSDService::queue_for_rep_scrub_resched(PG
* pg
,
1791 Scrub::scrub_prio_t with_priority
,
1792 unsigned int qu_priority
)
1794 // Resulting scrub event: 'SchedReplica'
1795 queue_scrub_event_msg
<PGRepScrubResched
>(pg
, with_priority
, qu_priority
);
1798 void OSDService::queue_for_scrub_granted(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1800 // Resulting scrub event: 'RemotesReserved'
1801 queue_scrub_event_msg
<PGScrubResourcesOK
>(pg
, with_priority
);
1804 void OSDService::queue_for_scrub_denied(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1806 // Resulting scrub event: 'ReservationFailure'
1807 queue_scrub_event_msg
<PGScrubDenied
>(pg
, with_priority
);
1810 void OSDService::queue_for_scrub_resched(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1812 // Resulting scrub event: 'InternalSchedScrub'
1813 queue_scrub_event_msg
<PGScrubResched
>(pg
, with_priority
);
1816 void OSDService::queue_scrub_pushes_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1818 // Resulting scrub event: 'ActivePushesUpd'
1819 queue_scrub_event_msg
<PGScrubPushesUpdate
>(pg
, with_priority
);
1822 void OSDService::queue_scrub_applied_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1824 queue_scrub_event_msg
<PGScrubAppliedUpdate
>(pg
, with_priority
);
1827 void OSDService::queue_scrub_unblocking(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1829 // Resulting scrub event: 'Unblocked'
1830 queue_scrub_event_msg
<PGScrubUnblocked
>(pg
, with_priority
);
1833 void OSDService::queue_scrub_digest_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1835 // Resulting scrub event: 'DigestUpdate'
1836 queue_scrub_event_msg
<PGScrubDigestUpdate
>(pg
, with_priority
);
1839 void OSDService::queue_scrub_got_repl_maps(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1841 // Resulting scrub event: 'GotReplicas'
1842 queue_scrub_event_msg
<PGScrubGotReplMaps
>(pg
, with_priority
);
1845 void OSDService::queue_scrub_replica_pushes(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1847 // Resulting scrub event: 'ReplicaPushesUpd'
1848 queue_scrub_event_msg
<PGScrubReplicaPushes
>(pg
, with_priority
);
1851 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1853 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1856 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1857 new PGDelete(pgid
, e
)),
1858 cct
->_conf
->osd_pg_delete_cost
,
1859 cct
->_conf
->osd_pg_delete_priority
,
1865 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1867 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1872 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1874 std::lock_guard
l(merge_lock
);
1875 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1876 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1877 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1878 _send_ready_to_merge();
1881 void OSDService::set_ready_to_merge_target(PG
*pg
,
1883 epoch_t last_epoch_started
,
1884 epoch_t last_epoch_clean
)
1886 std::lock_guard
l(merge_lock
);
1887 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1888 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1891 last_epoch_clean
)));
1892 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1893 _send_ready_to_merge();
1896 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1898 std::lock_guard
l(merge_lock
);
1899 dout(10) << __func__
<< " " << source
<< dendl
;
1900 not_ready_to_merge_source
.insert(source
);
1901 assert(ready_to_merge_source
.count(source
) == 0);
1902 _send_ready_to_merge();
1905 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1907 std::lock_guard
l(merge_lock
);
1908 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1909 not_ready_to_merge_target
[target
] = source
;
1910 assert(ready_to_merge_target
.count(target
) == 0);
1911 _send_ready_to_merge();
1914 void OSDService::send_ready_to_merge()
1916 std::lock_guard
l(merge_lock
);
1917 _send_ready_to_merge();
1920 void OSDService::_send_ready_to_merge()
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1929 for (auto src
: not_ready_to_merge_source
) {
1930 if (sent_ready_to_merge_source
.count(src
) == 0) {
1931 monc
->send_mon_message(new MOSDPGReadyToMerge(
1935 osdmap
->get_epoch()));
1936 sent_ready_to_merge_source
.insert(src
);
1939 for (auto p
: not_ready_to_merge_target
) {
1940 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1941 monc
->send_mon_message(new MOSDPGReadyToMerge(
1945 osdmap
->get_epoch()));
1946 sent_ready_to_merge_source
.insert(p
.second
);
1949 for (auto src
: ready_to_merge_source
) {
1950 if (not_ready_to_merge_source
.count(src
.first
) ||
1951 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1954 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1955 if (p
!= ready_to_merge_target
.end() &&
1956 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1957 monc
->send_mon_message(new MOSDPGReadyToMerge(
1958 src
.first
, // source pgid
1959 src
.second
, // src version
1960 std::get
<0>(p
->second
), // target version
1961 std::get
<1>(p
->second
), // PG's last_epoch_started
1962 std::get
<2>(p
->second
), // PG's last_epoch_clean
1964 osdmap
->get_epoch()));
1965 sent_ready_to_merge_source
.insert(src
.first
);
1970 void OSDService::clear_ready_to_merge(PG
*pg
)
1972 std::lock_guard
l(merge_lock
);
1973 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1974 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1975 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1976 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1977 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1978 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1981 void OSDService::clear_sent_ready_to_merge()
1983 std::lock_guard
l(merge_lock
);
1984 sent_ready_to_merge_source
.clear();
1987 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1989 std::lock_guard
l(merge_lock
);
1990 auto i
= sent_ready_to_merge_source
.begin();
1991 while (i
!= sent_ready_to_merge_source
.end()) {
1992 if (!osdmap
->pg_exists(*i
)) {
1993 dout(10) << __func__
<< " " << *i
<< dendl
;
1994 i
= sent_ready_to_merge_source
.erase(i
);
2003 void OSDService::_queue_for_recovery(
2004 std::pair
<epoch_t
, PGRef
> p
,
2005 uint64_t reserved_pushes
)
2007 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
2010 unique_ptr
<OpSchedulerItem::OpQueueable
>(
2012 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
2013 cct
->_conf
->osd_recovery_cost
,
2014 cct
->_conf
->osd_recovery_priority
,
2020 // ====================================================================
2024 #define dout_prefix *_dout
2026 // Commands shared between OSD's console and admin console:
2027 namespace ceph::osd_cmds
{
2029 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
2031 } // namespace ceph::osd_cmds
2033 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
2039 ObjectStore::CollectionHandle ch
;
2041 // if we are fed a uuid for this osd, use it.
2042 store
->set_fsid(cct
->_conf
->osd_uuid
);
2044 ret
= store
->mkfs();
2046 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2047 << cpp_strerror(ret
) << dendl
;
2051 store
->set_cache_shards(1); // doesn't matter for mkfs!
2053 ret
= store
->mount();
2055 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2056 << cpp_strerror(ret
) << dendl
;
2060 ch
= store
->open_collection(coll_t::meta());
2062 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2064 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2067 /* if we already have superblock, check content of superblock */
2068 dout(0) << " have superblock" << dendl
;
2069 auto p
= sbbl
.cbegin();
2071 if (whoami
!= sb
.whoami
) {
2072 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2077 if (fsid
!= sb
.cluster_fsid
) {
2078 derr
<< "provided cluster fsid " << fsid
2079 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2084 // create superblock
2085 sb
.cluster_fsid
= fsid
;
2086 sb
.osd_fsid
= store
->get_fsid();
2088 sb
.compat_features
= get_osd_initial_compat_set();
2093 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2095 ObjectStore::Transaction t
;
2096 t
.create_collection(coll_t::meta(), 0);
2097 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2098 ret
= store
->queue_transaction(ch
, std::move(t
));
2100 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2101 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2107 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
2109 derr
<< "OSD::mkfs: failed to write fsid file: error "
2110 << cpp_strerror(ret
) << dendl
;
2124 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2129 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2130 r
= store
->write_meta("magic", val
);
2134 snprintf(val
, sizeof(val
), "%d", whoami
);
2135 r
= store
->write_meta("whoami", val
);
2139 cluster_fsid
.print(val
);
2140 r
= store
->write_meta("ceph_fsid", val
);
2144 string key
= cct
->_conf
.get_val
<string
>("key");
2146 r
= store
->write_meta("osd_key", key
);
2150 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2151 if (!keyfile
.empty()) {
2154 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2156 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2157 << err
<< ": " << cpp_strerror(r
) << dendl
;
2160 r
= store
->write_meta("osd_key", keybl
.to_str());
2165 if (!osdspec_affinity
.empty()) {
2166 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2171 r
= store
->write_meta("ready", "ready");
2178 int OSD::peek_meta(ObjectStore
*store
,
2180 uuid_d
*cluster_fsid
,
2183 ceph_release_t
*require_osd_release
)
2187 int r
= store
->read_meta("magic", &val
);
2192 r
= store
->read_meta("whoami", &val
);
2195 *whoami
= atoi(val
.c_str());
2197 r
= store
->read_meta("ceph_fsid", &val
);
2200 r
= cluster_fsid
->parse(val
.c_str());
2204 r
= store
->read_meta("fsid", &val
);
2206 *osd_fsid
= uuid_d();
2208 r
= osd_fsid
->parse(val
.c_str());
2213 r
= store
->read_meta("require_osd_release", &val
);
2215 *require_osd_release
= ceph_release_from_name(val
);
2223 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2227 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2229 Messenger
*internal_messenger
,
2230 Messenger
*external_messenger
,
2231 Messenger
*hb_client_front
,
2232 Messenger
*hb_client_back
,
2233 Messenger
*hb_front_serverm
,
2234 Messenger
*hb_back_serverm
,
2235 Messenger
*osdc_messenger
,
2237 const std::string
&dev
, const std::string
&jdev
,
2238 ceph::async::io_context_pool
& poolctx
) :
2240 tick_timer(cct
, osd_lock
),
2241 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2242 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2243 cluster_messenger(internal_messenger
),
2244 client_messenger(external_messenger
),
2245 objecter_messenger(osdc_messenger
),
2247 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2248 logger(create_logger()),
2249 recoverystate_perf(create_recoverystate_perf()),
2251 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2252 clog(log_client
.create_channel()),
2254 dev_path(dev
), journal_path(jdev
),
2255 store_is_rotational(store
->is_rotational()),
2256 trace_endpoint("0.0.0.0", 0, "osd"),
2258 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2259 "osd_pg_epoch_max_lag_factor")),
2260 osd_compat(get_osd_compat_set()),
2261 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2262 get_num_op_threads()),
2263 heartbeat_stop(false),
2264 heartbeat_need_update(true),
2265 hb_front_client_messenger(hb_client_front
),
2266 hb_back_client_messenger(hb_client_back
),
2267 hb_front_server_messenger(hb_front_serverm
),
2268 hb_back_server_messenger(hb_back_serverm
),
2270 heartbeat_thread(this),
2271 heartbeat_dispatcher(this),
2272 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2273 cct
->_conf
->osd_num_op_tracker_shard
),
2274 test_ops_hook(NULL
),
2277 ceph::make_timespan(cct
->_conf
->osd_op_thread_timeout
),
2278 ceph::make_timespan(cct
->_conf
->osd_op_thread_suicide_timeout
),
2280 last_pg_create_epoch(0),
2283 requested_full_first(0),
2284 requested_full_last(0),
2285 service(this, poolctx
)
2288 if (!gss_ktfile_client
.empty()) {
2289 // Assert we can export environment variable
2291 The default client keytab is used, if it is present and readable,
2292 to automatically obtain initial credentials for GSSAPI client
2293 applications. The principal name of the first entry in the client
2294 keytab is used by default when obtaining initial credentials.
2295 1. The KRB5_CLIENT_KTNAME environment variable.
2296 2. The default_client_keytab_name profile variable in [libdefaults].
2297 3. The hardcoded default, DEFCKTNAME.
2299 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2300 gss_ktfile_client
.c_str(), 1));
2301 ceph_assert(set_result
== 0);
2304 monc
->set_messenger(client_messenger
);
2305 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2306 cct
->_conf
->osd_op_log_threshold
);
2307 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2308 cct
->_conf
->osd_op_history_duration
);
2309 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2310 cct
->_conf
->osd_op_history_slow_op_threshold
);
2311 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2313 std::stringstream ss
;
2314 ss
<< "osd." << whoami
;
2315 trace_endpoint
.copy_name(ss
.str());
2318 // initialize shards
2319 num_shards
= get_num_op_shards();
2320 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2321 OSDShard
*one_shard
= new OSDShard(
2325 shards
.push_back(one_shard
);
2331 while (!shards
.empty()) {
2332 delete shards
.back();
2335 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2336 cct
->get_perfcounters_collection()->remove(logger
);
2337 delete recoverystate_perf
;
2342 double OSD::get_tick_interval() const
2344 // vary +/- 5% to avoid scrub scheduling livelocks
2345 constexpr auto delta
= 0.05;
2346 return (OSD_TICK_INTERVAL
*
2347 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2350 void OSD::handle_signal(int signum
)
2352 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2353 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2359 std::lock_guard
lock(osd_lock
);
2363 if (store
->test_mount_in_use()) {
2364 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2365 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2369 cct
->_conf
.add_observer(this);
2373 int OSD::set_numa_affinity()
2375 // storage numa node
2376 int store_node
= -1;
2377 store
->get_numa_node(&store_node
, nullptr, nullptr);
2378 if (store_node
>= 0) {
2379 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2382 // check network numa node(s)
2383 int front_node
= -1, back_node
= -1;
2384 string front_iface
= pick_iface(
2386 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2387 string back_iface
= pick_iface(
2389 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2390 int r
= get_iface_numa_node(front_iface
, &front_node
);
2391 if (r
>= 0 && front_node
>= 0) {
2392 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2393 << front_node
<< dendl
;
2394 r
= get_iface_numa_node(back_iface
, &back_node
);
2395 if (r
>= 0 && back_node
>= 0) {
2396 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2397 << back_node
<< dendl
;
2398 if (front_node
== back_node
&&
2399 front_node
== store_node
) {
2400 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2401 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2402 numa_node
= front_node
;
2404 } else if (front_node
!= back_node
) {
2405 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2408 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2411 } else if (back_node
== -2) {
2412 dout(1) << __func__
<< " cluster network " << back_iface
2413 << " ports numa nodes do not match" << dendl
;
2415 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2416 << "' numa node: " << cpp_strerror(r
) << dendl
;
2418 } else if (front_node
== -2) {
2419 dout(1) << __func__
<< " public network " << front_iface
2420 << " ports numa nodes do not match" << dendl
;
2422 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2423 << "' numa node: " << cpp_strerror(r
) << dendl
;
2425 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2426 // this takes precedence over the automagic logic above
2429 if (numa_node
>= 0) {
2430 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2432 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2433 << " CPUs" << dendl
;
2436 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2438 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2440 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2443 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2449 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2456 class OSDSocketHook
: public AdminSocketHook
{
2459 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2460 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2463 bufferlist
& out
) override
{
2464 ceph_abort("should use async hook");
2467 std::string_view prefix
,
2468 const cmdmap_t
& cmdmap
,
2470 const bufferlist
& inbl
,
2471 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2473 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2474 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2476 on_finish(-EINVAL
, e
.what(), empty
);
2481 std::set
<int64_t> OSD::get_mapped_pools()
2483 std::set
<int64_t> pools
;
2484 std::vector
<spg_t
> pgids
;
2486 for (const auto &pgid
: pgids
) {
2487 pools
.insert(pgid
.pool());
2492 void OSD::asok_command(
2493 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2495 const bufferlist
& inbl
,
2496 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2499 stringstream ss
; // stderr error message stream
2500 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2502 // --- PG commands are routed here to PG::do_command ---
2503 if (prefix
== "pg" ||
2504 prefix
== "query" ||
2505 prefix
== "mark_unfound_lost" ||
2506 prefix
== "list_unfound" ||
2507 prefix
== "scrub" ||
2508 prefix
== "deep_scrub"
2512 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2513 ss
<< "no pgid specified";
2517 if (!pgid
.parse(pgidstr
.c_str())) {
2518 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2524 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2525 (pg
= _lookup_lock_pg(pcand
))) {
2526 if (pg
->is_primary()) {
2527 cmdmap_t new_cmdmap
= cmdmap
;
2529 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2531 return; // the pg handler calls on_finish directly
2532 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2539 ss
<< "not primary for pgid " << pgid
;
2540 // do not reply; they will get newer maps and realize they
2547 ss
<< "i don't have pgid " << pgid
;
2552 // --- OSD commands follow ---
2554 else if (prefix
== "status") {
2555 lock_guard
l(osd_lock
);
2556 f
->open_object_section("status");
2557 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2558 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2559 f
->dump_unsigned("whoami", superblock
.whoami
);
2560 f
->dump_string("state", get_state_name(get_state()));
2561 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2562 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2563 f
->dump_unsigned("num_pgs", num_pgs
);
2565 } else if (prefix
== "flush_journal") {
2566 store
->flush_journal();
2567 } else if (prefix
== "dump_ops_in_flight" ||
2569 prefix
== "dump_blocked_ops" ||
2570 prefix
== "dump_historic_ops" ||
2571 prefix
== "dump_historic_ops_by_duration" ||
2572 prefix
== "dump_historic_slow_ops") {
2574 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2575 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2576 will start to track new ops received afterwards.";
2578 set
<string
> filters
;
2579 vector
<string
> filter_str
;
2580 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2581 copy(filter_str
.begin(), filter_str
.end(),
2582 inserter(filters
, filters
.end()));
2585 if (prefix
== "dump_ops_in_flight" ||
2587 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2593 if (prefix
== "dump_blocked_ops") {
2594 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2600 if (prefix
== "dump_historic_ops") {
2601 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2607 if (prefix
== "dump_historic_ops_by_duration") {
2608 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2614 if (prefix
== "dump_historic_slow_ops") {
2615 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2621 } else if (prefix
== "dump_op_pq_state") {
2622 f
->open_object_section("pq");
2623 op_shardedwq
.dump(f
);
2625 } else if (prefix
== "dump_blocklist") {
2626 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2627 OSDMapRef curmap
= service
.get_osdmap();
2629 f
->open_array_section("blocklist");
2630 curmap
->get_blocklist(&bl
);
2631 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2632 it
!= bl
.end(); ++it
) {
2633 f
->open_object_section("entry");
2634 f
->open_object_section("entity_addr_t");
2636 f
->close_section(); //entity_addr_t
2637 it
->second
.localtime(f
->dump_stream("expire_time"));
2638 f
->close_section(); //entry
2640 f
->close_section(); //blocklist
2641 } else if (prefix
== "dump_watchers") {
2642 list
<obj_watch_item_t
> watchers
;
2646 for (auto& pg
: pgs
) {
2647 list
<obj_watch_item_t
> pg_watchers
;
2648 pg
->get_watchers(&pg_watchers
);
2649 watchers
.splice(watchers
.end(), pg_watchers
);
2652 f
->open_array_section("watchers");
2653 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2654 it
!= watchers
.end(); ++it
) {
2656 f
->open_object_section("watch");
2658 f
->dump_string("namespace", it
->obj
.nspace
);
2659 f
->dump_string("object", it
->obj
.oid
.name
);
2661 f
->open_object_section("entity_name");
2662 it
->wi
.name
.dump(f
);
2663 f
->close_section(); //entity_name_t
2665 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2666 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2668 f
->open_object_section("entity_addr_t");
2669 it
->wi
.addr
.dump(f
);
2670 f
->close_section(); //entity_addr_t
2672 f
->close_section(); //watch
2675 f
->close_section(); //watchers
2676 } else if (prefix
== "dump_recovery_reservations") {
2677 f
->open_object_section("reservations");
2678 f
->open_object_section("local_reservations");
2679 service
.local_reserver
.dump(f
);
2681 f
->open_object_section("remote_reservations");
2682 service
.remote_reserver
.dump(f
);
2685 } else if (prefix
== "dump_scrub_reservations") {
2686 f
->open_object_section("scrub_reservations");
2687 service
.dump_scrub_reservations(f
);
2689 } else if (prefix
== "get_latest_osdmap") {
2690 get_latest_osdmap();
2691 } else if (prefix
== "set_heap_property") {
2695 bool success
= false;
2696 if (!cmd_getval(cmdmap
, "property", property
)) {
2697 error
= "unable to get property";
2699 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2700 error
= "unable to get value";
2702 } else if (value
< 0) {
2703 error
= "negative value not allowed";
2705 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2706 error
= "invalid property";
2711 f
->open_object_section("result");
2712 f
->dump_string("error", error
);
2713 f
->dump_bool("success", success
);
2715 } else if (prefix
== "get_heap_property") {
2719 bool success
= false;
2720 if (!cmd_getval(cmdmap
, "property", property
)) {
2721 error
= "unable to get property";
2723 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2724 error
= "invalid property";
2729 f
->open_object_section("result");
2730 f
->dump_string("error", error
);
2731 f
->dump_bool("success", success
);
2732 f
->dump_int("value", value
);
2734 } else if (prefix
== "dump_objectstore_kv_stats") {
2735 store
->get_db_statistics(f
);
2736 } else if (prefix
== "dump_scrubs") {
2737 service
.dumps_scrub(f
);
2738 } else if (prefix
== "calc_objectstore_db_histogram") {
2739 store
->generate_db_histogram(f
);
2740 } else if (prefix
== "flush_store_cache") {
2741 store
->flush_cache(&ss
);
2742 } else if (prefix
== "dump_pgstate_history") {
2743 f
->open_object_section("pgstate_history");
2744 f
->open_array_section("pgs");
2747 for (auto& pg
: pgs
) {
2748 f
->open_object_section("pg");
2749 f
->dump_stream("pg") << pg
->pg_id
;
2750 f
->dump_string("currently", pg
->get_current_state());
2751 pg
->dump_pgstate_history(f
);
2756 } else if (prefix
== "compact") {
2757 dout(1) << "triggering manual compaction" << dendl
;
2758 auto start
= ceph::coarse_mono_clock::now();
2760 auto end
= ceph::coarse_mono_clock::now();
2761 double duration
= std::chrono::duration
<double>(end
-start
).count();
2762 dout(1) << "finished manual compaction in "
2764 << " seconds" << dendl
;
2765 f
->open_object_section("compact_result");
2766 f
->dump_float("elapsed_time", duration
);
2768 } else if (prefix
== "get_mapped_pools") {
2769 f
->open_array_section("mapped_pools");
2770 set
<int64_t> poollist
= get_mapped_pools();
2771 for (auto pool
: poollist
) {
2772 f
->dump_int("pool_id", pool
);
2775 } else if (prefix
== "smart") {
2777 cmd_getval(cmdmap
, "devid", devid
);
2779 probe_smart(devid
, out
);
2780 outbl
.append(out
.str());
2781 } else if (prefix
== "list_devices") {
2782 set
<string
> devnames
;
2783 store
->get_devices(&devnames
);
2784 f
->open_array_section("list_devices");
2785 for (auto dev
: devnames
) {
2786 if (dev
.find("dm-") == 0) {
2790 f
->open_object_section("device");
2791 f
->dump_string("device", "/dev/" + dev
);
2792 f
->dump_string("device_id", get_device_id(dev
, &err
));
2796 } else if (prefix
== "send_beacon") {
2797 lock_guard
l(osd_lock
);
2799 send_beacon(ceph::coarse_mono_clock::now());
2803 else if (prefix
== "cluster_log") {
2805 cmd_getval(cmdmap
, "message", msg
);
2808 ss
<< "ignoring empty log message";
2811 string message
= msg
.front();
2812 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2813 message
+= " " + *a
;
2815 cmd_getval(cmdmap
, "level", lvl
);
2816 clog_type level
= string_to_clog_type(lvl
);
2819 ss
<< "unknown level '" << lvl
<< "'";
2822 clog
->do_log(level
, message
);
2825 else if (prefix
== "bench") {
2828 int64_t osize
, onum
;
2829 // default count 1G, size 4MB
2830 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2831 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2832 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2833 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2834 double elapsed
= 0.0;
2836 ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
2841 double rate
= count
/ elapsed
;
2842 double iops
= rate
/ bsize
;
2843 f
->open_object_section("osd_bench_results");
2844 f
->dump_int("bytes_written", count
);
2845 f
->dump_int("blocksize", bsize
);
2846 f
->dump_float("elapsed_sec", elapsed
);
2847 f
->dump_float("bytes_per_sec", rate
);
2848 f
->dump_float("iops", iops
);
2852 else if (prefix
== "flush_pg_stats") {
2853 mgrc
.send_pgstats();
2854 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2857 else if (prefix
== "heap") {
2858 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2861 else if (prefix
== "debug dump_missing") {
2862 f
->open_array_section("pgs");
2865 for (auto& pg
: pgs
) {
2866 string s
= stringify(pg
->pg_id
);
2867 f
->open_array_section(s
.c_str());
2869 pg
->dump_missing(f
);
2876 else if (prefix
== "debug kick_recovery_wq") {
2878 cmd_getval(cmdmap
, "delay", delay
);
2881 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2883 ss
<< "kick_recovery_wq: error setting "
2884 << "osd_recovery_delay_start to '" << delay
<< "': error "
2888 cct
->_conf
.apply_changes(nullptr);
2889 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2890 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2893 else if (prefix
== "cpu_profiler") {
2896 cmd_getval(cmdmap
, "arg", arg
);
2897 vector
<string
> argvec
;
2898 get_str_vec(arg
, argvec
);
2899 cpu_profiler_handle_command(argvec
, ds
);
2900 outbl
.append(ds
.str());
2903 else if (prefix
== "dump_pg_recovery_stats") {
2904 lock_guard
l(osd_lock
);
2905 pg_recovery_stats
.dump_formatted(f
);
2908 else if (prefix
== "reset_pg_recovery_stats") {
2909 lock_guard
l(osd_lock
);
2910 pg_recovery_stats
.reset();
2913 else if (prefix
== "perf histogram dump") {
2915 std::string counter
;
2916 cmd_getval(cmdmap
, "logger", logger
);
2917 cmd_getval(cmdmap
, "counter", counter
);
2918 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2919 f
, false, logger
, counter
);
2922 else if (prefix
== "cache drop") {
2923 lock_guard
l(osd_lock
);
2924 dout(20) << "clearing all caches" << dendl
;
2925 // Clear the objectstore's cache - onode and buffer for Bluestore,
2926 // system's pagecache for Filestore
2927 ret
= store
->flush_cache(&ss
);
2929 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2932 // Clear the objectcontext cache (per PG)
2935 for (auto& pg
: pgs
) {
2940 else if (prefix
== "cache status") {
2941 lock_guard
l(osd_lock
);
2942 int obj_ctx_count
= 0;
2945 for (auto& pg
: pgs
) {
2946 obj_ctx_count
+= pg
->get_cache_obj_count();
2948 f
->open_object_section("cache_status");
2949 f
->dump_int("object_ctx", obj_ctx_count
);
2950 store
->dump_cache_stats(f
);
2954 else if (prefix
== "scrub_purged_snaps") {
2955 lock_guard
l(osd_lock
);
2956 scrub_purged_snaps();
2959 else if (prefix
== "dump_osd_network") {
2960 lock_guard
l(osd_lock
);
2962 if (!(cmd_getval(cmdmap
, "value", value
))) {
2963 // Convert milliseconds to microseconds
2964 value
= static_cast<double>(g_conf().get_val
<double>(
2965 "mon_warn_on_slow_ping_time")) * 1000;
2967 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2968 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2969 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2972 // Convert user input to microseconds
2975 if (value
< 0) value
= 0;
2977 struct osd_ping_time_t
{
2981 std::array
<uint32_t,3> times
;
2982 std::array
<uint32_t,3> min
;
2983 std::array
<uint32_t,3> max
;
2985 uint32_t last_update
;
2987 bool operator<(const osd_ping_time_t
& rhs
) const {
2988 if (pingtime
< rhs
.pingtime
)
2990 if (pingtime
> rhs
.pingtime
)
3000 set
<osd_ping_time_t
> sorted
;
3001 // Get pingtimes under lock and not on the stack
3002 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3003 service
.get_hb_pingtime(pingtimes
);
3004 for (auto j
: *pingtimes
) {
3005 if (j
.second
.last_update
== 0)
3007 osd_ping_time_t item
;
3008 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3009 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3010 if (item
.pingtime
>= value
) {
3012 item
.times
[0] = j
.second
.back_pingtime
[0];
3013 item
.times
[1] = j
.second
.back_pingtime
[1];
3014 item
.times
[2] = j
.second
.back_pingtime
[2];
3015 item
.min
[0] = j
.second
.back_min
[0];
3016 item
.min
[1] = j
.second
.back_min
[1];
3017 item
.min
[2] = j
.second
.back_min
[2];
3018 item
.max
[0] = j
.second
.back_max
[0];
3019 item
.max
[1] = j
.second
.back_max
[1];
3020 item
.max
[2] = j
.second
.back_max
[2];
3021 item
.last
= j
.second
.back_last
;
3023 item
.last_update
= j
.second
.last_update
;
3024 sorted
.emplace(item
);
3026 if (j
.second
.front_last
== 0)
3028 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3029 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3030 if (item
.pingtime
>= value
) {
3032 item
.times
[0] = j
.second
.front_pingtime
[0];
3033 item
.times
[1] = j
.second
.front_pingtime
[1];
3034 item
.times
[2] = j
.second
.front_pingtime
[2];
3035 item
.min
[0] = j
.second
.front_min
[0];
3036 item
.min
[1] = j
.second
.front_min
[1];
3037 item
.min
[2] = j
.second
.front_min
[2];
3038 item
.max
[0] = j
.second
.front_max
[0];
3039 item
.max
[1] = j
.second
.front_max
[1];
3040 item
.max
[2] = j
.second
.front_max
[2];
3041 item
.last
= j
.second
.front_last
;
3042 item
.last_update
= j
.second
.last_update
;
3044 sorted
.emplace(item
);
3049 // Network ping times (1min 5min 15min)
3050 f
->open_object_section("network_ping_times");
3051 f
->dump_int("threshold", value
/ 1000);
3052 f
->open_array_section("entries");
3053 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3054 ceph_assert(sitem
.pingtime
>= value
);
3055 f
->open_object_section("entry");
3057 const time_t lu(sitem
.last_update
);
3059 string
lustr(ctime_r(&lu
, buffer
));
3060 lustr
.pop_back(); // Remove trailing \n
3061 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3062 f
->dump_string("last update", lustr
);
3063 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3064 f
->dump_int("from osd", whoami
);
3065 f
->dump_int("to osd", sitem
.to
);
3066 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3067 f
->open_object_section("average");
3068 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3069 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3070 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3071 f
->close_section(); // average
3072 f
->open_object_section("min");
3073 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3074 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3075 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3076 f
->close_section(); // min
3077 f
->open_object_section("max");
3078 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3079 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3080 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3081 f
->close_section(); // max
3082 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3083 f
->close_section(); // entry
3085 f
->close_section(); // entries
3086 f
->close_section(); // network_ping_times
3088 ceph_abort_msg("broken asok registration");
3092 on_finish(ret
, ss
.str(), outbl
);
3095 int OSD::run_osd_bench_test(
3104 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
3106 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
3107 // let us limit the block size because the next checks rely on it
3108 // having a sane value. If we allow any block size to be set things
3109 // can still go sideways.
3110 ss
<< "block 'size' values are capped at "
3111 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
3112 << " a higher value, please adjust 'osd_bench_max_block_size'";
3115 } else if (bsize
< (int64_t) (1 << 20)) {
3116 // entering the realm of small block sizes.
3117 // limit the count to a sane value, assuming a configurable amount of
3118 // IOPS and duration, so that the OSD doesn't get hung up on this,
3119 // preventing timeouts from going off
3121 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
3122 if (count
> max_count
) {
3123 ss
<< "'count' values greater than " << max_count
3124 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3125 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
3126 << " for " << duration
<< " seconds,"
3127 << " can cause ill effects on osd. "
3128 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3129 << " value if you wish to use a higher 'count'.";
3134 // 1MB block sizes are big enough so that we get more stuff done.
3135 // However, to avoid the osd from getting hung on this and having
3136 // timers being triggered, we are going to limit the count assuming
3137 // a configurable throughput and duration.
3138 // NOTE: max_count is the total amount of bytes that we believe we
3139 // will be able to write during 'duration' for the given
3140 // throughput. The block size hardly impacts this unless it's
3141 // way too big. Given we already check how big the block size
3142 // is, it's safe to assume everything will check out.
3144 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
3145 if (count
> max_count
) {
3146 ss
<< "'count' values greater than " << max_count
3147 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3148 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
3149 << " for " << duration
<< " seconds,"
3150 << " can cause ill effects on osd. "
3151 << " Please adjust 'osd_bench_large_size_max_throughput'"
3152 << " with a higher value if you wish to use a higher 'count'.";
3158 if (osize
&& bsize
> osize
) {
3162 dout(1) << " bench count " << count
3163 << " bsize " << byte_u_t(bsize
) << dendl
;
3165 ObjectStore::Transaction cleanupt
;
3167 if (osize
&& onum
) {
3169 bufferptr
bp(osize
);
3171 bl
.push_back(std::move(bp
));
3172 bl
.rebuild_page_aligned();
3173 for (int i
=0; i
<onum
; ++i
) {
3175 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
3177 hobject_t
soid(sobject_t(oid
, 0));
3178 ObjectStore::Transaction t
;
3179 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
3180 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3181 cleanupt
.remove(coll_t(), ghobject_t(soid
));
3186 bufferptr
bp(bsize
);
3188 bl
.push_back(std::move(bp
));
3189 bl
.rebuild_page_aligned();
3193 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3198 utime_t start
= ceph_clock_now();
3199 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
3201 unsigned offset
= 0;
3202 if (onum
&& osize
) {
3203 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
3204 offset
= rand() % (osize
/ bsize
) * bsize
;
3206 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
3209 hobject_t
soid(sobject_t(oid
, 0));
3210 ObjectStore::Transaction t
;
3211 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
3212 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3213 if (!onum
|| !osize
) {
3214 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
3220 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3224 utime_t end
= ceph_clock_now();
3225 *elapsed
= end
- start
;
3228 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), nullptr);
3231 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3239 class TestOpsSocketHook
: public AdminSocketHook
{
3240 OSDService
*service
;
3243 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3244 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3246 std::ostream
& errss
,
3247 bufferlist
& out
) override
{
3251 test_ops(service
, store
, command
, cmdmap
, outss
);
3253 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3259 void test_ops(OSDService
*service
, ObjectStore
*store
,
3260 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3264 class OSD::C_Tick
: public Context
{
3267 explicit C_Tick(OSD
*o
) : osd(o
) {}
3268 void finish(int r
) override
{
3273 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3276 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3277 void finish(int r
) override
{
3278 osd
->tick_without_osd_lock();
3282 int OSD::enable_disable_fuse(bool stop
)
3286 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3287 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3288 dout(1) << __func__
<< " disabling" << dendl
;
3292 r
= ::rmdir(mntpath
.c_str());
3295 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3296 << cpp_strerror(r
) << dendl
;
3301 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3302 dout(1) << __func__
<< " enabling" << dendl
;
3303 r
= ::mkdir(mntpath
.c_str(), 0700);
3306 if (r
< 0 && r
!= -EEXIST
) {
3307 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3308 << cpp_strerror(r
) << dendl
;
3311 fuse_store
= new FuseStore(store
, mntpath
);
3312 r
= fuse_store
->start();
3314 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3320 #endif // HAVE_LIBFUSE
3324 size_t OSD::get_num_cache_shards()
3326 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3329 int OSD::get_num_op_shards()
3331 if (cct
->_conf
->osd_op_num_shards
)
3332 return cct
->_conf
->osd_op_num_shards
;
3333 if (store_is_rotational
)
3334 return cct
->_conf
->osd_op_num_shards_hdd
;
3336 return cct
->_conf
->osd_op_num_shards_ssd
;
3339 int OSD::get_num_op_threads()
3341 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3342 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3343 if (store_is_rotational
)
3344 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3346 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3349 float OSD::get_osd_recovery_sleep()
3351 if (cct
->_conf
->osd_recovery_sleep
)
3352 return cct
->_conf
->osd_recovery_sleep
;
3353 if (!store_is_rotational
&& !journal_is_rotational
)
3354 return cct
->_conf
->osd_recovery_sleep_ssd
;
3355 else if (store_is_rotational
&& !journal_is_rotational
)
3356 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3358 return cct
->_conf
->osd_recovery_sleep_hdd
;
3361 float OSD::get_osd_delete_sleep()
3363 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3364 if (osd_delete_sleep
> 0)
3365 return osd_delete_sleep
;
3366 if (!store_is_rotational
&& !journal_is_rotational
)
3367 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3368 if (store_is_rotational
&& !journal_is_rotational
)
3369 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3370 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3373 int OSD::get_recovery_max_active()
3375 if (cct
->_conf
->osd_recovery_max_active
)
3376 return cct
->_conf
->osd_recovery_max_active
;
3377 if (store_is_rotational
)
3378 return cct
->_conf
->osd_recovery_max_active_hdd
;
3380 return cct
->_conf
->osd_recovery_max_active_ssd
;
3383 float OSD::get_osd_snap_trim_sleep()
3385 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3386 if (osd_snap_trim_sleep
> 0)
3387 return osd_snap_trim_sleep
;
3388 if (!store_is_rotational
&& !journal_is_rotational
)
3389 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3390 if (store_is_rotational
&& !journal_is_rotational
)
3391 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3392 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3398 CompatSet initial
, diff
;
3399 std::lock_guard
lock(osd_lock
);
3404 tick_timer_without_osd_lock
.init();
3405 service
.recovery_request_timer
.init();
3406 service
.sleep_timer
.init();
3408 boot_finisher
.start();
3412 store
->read_meta("require_osd_release", &val
);
3413 last_require_osd_release
= ceph_release_from_name(val
);
3417 dout(2) << "init " << dev_path
3418 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3420 dout(2) << "journal " << journal_path
<< dendl
;
3421 ceph_assert(store
); // call pre_init() first!
3423 store
->set_cache_shards(get_num_cache_shards());
3425 int r
= store
->mount();
3427 derr
<< "OSD:init: unable to mount object store" << dendl
;
3430 journal_is_rotational
= store
->is_journal_rotational();
3431 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3434 enable_disable_fuse(false);
3436 dout(2) << "boot" << dendl
;
3438 service
.meta_ch
= store
->open_collection(coll_t::meta());
3440 // initialize the daily loadavg with current 15min loadavg
3442 if (getloadavg(loadavgs
, 3) == 3) {
3443 daily_loadavg
= loadavgs
[2];
3445 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3446 daily_loadavg
= 1.0;
3449 int rotating_auth_attempts
= 0;
3450 auto rotating_auth_timeout
=
3451 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3453 // sanity check long object name handling
3456 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3457 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3458 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3459 r
= store
->validate_hobject_key(l
);
3461 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3462 << "object name[space] len" << dendl
;
3463 derr
<< " osd max object name len = "
3464 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3465 derr
<< " osd max object namespace len = "
3466 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3467 derr
<< cpp_strerror(r
) << dendl
;
3468 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3471 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3474 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3479 r
= read_superblock();
3481 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3486 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3487 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3488 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3489 derr
<< " daemon features " << osd_compat
<< dendl
;
3491 if (osd_compat
.writeable(superblock
.compat_features
)) {
3492 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3493 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3498 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3499 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3505 assert_warn(whoami
== superblock
.whoami
);
3506 if (whoami
!= superblock
.whoami
) {
3507 derr
<< "OSD::init: superblock says osd"
3508 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3513 startup_time
= ceph::mono_clock::now();
3515 // load up "current" osdmap
3516 assert_warn(!get_osdmap());
3518 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3522 osdmap
= get_map(superblock
.current_epoch
);
3525 // make sure we don't have legacy pgs deleting
3528 int r
= store
->list_collections(ls
);
3529 ceph_assert(r
>= 0);
3532 if (c
.is_pg(&pgid
) &&
3533 !osdmap
->have_pg_pool(pgid
.pool())) {
3534 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3535 if (!store
->exists(service
.meta_ch
, oid
)) {
3536 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3537 << pgid
.pool() << " for pg " << pgid
3538 << "; please downgrade to luminous and allow "
3539 << "pg deletion to complete before upgrading" << dendl
;
3546 initial
= get_osd_initial_compat_set();
3547 diff
= superblock
.compat_features
.unsupported(initial
);
3548 if (superblock
.compat_features
.merge(initial
)) {
3549 // Are we adding SNAPMAPPER2?
3550 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3551 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3553 auto ch
= service
.meta_ch
;
3554 auto hoid
= make_snapmapper_oid();
3555 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3556 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3560 // We need to persist the new compat_set before we
3562 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3563 ObjectStore::Transaction t
;
3564 write_superblock(t
);
3565 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3570 // make sure snap mapper object exists
3571 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3572 dout(10) << "init creating/touching snapmapper object" << dendl
;
3573 ObjectStore::Transaction t
;
3574 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3575 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3579 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3580 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3581 ObjectStore::Transaction t
;
3582 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3583 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3588 if (cct
->_conf
->osd_open_classes_on_start
) {
3589 int r
= ClassHandler::get_instance().open_all_classes();
3591 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3594 check_osdmap_features();
3597 epoch_t bind_epoch
= osdmap
->get_epoch();
3598 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3601 clear_temp_objects();
3603 // initialize osdmap references in sharded wq
3604 for (auto& shard
: shards
) {
3605 std::lock_guard
l(shard
->osdmap_lock
);
3606 shard
->shard_osdmap
= osdmap
;
3609 // load up pgs (as they previously existed)
3612 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3614 if (cct
->_conf
.get_val
<bool>("osd_compact_on_start")) {
3615 dout(2) << "compacting object store's omap" << dendl
;
3621 struct store_statfs_t stbuf
;
3622 osd_alert_list_t alerts
;
3623 int r
= store
->statfs(&stbuf
, &alerts
);
3624 ceph_assert(r
== 0);
3625 service
.set_statfs(stbuf
, alerts
);
3628 // client_messenger's auth_client will be set up by monc->init() later.
3629 for (auto m
: { cluster_messenger
,
3631 hb_front_client_messenger
,
3632 hb_back_client_messenger
,
3633 hb_front_server_messenger
,
3634 hb_back_server_messenger
} ) {
3635 m
->set_auth_client(monc
);
3637 for (auto m
: { client_messenger
,
3639 hb_front_server_messenger
,
3640 hb_back_server_messenger
}) {
3641 m
->set_auth_server(monc
);
3643 monc
->set_handle_authentication_dispatcher(this);
3645 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3646 | CEPH_ENTITY_TYPE_MGR
);
3651 mgrc
.set_pgstats_cb([this]() { return collect_pg_stats(); });
3652 mgrc
.set_perf_metric_query_cb(
3653 [this](const ConfigPayload
&config_payload
) {
3654 set_perf_queries(config_payload
);
3657 return get_perf_reports();
3661 // tell monc about log_client so it will know about mon session resets
3662 monc
->set_log_client(&log_client
);
3663 update_log_config();
3666 client_messenger
->add_dispatcher_tail(&mgrc
);
3667 client_messenger
->add_dispatcher_tail(this);
3668 cluster_messenger
->add_dispatcher_head(this);
3670 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3671 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3672 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3673 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3675 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3678 service
.publish_map(osdmap
);
3679 service
.publish_superblock(superblock
);
3680 service
.max_oldest_map
= superblock
.oldest_map
;
3682 for (auto& shard
: shards
) {
3683 // put PGs in a temporary set because we may modify pg_slots
3684 // unordered_map below.
3686 for (auto& i
: shard
->pg_slots
) {
3687 PGRef pg
= i
.second
->pg
;
3693 for (auto pg
: pgs
) {
3694 std::scoped_lock l
{*pg
};
3695 set
<pair
<spg_t
,epoch_t
>> new_children
;
3696 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3697 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3698 &new_children
, &merge_pgs
);
3699 if (!new_children
.empty()) {
3700 for (auto shard
: shards
) {
3701 shard
->prime_splits(osdmap
, &new_children
);
3703 assert(new_children
.empty());
3705 if (!merge_pgs
.empty()) {
3706 for (auto shard
: shards
) {
3707 shard
->prime_merges(osdmap
, &merge_pgs
);
3709 assert(merge_pgs
.empty());
3716 // start the heartbeat
3717 heartbeat_thread
.create("osd_srv_heartbt");
3720 tick_timer
.add_event_after(get_tick_interval(),
3723 std::lock_guard
l(tick_timer_lock
);
3724 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3725 new C_Tick_WithoutOSDLock(this));
3730 r
= monc
->authenticate();
3732 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3737 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3738 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3739 ++rotating_auth_attempts
;
3740 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3741 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3746 r
= update_crush_device_class();
3748 derr
<< __func__
<< " unable to update_crush_device_class: "
3749 << cpp_strerror(r
) << dendl
;
3753 r
= update_crush_location();
3755 derr
<< __func__
<< " unable to update_crush_location: "
3756 << cpp_strerror(r
) << dendl
;
3764 // start objecter *after* we have authenticated, so that we don't ignore
3765 // the OSDMaps it requests.
3766 service
.final_init();
3770 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3773 dout(0) << "done with init, starting boot process" << dendl
;
3775 // subscribe to any pg creations
3776 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3778 // MgrClient needs this (it doesn't have MonClient reference itself)
3779 monc
->sub_want("mgrmap", 0, 0);
3781 // we don't need to ask for an osdmap here; objecter will
3782 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3788 // Override a few options if mclock scheduler is enabled.
3789 maybe_override_max_osd_capacity_for_qos();
3790 maybe_override_options_for_qos();
3795 enable_disable_fuse(true);
3802 void OSD::final_init()
3804 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3805 asok_hook
= new OSDSocketHook(this);
3806 int r
= admin_socket
->register_command("status", asok_hook
,
3807 "high-level status of OSD");
3808 ceph_assert(r
== 0);
3809 r
= admin_socket
->register_command("flush_journal",
3811 "flush the journal to permanent store");
3812 ceph_assert(r
== 0);
3813 r
= admin_socket
->register_command("dump_ops_in_flight " \
3814 "name=filterstr,type=CephString,n=N,req=false",
3816 "show the ops currently in flight");
3817 ceph_assert(r
== 0);
3818 r
= admin_socket
->register_command("ops " \
3819 "name=filterstr,type=CephString,n=N,req=false",
3821 "show the ops currently in flight");
3822 ceph_assert(r
== 0);
3823 r
= admin_socket
->register_command("dump_blocked_ops " \
3824 "name=filterstr,type=CephString,n=N,req=false",
3826 "show the blocked ops currently in flight");
3827 ceph_assert(r
== 0);
3828 r
= admin_socket
->register_command("dump_historic_ops " \
3829 "name=filterstr,type=CephString,n=N,req=false",
3832 ceph_assert(r
== 0);
3833 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3834 "name=filterstr,type=CephString,n=N,req=false",
3836 "show slowest recent ops");
3837 ceph_assert(r
== 0);
3838 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3839 "name=filterstr,type=CephString,n=N,req=false",
3841 "show slowest recent ops, sorted by duration");
3842 ceph_assert(r
== 0);
3843 r
= admin_socket
->register_command("dump_op_pq_state",
3845 "dump op priority queue state");
3846 ceph_assert(r
== 0);
3847 r
= admin_socket
->register_command("dump_blocklist",
3849 "dump blocklisted clients and times");
3850 ceph_assert(r
== 0);
3851 r
= admin_socket
->register_command("dump_watchers",
3853 "show clients which have active watches,"
3854 " and on which objects");
3855 ceph_assert(r
== 0);
3856 r
= admin_socket
->register_command("dump_recovery_reservations",
3858 "show recovery reservations");
3859 ceph_assert(r
== 0);
3860 r
= admin_socket
->register_command("dump_scrub_reservations",
3862 "show scrub reservations");
3863 ceph_assert(r
== 0);
3864 r
= admin_socket
->register_command("get_latest_osdmap",
3866 "force osd to update the latest map from "
3868 ceph_assert(r
== 0);
3870 r
= admin_socket
->register_command("set_heap_property " \
3871 "name=property,type=CephString " \
3872 "name=value,type=CephInt",
3874 "update malloc extension heap property");
3875 ceph_assert(r
== 0);
3877 r
= admin_socket
->register_command("get_heap_property " \
3878 "name=property,type=CephString",
3880 "get malloc extension heap property");
3881 ceph_assert(r
== 0);
3883 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3885 "print statistics of kvdb which used by bluestore");
3886 ceph_assert(r
== 0);
3888 r
= admin_socket
->register_command("dump_scrubs",
3890 "print scheduled scrubs");
3891 ceph_assert(r
== 0);
3893 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3895 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3896 ceph_assert(r
== 0);
3898 r
= admin_socket
->register_command("flush_store_cache",
3900 "Flush bluestore internal cache");
3901 ceph_assert(r
== 0);
3902 r
= admin_socket
->register_command("dump_pgstate_history",
3904 "show recent state history");
3905 ceph_assert(r
== 0);
3907 r
= admin_socket
->register_command("compact",
3909 "Commpact object store's omap."
3910 " WARNING: Compaction probably slows your requests");
3911 ceph_assert(r
== 0);
3913 r
= admin_socket
->register_command("get_mapped_pools",
3915 "dump pools whose PG(s) are mapped to this OSD.");
3917 ceph_assert(r
== 0);
3919 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3921 "probe OSD devices for SMART data.");
3923 ceph_assert(r
== 0);
3925 r
= admin_socket
->register_command("list_devices",
3927 "list OSD devices.");
3928 r
= admin_socket
->register_command("send_beacon",
3930 "send OSD beacon to mon immediately");
3932 r
= admin_socket
->register_command(
3933 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3934 "Dump osd heartbeat network ping times");
3935 ceph_assert(r
== 0);
3937 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3938 // Note: pools are CephString instead of CephPoolname because
3939 // these commands traditionally support both pool names and numbers
3940 r
= admin_socket
->register_command(
3942 "name=pool,type=CephString " \
3943 "name=objname,type=CephObjectname " \
3944 "name=key,type=CephString "\
3945 "name=val,type=CephString",
3948 ceph_assert(r
== 0);
3949 r
= admin_socket
->register_command(
3951 "name=pool,type=CephString " \
3952 "name=objname,type=CephObjectname " \
3953 "name=key,type=CephString",
3956 ceph_assert(r
== 0);
3957 r
= admin_socket
->register_command(
3959 "name=pool,type=CephString " \
3960 "name=objname,type=CephObjectname " \
3961 "name=header,type=CephString",
3964 ceph_assert(r
== 0);
3966 r
= admin_socket
->register_command(
3968 "name=pool,type=CephString " \
3969 "name=objname,type=CephObjectname",
3971 "output entire object map");
3972 ceph_assert(r
== 0);
3974 r
= admin_socket
->register_command(
3976 "name=pool,type=CephString " \
3977 "name=objname,type=CephObjectname " \
3978 "name=len,type=CephInt",
3980 "truncate object to length");
3981 ceph_assert(r
== 0);
3983 r
= admin_socket
->register_command(
3985 "name=pool,type=CephString " \
3986 "name=objname,type=CephObjectname " \
3987 "name=shardid,type=CephInt,req=false,range=0|255",
3989 "inject data error to an object");
3990 ceph_assert(r
== 0);
3992 r
= admin_socket
->register_command(
3994 "name=pool,type=CephString " \
3995 "name=objname,type=CephObjectname " \
3996 "name=shardid,type=CephInt,req=false,range=0|255",
3998 "inject metadata error to an object");
3999 ceph_assert(r
== 0);
4000 r
= admin_socket
->register_command(
4001 "set_recovery_delay " \
4002 "name=utime,type=CephInt,req=false",
4004 "Delay osd recovery by specified seconds");
4005 ceph_assert(r
== 0);
4006 r
= admin_socket
->register_command(
4008 "name=type,type=CephString,req=false " \
4009 "name=count,type=CephInt,req=false ",
4011 "Inject a full disk (optional count times)");
4012 ceph_assert(r
== 0);
4013 r
= admin_socket
->register_command(
4015 "name=count,type=CephInt,req=false " \
4016 "name=size,type=CephInt,req=false " \
4017 "name=object_size,type=CephInt,req=false " \
4018 "name=object_num,type=CephInt,req=false ",
4020 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4021 "(default count=1G default size=4MB). Results in log.");
4022 ceph_assert(r
== 0);
4023 r
= admin_socket
->register_command(
4025 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4026 "name=message,type=CephString,n=N",
4028 "log a message to the cluster log");
4029 ceph_assert(r
== 0);
4030 r
= admin_socket
->register_command(
4034 ceph_assert(r
== 0);
4035 r
= admin_socket
->register_command(
4037 "name=heapcmd,type=CephChoices,strings=" \
4038 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4039 "name=value,type=CephString,req=false",
4041 "show heap usage info (available only if compiled with tcmalloc)");
4042 ceph_assert(r
== 0);
4043 r
= admin_socket
->register_command(
4044 "debug dump_missing " \
4045 "name=filename,type=CephFilepath",
4047 "dump missing objects to a named file");
4048 ceph_assert(r
== 0);
4049 r
= admin_socket
->register_command(
4050 "debug kick_recovery_wq " \
4051 "name=delay,type=CephInt,range=0",
4053 "set osd_recovery_delay_start to <val>");
4054 ceph_assert(r
== 0);
4055 r
= admin_socket
->register_command(
4057 "name=arg,type=CephChoices,strings=status|flush",
4059 "run cpu profiling on daemon");
4060 ceph_assert(r
== 0);
4061 r
= admin_socket
->register_command(
4062 "dump_pg_recovery_stats",
4064 "dump pg recovery statistics");
4065 ceph_assert(r
== 0);
4066 r
= admin_socket
->register_command(
4067 "reset_pg_recovery_stats",
4069 "reset pg recovery statistics");
4070 ceph_assert(r
== 0);
4071 r
= admin_socket
->register_command(
4074 "Drop all OSD caches");
4075 ceph_assert(r
== 0);
4076 r
= admin_socket
->register_command(
4079 "Get OSD caches statistics");
4080 ceph_assert(r
== 0);
4081 r
= admin_socket
->register_command(
4082 "scrub_purged_snaps",
4084 "Scrub purged_snaps vs snapmapper index");
4085 ceph_assert(r
== 0);
4087 // -- pg commands --
4088 // old form: ceph pg <pgid> command ...
4089 r
= admin_socket
->register_command(
4091 "name=pgid,type=CephPgid " \
4092 "name=cmd,type=CephChoices,strings=query",
4095 ceph_assert(r
== 0);
4096 r
= admin_socket
->register_command(
4098 "name=pgid,type=CephPgid " \
4099 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4100 "name=mulcmd,type=CephChoices,strings=revert|delete",
4103 ceph_assert(r
== 0);
4104 r
= admin_socket
->register_command(
4106 "name=pgid,type=CephPgid " \
4107 "name=cmd,type=CephChoices,strings=list_unfound " \
4108 "name=offset,type=CephString,req=false",
4111 ceph_assert(r
== 0);
4112 r
= admin_socket
->register_command(
4114 "name=pgid,type=CephPgid " \
4115 "name=cmd,type=CephChoices,strings=scrub " \
4116 "name=time,type=CephInt,req=false",
4119 ceph_assert(r
== 0);
4120 r
= admin_socket
->register_command(
4122 "name=pgid,type=CephPgid " \
4123 "name=cmd,type=CephChoices,strings=deep_scrub " \
4124 "name=time,type=CephInt,req=false",
4127 ceph_assert(r
== 0);
4128 // new form: tell <pgid> <cmd> for both cli and rest
4129 r
= admin_socket
->register_command(
4132 "show details of a specific pg");
4133 ceph_assert(r
== 0);
4134 r
= admin_socket
->register_command(
4135 "mark_unfound_lost " \
4136 "name=pgid,type=CephPgid,req=false " \
4137 "name=mulcmd,type=CephChoices,strings=revert|delete",
4139 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4140 ceph_assert(r
== 0);
4141 r
= admin_socket
->register_command(
4143 "name=pgid,type=CephPgid,req=false " \
4144 "name=offset,type=CephString,req=false",
4146 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4147 ceph_assert(r
== 0);
4148 r
= admin_socket
->register_command(
4150 "name=pgid,type=CephPgid,req=false " \
4151 "name=time,type=CephInt,req=false",
4153 "Trigger a scheduled scrub ");
4154 ceph_assert(r
== 0);
4155 r
= admin_socket
->register_command(
4157 "name=pgid,type=CephPgid,req=false " \
4158 "name=time,type=CephInt,req=false",
4160 "Trigger a scheduled deep scrub ");
4161 ceph_assert(r
== 0);
4164 PerfCounters
* OSD::create_logger()
4166 PerfCounters
* logger
= build_osd_logger(cct
);
4167 cct
->get_perfcounters_collection()->add(logger
);
4171 PerfCounters
* OSD::create_recoverystate_perf()
4173 PerfCounters
* recoverystate_perf
= build_recoverystate_perf(cct
);
4174 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4175 return recoverystate_perf
;
4180 if (cct
->_conf
->osd_fast_shutdown
) {
4181 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4182 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4183 service
.prepare_to_stop();
4188 if (!service
.prepare_to_stop())
4189 return 0; // already shutting down
4191 if (is_stopping()) {
4195 dout(0) << "shutdown" << dendl
;
4197 set_state(STATE_STOPPING
);
4200 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4201 cct
->_conf
.set_val("debug_osd", "100");
4202 cct
->_conf
.set_val("debug_journal", "100");
4203 cct
->_conf
.set_val("debug_filestore", "100");
4204 cct
->_conf
.set_val("debug_bluestore", "100");
4205 cct
->_conf
.set_val("debug_ms", "100");
4206 cct
->_conf
.apply_changes(nullptr);
4209 // stop MgrClient earlier as it's more like an internal consumer of OSD
4212 service
.start_shutdown();
4214 // stop sending work to pgs. this just prevents any new work in _process
4215 // from racing with on_shutdown and potentially entering the pg after.
4216 op_shardedwq
.drain();
4222 for (auto pg
: pgs
) {
4227 // drain op queue again (in case PGs requeued something)
4228 op_shardedwq
.drain();
4230 finished
.clear(); // zap waiters (bleh, this is messy)
4231 waiting_for_osdmap
.clear();
4234 // unregister commands
4235 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4239 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4240 delete test_ops_hook
;
4241 test_ops_hook
= NULL
;
4246 std::lock_guard l
{heartbeat_lock
};
4247 heartbeat_stop
= true;
4248 heartbeat_cond
.notify_all();
4249 heartbeat_peers
.clear();
4251 heartbeat_thread
.join();
4253 hb_back_server_messenger
->mark_down_all();
4254 hb_front_server_messenger
->mark_down_all();
4255 hb_front_client_messenger
->mark_down_all();
4256 hb_back_client_messenger
->mark_down_all();
4260 dout(10) << "op sharded tp stopped" << dendl
;
4262 dout(10) << "stopping agent" << dendl
;
4263 service
.agent_stop();
4265 boot_finisher
.wait_for_empty();
4269 boot_finisher
.stop();
4270 reset_heartbeat_peers(true);
4272 tick_timer
.shutdown();
4275 std::lock_guard
l(tick_timer_lock
);
4276 tick_timer_without_osd_lock
.shutdown();
4279 // note unmount epoch
4280 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4281 superblock
.mounted
= service
.get_boot_epoch();
4282 superblock
.clean_thru
= get_osdmap_epoch();
4283 ObjectStore::Transaction t
;
4284 write_superblock(t
);
4285 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4287 derr
<< "OSD::shutdown: error writing superblock: "
4288 << cpp_strerror(r
) << dendl
;
4292 service
.shutdown_reserver();
4295 #ifdef PG_DEBUG_REFS
4296 service
.dump_live_pgids();
4300 _get_pgs(&pgs
, true);
4304 for (auto& pg
: pgs
) {
4305 if (pg
->is_deleted()) {
4308 dout(20) << " kicking pg " << pg
<< dendl
;
4310 if (pg
->get_num_ref() != 1) {
4311 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4312 << pg
->get_num_ref() << dendl
;
4313 #ifdef PG_DEBUG_REFS
4314 pg
->dump_live_ids();
4316 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4324 #ifdef PG_DEBUG_REFS
4325 service
.dump_live_pgids();
4329 cct
->_conf
.remove_observer(this);
4332 service
.meta_ch
.reset();
4334 dout(10) << "syncing store" << dendl
;
4335 enable_disable_fuse(true);
4337 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4338 dout(10) << "flushing journal" << dendl
;
4339 store
->flush_journal();
4345 std::unique_lock l
{map_lock
};
4346 set_osdmap(OSDMapRef());
4348 for (auto s
: shards
) {
4349 std::lock_guard
l(s
->osdmap_lock
);
4350 s
->shard_osdmap
= OSDMapRef();
4354 std::lock_guard
lock(osd_lock
);
4358 dout(10) << "Store synced" << dendl
;
4360 op_tracker
.on_shutdown();
4362 ClassHandler::get_instance().shutdown();
4363 client_messenger
->shutdown();
4364 cluster_messenger
->shutdown();
4365 hb_front_client_messenger
->shutdown();
4366 hb_back_client_messenger
->shutdown();
4367 objecter_messenger
->shutdown();
4368 hb_front_server_messenger
->shutdown();
4369 hb_back_server_messenger
->shutdown();
4374 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4376 bool created
= false;
4378 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4379 vector
<string
> vcmd
{cmd
};
4383 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4386 if (r
== -ENOENT
&& !created
) {
4387 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4388 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4389 vector
<string
> vnewcmd
{newcmd
};
4393 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4396 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4397 << cpp_strerror(r
) << dendl
;
4403 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4412 int OSD::update_crush_location()
4414 if (!cct
->_conf
->osd_crush_update_on_start
) {
4415 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4420 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4421 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4423 struct store_statfs_t st
;
4424 osd_alert_list_t alerts
;
4425 int r
= store
->statfs(&st
, &alerts
);
4427 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4430 snprintf(weight
, sizeof(weight
), "%.4lf",
4433 double(1ull << 40 /* TB */)));
4436 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4439 string("{\"prefix\": \"osd crush create-or-move\", ") +
4440 string("\"id\": ") + stringify(whoami
) + ", " +
4441 string("\"weight\":") + weight
+ ", " +
4442 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4443 return mon_cmd_maybe_osd_create(cmd
);
4446 int OSD::update_crush_device_class()
4448 if (!cct
->_conf
->osd_class_update_on_start
) {
4449 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4453 string device_class
;
4454 int r
= store
->read_meta("crush_device_class", &device_class
);
4455 if (r
< 0 || device_class
.empty()) {
4456 device_class
= store
->get_default_device_class();
4459 if (device_class
.empty()) {
4460 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4465 string("{\"prefix\": \"osd crush set-device-class\", ") +
4466 string("\"class\": \"") + device_class
+ string("\", ") +
4467 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4469 r
= mon_cmd_maybe_osd_create(cmd
);
4471 // good, already bound to a device-class
4478 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4480 dout(10) << "write_superblock " << superblock
<< dendl
;
4482 //hack: at minimum it's using the baseline feature set
4483 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4484 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4487 encode(superblock
, bl
);
4488 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4491 int OSD::read_superblock()
4494 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4498 auto p
= bl
.cbegin();
4499 decode(superblock
, p
);
4501 dout(10) << "read_superblock " << superblock
<< dendl
;
4506 void OSD::clear_temp_objects()
4508 dout(10) << __func__
<< dendl
;
4510 store
->list_collections(ls
);
4511 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4513 if (!p
->is_pg(&pgid
))
4516 // list temp objects
4517 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4519 vector
<ghobject_t
> temps
;
4522 vector
<ghobject_t
> objects
;
4523 auto ch
= store
->open_collection(*p
);
4525 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4526 store
->get_ideal_list_max(),
4528 if (objects
.empty())
4530 vector
<ghobject_t
>::iterator q
;
4531 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4532 // Hammer set pool for temps to -1, so check for clean-up
4533 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4534 temps
.push_back(*q
);
4539 // If we saw a non-temp object and hit the break above we can
4540 // break out of the while loop too.
4541 if (q
!= objects
.end())
4544 if (!temps
.empty()) {
4545 ObjectStore::Transaction t
;
4547 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4548 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4550 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4551 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4552 t
= ObjectStore::Transaction();
4557 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4563 void OSD::recursive_remove_collection(CephContext
* cct
,
4564 ObjectStore
*store
, spg_t pgid
,
4570 make_snapmapper_oid());
4572 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4573 ObjectStore::Transaction t
;
4574 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4577 int max
= cct
->_conf
->osd_target_transaction_size
;
4578 vector
<ghobject_t
> objects
;
4579 objects
.reserve(max
);
4582 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4583 max
, &objects
, &next
);
4584 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4585 if (objects
.empty())
4587 for (auto& p
: objects
) {
4588 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4589 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4590 if (r
!= 0 && r
!= -ENOENT
)
4594 int r
= store
->queue_transaction(ch
, std::move(t
));
4595 ceph_assert(r
== 0);
4596 t
= ObjectStore::Transaction();
4598 t
.remove_collection(tmp
);
4599 int r
= store
->queue_transaction(ch
, std::move(t
));
4600 ceph_assert(r
== 0);
4603 if (!ch
->flush_commit(&waiter
)) {
4609 // ======================================================
4613 OSDMapRef createmap
,
4616 dout(10) << __func__
<< " " << pgid
<< dendl
;
4618 map
<string
,string
> ec_profile
;
4620 if (createmap
->have_pg_pool(pgid
.pool())) {
4621 pi
= *createmap
->get_pg_pool(pgid
.pool());
4622 name
= createmap
->get_pool_name(pgid
.pool());
4623 if (pi
.is_erasure()) {
4624 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4627 // pool was deleted; grab final pg_pool_t off disk.
4628 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4630 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4632 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4636 ceph_assert(r
>= 0);
4637 auto p
= bl
.cbegin();
4640 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4641 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4642 << " tombstone" << dendl
;
4645 decode(ec_profile
, p
);
4647 PGPool
pool(createmap
, pgid
.pool(), pi
, name
);
4649 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4650 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4651 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4657 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4660 v
->reserve(get_num_pgs());
4661 for (auto& s
: shards
) {
4662 std::lock_guard
l(s
->shard_lock
);
4663 for (auto& j
: s
->pg_slots
) {
4665 !j
.second
->pg
->is_deleted()) {
4666 v
->push_back(j
.second
->pg
);
4668 s
->_detach_pg(j
.second
.get());
4675 void OSD::_get_pgids(vector
<spg_t
> *v
)
4678 v
->reserve(get_num_pgs());
4679 for (auto& s
: shards
) {
4680 std::lock_guard
l(s
->shard_lock
);
4681 for (auto& j
: s
->pg_slots
) {
4683 !j
.second
->pg
->is_deleted()) {
4684 v
->push_back(j
.first
);
4690 void OSD::register_pg(PGRef pg
)
4692 spg_t pgid
= pg
->get_pgid();
4693 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4694 auto sdata
= shards
[shard_index
];
4695 std::lock_guard
l(sdata
->shard_lock
);
4696 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4697 ceph_assert(r
.second
);
4698 auto *slot
= r
.first
->second
.get();
4699 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4700 sdata
->_attach_pg(slot
, pg
.get());
4703 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4705 auto sdata
= pg
->osd_shard
;
4708 std::lock_guard
l(sdata
->shard_lock
);
4709 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4710 if (p
== sdata
->pg_slots
.end() ||
4712 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4715 if (p
->second
->waiting_for_merge_epoch
) {
4716 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4719 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4720 sdata
->_detach_pg(p
->second
.get());
4723 for (auto shard
: shards
) {
4724 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4727 // update pg count now since we might not get an osdmap any time soon.
4728 if (pg
->is_primary())
4729 service
.logger
->dec(l_osd_pg_primary
);
4730 else if (pg
->is_nonprimary())
4731 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4733 service
.logger
->dec(l_osd_pg_stray
);
4738 PGRef
OSD::_lookup_pg(spg_t pgid
)
4740 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4741 auto sdata
= shards
[shard_index
];
4742 std::lock_guard
l(sdata
->shard_lock
);
4743 auto p
= sdata
->pg_slots
.find(pgid
);
4744 if (p
== sdata
->pg_slots
.end()) {
4747 return p
->second
->pg
;
4750 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4752 PGRef pg
= _lookup_pg(pgid
);
4757 if (!pg
->is_deleted()) {
4764 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4766 return _lookup_lock_pg(pgid
);
4769 void OSD::load_pgs()
4771 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4772 dout(0) << "load_pgs" << dendl
;
4775 auto pghist
= make_pg_num_history_oid();
4777 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4778 if (r
>= 0 && bl
.length() > 0) {
4779 auto p
= bl
.cbegin();
4780 decode(pg_num_history
, p
);
4782 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4786 int r
= store
->list_collections(ls
);
4788 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4792 for (vector
<coll_t
>::iterator it
= ls
.begin();
4796 if (it
->is_temp(&pgid
) ||
4797 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4798 dout(10) << "load_pgs " << *it
4799 << " removing, legacy or flagged for removal pg" << dendl
;
4800 recursive_remove_collection(cct
, store
, pgid
, *it
);
4804 if (!it
->is_pg(&pgid
)) {
4805 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4809 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4810 epoch_t map_epoch
= 0;
4811 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4813 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4819 if (map_epoch
> 0) {
4820 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4822 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4823 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4824 << " on pg " << pgid
<< ", but the pool is not present in the "
4825 << "current map, so this is probably a result of bug 10617. "
4826 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4827 << "to clean it up later." << dendl
;
4830 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4831 << map_epoch
<< ", but missing map. Crashing."
4833 ceph_abort_msg("Missing map in load_pgs");
4836 pg
= _make_pg(pgosdmap
, pgid
);
4838 pg
= _make_pg(get_osdmap(), pgid
);
4841 recursive_remove_collection(cct
, store
, pgid
, *it
);
4845 // there can be no waiters here, so we don't call _wake_pg_slot
4848 pg
->ch
= store
->open_collection(pg
->coll
);
4850 // read pg state, log
4851 pg
->read_state(store
);
4854 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4857 recursive_remove_collection(cct
, store
, pgid
, *it
);
4861 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4862 assert(NULL
!= shards
[shard_index
]);
4863 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4866 pg
->reg_next_scrub();
4868 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4874 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4878 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4879 const PGCreateInfo
*info
)
4881 spg_t pgid
= info
->pgid
;
4883 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4884 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4888 PeeringCtx rctx
= create_context();
4890 OSDMapRef startmap
= get_map(info
->epoch
);
4893 int64_t pool_id
= pgid
.pgid
.pool();
4894 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4896 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4899 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4900 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4901 // this ensures we do not process old creating messages after the
4902 // pool's initial pgs have been created (and pg are subsequently
4903 // allowed to split or merge).
4904 dout(20) << __func__
<< " dropping " << pgid
4905 << "create, pool does not have CREATING flag set" << dendl
;
4910 int up_primary
, acting_primary
;
4911 vector
<int> up
, acting
;
4912 startmap
->pg_to_up_acting_osds(
4913 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4915 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4916 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4917 store
->get_type() != "bluestore") {
4918 clog
->warn() << "pg " << pgid
4919 << " is at risk of silent data corruption: "
4920 << "the pool allows ec overwrites but is not stored in "
4921 << "bluestore, so deep scrubbing will not detect bitrot";
4923 create_pg_collection(
4924 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4925 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4927 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4929 PGRef pg
= _make_pg(startmap
, pgid
);
4930 pg
->ch
= store
->create_new_collection(pg
->coll
);
4933 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4934 assert(NULL
!= shards
[shard_index
]);
4935 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4940 // we are holding the shard lock
4941 ceph_assert(!pg
->is_deleted());
4950 info
->past_intervals
,
4954 pg
->init_collection_pool_opts();
4956 if (pg
->is_primary()) {
4957 std::lock_guard locker
{m_perf_queries_lock
};
4958 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4961 pg
->handle_initialize(rctx
);
4962 pg
->handle_activate_map(rctx
);
4964 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4966 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4970 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4974 const auto max_pgs_per_osd
=
4975 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4976 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4978 if (num_pgs
< max_pgs_per_osd
) {
4982 std::lock_guard
l(pending_creates_lock
);
4983 if (is_mon_create
) {
4984 pending_creates_from_mon
++;
4986 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4987 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4989 dout(1) << __func__
<< " withhold creation of pg " << pgid
4990 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4994 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4995 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4996 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4997 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4998 if (acting
.size() > 1) {
5001 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
5002 twiddled
.push_back(-1);
5007 void OSD::resume_creating_pg()
5009 bool do_sub_pg_creates
= false;
5010 bool have_pending_creates
= false;
5012 const auto max_pgs_per_osd
=
5013 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
5014 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
5015 if (max_pgs_per_osd
<= num_pgs
) {
5016 // this could happen if admin decreases this setting before a PG is removed
5019 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
5020 std::lock_guard
l(pending_creates_lock
);
5021 if (pending_creates_from_mon
> 0) {
5022 dout(20) << __func__
<< " pending_creates_from_mon "
5023 << pending_creates_from_mon
<< dendl
;
5024 do_sub_pg_creates
= true;
5025 if (pending_creates_from_mon
>= spare_pgs
) {
5026 spare_pgs
= pending_creates_from_mon
= 0;
5028 spare_pgs
-= pending_creates_from_mon
;
5029 pending_creates_from_mon
= 0;
5032 auto pg
= pending_creates_from_osd
.cbegin();
5033 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
5034 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
5036 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
5037 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
5038 pg
= pending_creates_from_osd
.erase(pg
);
5039 do_sub_pg_creates
= true;
5042 have_pending_creates
= (pending_creates_from_mon
> 0 ||
5043 !pending_creates_from_osd
.empty());
5046 bool do_renew_subs
= false;
5047 if (do_sub_pg_creates
) {
5048 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
5049 dout(4) << __func__
<< ": resolicit pg creates from mon since "
5050 << last_pg_create_epoch
<< dendl
;
5051 do_renew_subs
= true;
5054 version_t start
= get_osdmap_epoch() + 1;
5055 if (have_pending_creates
) {
5056 // don't miss any new osdmap deleting PGs
5057 if (monc
->sub_want("osdmap", start
, 0)) {
5058 dout(4) << __func__
<< ": resolicit osdmap from mon since "
5060 do_renew_subs
= true;
5062 } else if (do_sub_pg_creates
) {
5063 // no need to subscribe the osdmap continuously anymore
5064 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5065 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
5066 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
5068 do_renew_subs
= true;
5072 if (do_renew_subs
) {
5076 service
.send_pg_temp();
5079 void OSD::build_initial_pg_history(
5082 utime_t created_stamp
,
5086 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
5087 *h
= pg_history_t(created
, created_stamp
);
5089 OSDMapRef lastmap
= service
.get_map(created
);
5090 int up_primary
, acting_primary
;
5091 vector
<int> up
, acting
;
5092 lastmap
->pg_to_up_acting_osds(
5093 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
5095 ostringstream debug
;
5096 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
5097 OSDMapRef osdmap
= service
.get_map(e
);
5098 int new_up_primary
, new_acting_primary
;
5099 vector
<int> new_up
, new_acting
;
5100 osdmap
->pg_to_up_acting_osds(
5101 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
5103 // this is a bit imprecise, but sufficient?
5104 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
5105 const pg_pool_t
*pi
;
5106 bool operator()(const set
<pg_shard_t
> &have
) const {
5107 return have
.size() >= pi
->min_size
;
5109 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
5110 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
5112 bool new_interval
= PastIntervals::check_new_interval(
5119 h
->same_interval_since
,
5120 h
->last_epoch_clean
,
5128 h
->same_interval_since
= e
;
5130 h
->same_up_since
= e
;
5132 if (acting_primary
!= new_acting_primary
) {
5133 h
->same_primary_since
= e
;
5135 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
5136 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5138 h
->last_epoch_split
= e
;
5141 acting
= new_acting
;
5142 up_primary
= new_up_primary
;
5143 acting_primary
= new_acting_primary
;
5147 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5148 dout(10) << __func__
<< " " << *h
<< " " << *pi
5149 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5150 pi
->get_bounds()) << ")"
5154 void OSD::_add_heartbeat_peer(int p
)
5160 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5161 if (i
== heartbeat_peers
.end()) {
5162 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5165 assert(cons
.second
);
5167 hi
= &heartbeat_peers
[p
];
5170 auto stamps
= service
.get_hb_stamps(p
);
5172 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5174 sb
->stamps
= stamps
;
5175 hi
->hb_interval_start
= ceph_clock_now();
5176 hi
->con_back
= cons
.first
.get();
5177 hi
->con_back
->set_priv(sb
);
5179 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5181 sf
->stamps
= stamps
;
5182 hi
->con_front
= cons
.second
.get();
5183 hi
->con_front
->set_priv(sf
);
5185 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5186 << " " << hi
->con_back
->get_peer_addr()
5187 << " " << hi
->con_front
->get_peer_addr()
5192 hi
->epoch
= get_osdmap_epoch();
5195 void OSD::_remove_heartbeat_peer(int n
)
5197 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5198 ceph_assert(q
!= heartbeat_peers
.end());
5199 dout(20) << " removing heartbeat peer osd." << n
5200 << " " << q
->second
.con_back
->get_peer_addr()
5201 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5203 q
->second
.clear_mark_down();
5204 heartbeat_peers
.erase(q
);
5207 void OSD::need_heartbeat_peer_update()
5211 dout(20) << "need_heartbeat_peer_update" << dendl
;
5212 heartbeat_set_peers_need_update();
5215 void OSD::maybe_update_heartbeat_peers()
5217 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5219 if (is_waiting_for_healthy() || is_active()) {
5220 utime_t now
= ceph_clock_now();
5221 if (last_heartbeat_resample
== utime_t()) {
5222 last_heartbeat_resample
= now
;
5223 heartbeat_set_peers_need_update();
5224 } else if (!heartbeat_peers_need_update()) {
5225 utime_t dur
= now
- last_heartbeat_resample
;
5226 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5227 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5228 heartbeat_set_peers_need_update();
5229 last_heartbeat_resample
= now
;
5230 // automatically clean up any stale heartbeat peers
5231 // if we are unhealthy, then clean all
5232 reset_heartbeat_peers(is_waiting_for_healthy());
5237 if (!heartbeat_peers_need_update())
5239 heartbeat_clear_peers_need_update();
5241 std::lock_guard
l(heartbeat_lock
);
5243 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5246 // build heartbeat from set
5250 for (auto& pg
: pgs
) {
5251 pg
->with_heartbeat_peers([&](int peer
) {
5252 if (get_osdmap()->is_up(peer
)) {
5253 _add_heartbeat_peer(peer
);
5259 // include next and previous up osds to ensure we have a fully-connected set
5260 set
<int> want
, extras
;
5261 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5264 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5265 if (prev
>= 0 && prev
!= next
)
5268 // make sure we have at least **min_down** osds coming from different
5269 // subtree level (e.g., hosts) for fast failure detection.
5270 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5271 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5272 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5273 get_osdmap()->get_random_up_osds_by_subtree(
5274 whoami
, subtree
, limit
, want
, &want
);
5276 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5277 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5279 _add_heartbeat_peer(*p
);
5282 // remove down peers; enumerate extras
5283 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5284 while (p
!= heartbeat_peers
.end()) {
5285 if (!get_osdmap()->is_up(p
->first
)) {
5288 _remove_heartbeat_peer(o
);
5291 if (p
->second
.epoch
< get_osdmap_epoch()) {
5292 extras
.insert(p
->first
);
5298 for (int n
= next
; n
>= 0; ) {
5299 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5301 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5302 dout(10) << " adding random peer osd." << n
<< dendl
;
5304 _add_heartbeat_peer(n
);
5306 n
= get_osdmap()->get_next_up_osd_after(n
);
5308 break; // came full circle; stop
5312 for (set
<int>::iterator p
= extras
.begin();
5313 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5317 _remove_heartbeat_peer(*p
);
5320 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5322 // clean up stale failure pending
5323 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5324 if (heartbeat_peers
.count(it
->first
) == 0) {
5325 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5326 failure_pending
.erase(it
++);
5333 void OSD::reset_heartbeat_peers(bool all
)
5335 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5336 dout(10) << "reset_heartbeat_peers" << dendl
;
5337 utime_t stale
= ceph_clock_now();
5338 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5339 std::lock_guard
l(heartbeat_lock
);
5340 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5341 auto& [peer
, hi
] = *it
;
5342 if (all
|| hi
.is_stale(stale
)) {
5343 hi
.clear_mark_down();
5344 // stop sending failure_report to mon too
5345 failure_queue
.erase(peer
);
5346 failure_pending
.erase(peer
);
5347 it
= heartbeat_peers
.erase(it
);
5354 void OSD::handle_osd_ping(MOSDPing
*m
)
5356 if (superblock
.cluster_fsid
!= m
->fsid
) {
5357 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5358 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5364 int from
= m
->get_source().num();
5366 heartbeat_lock
.lock();
5367 if (is_stopping()) {
5368 heartbeat_lock
.unlock();
5373 utime_t now
= ceph_clock_now();
5374 auto mnow
= service
.get_mnow();
5375 ConnectionRef
con(m
->get_connection());
5376 OSDMapRef curmap
= service
.get_osdmap();
5378 heartbeat_lock
.unlock();
5383 auto sref
= con
->get_priv();
5384 Session
*s
= static_cast<Session
*>(sref
.get());
5386 heartbeat_lock
.unlock();
5392 s
->stamps
= service
.get_hb_stamps(from
);
5397 case MOSDPing::PING
:
5399 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5400 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5401 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5402 if (heartbeat_drop
->second
== 0) {
5403 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5405 --heartbeat_drop
->second
;
5406 dout(5) << "Dropping heartbeat from " << from
5407 << ", " << heartbeat_drop
->second
5408 << " remaining to drop" << dendl
;
5411 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5412 ((((double)(rand()%100))/100.0))) {
5414 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5415 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5416 dout(5) << "Dropping heartbeat from " << from
5417 << ", " << heartbeat_drop
->second
5418 << " remaining to drop" << dendl
;
5423 ceph::signedspan sender_delta_ub
{};
5424 s
->stamps
->got_ping(
5430 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5432 if (!cct
->get_heartbeat_map()->is_healthy()) {
5433 dout(10) << "internal heartbeat not healthy, dropping ping request"
5438 Message
*r
= new MOSDPing(monc
->get_fsid(),
5439 curmap
->get_epoch(),
5440 MOSDPing::PING_REPLY
,
5444 service
.get_up_epoch(),
5445 cct
->_conf
->osd_heartbeat_min_size
,
5447 con
->send_message(r
);
5449 if (curmap
->is_up(from
)) {
5451 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5452 from
, curmap
->get_epoch());
5454 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5457 } else if (!curmap
->exists(from
) ||
5458 curmap
->get_down_at(from
) > m
->map_epoch
) {
5459 // tell them they have died
5460 Message
*r
= new MOSDPing(monc
->get_fsid(),
5461 curmap
->get_epoch(),
5466 service
.get_up_epoch(),
5467 cct
->_conf
->osd_heartbeat_min_size
);
5468 con
->send_message(r
);
5473 case MOSDPing::PING_REPLY
:
5475 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5476 if (i
!= heartbeat_peers
.end()) {
5477 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5478 if (acked
!= i
->second
.ping_history
.end()) {
5479 int &unacknowledged
= acked
->second
.second
;
5480 if (con
== i
->second
.con_back
) {
5481 dout(25) << "handle_osd_ping got reply from osd." << from
5482 << " first_tx " << i
->second
.first_tx
5483 << " last_tx " << i
->second
.last_tx
5484 << " last_rx_back " << i
->second
.last_rx_back
5486 << " last_rx_front " << i
->second
.last_rx_front
5488 i
->second
.last_rx_back
= now
;
5489 ceph_assert(unacknowledged
> 0);
5491 // if there is no front con, set both stamps.
5492 if (i
->second
.con_front
== NULL
) {
5493 i
->second
.last_rx_front
= now
;
5494 ceph_assert(unacknowledged
> 0);
5497 } else if (con
== i
->second
.con_front
) {
5498 dout(25) << "handle_osd_ping got reply from osd." << from
5499 << " first_tx " << i
->second
.first_tx
5500 << " last_tx " << i
->second
.last_tx
5501 << " last_rx_back " << i
->second
.last_rx_back
5502 << " last_rx_front " << i
->second
.last_rx_front
5505 i
->second
.last_rx_front
= now
;
5506 ceph_assert(unacknowledged
> 0);
5510 if (unacknowledged
== 0) {
5511 // succeeded in getting all replies
5512 dout(25) << "handle_osd_ping got all replies from osd." << from
5513 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5514 << " and older pending ping(s)"
5517 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5518 ++i
->second
.hb_average_count
;
5519 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5520 i
->second
.hb_total_back
+= back_pingtime
;
5521 if (back_pingtime
< i
->second
.hb_min_back
)
5522 i
->second
.hb_min_back
= back_pingtime
;
5523 if (back_pingtime
> i
->second
.hb_max_back
)
5524 i
->second
.hb_max_back
= back_pingtime
;
5525 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5526 i
->second
.hb_total_front
+= front_pingtime
;
5527 if (front_pingtime
< i
->second
.hb_min_front
)
5528 i
->second
.hb_min_front
= front_pingtime
;
5529 if (front_pingtime
> i
->second
.hb_max_front
)
5530 i
->second
.hb_max_front
= front_pingtime
;
5532 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5533 if (i
->second
.hb_interval_start
== utime_t())
5534 i
->second
.hb_interval_start
= now
;
5535 int64_t hb_avg_time_period
= 60;
5536 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5537 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5539 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5540 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5541 uint32_t back_min
= i
->second
.hb_min_back
;
5542 uint32_t back_max
= i
->second
.hb_max_back
;
5543 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5544 uint32_t front_min
= i
->second
.hb_min_front
;
5545 uint32_t front_max
= i
->second
.hb_max_front
;
5547 // Reset for new interval
5548 i
->second
.hb_average_count
= 0;
5549 i
->second
.hb_interval_start
= now
;
5550 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5551 i
->second
.hb_min_back
= UINT_MAX
;
5552 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5553 i
->second
.hb_min_front
= UINT_MAX
;
5555 // Record per osd interace ping times
5556 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5557 if (i
->second
.hb_back_pingtime
.size() == 0) {
5558 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5559 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5560 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5561 i
->second
.hb_back_min
.push_back(back_min
);
5562 i
->second
.hb_back_max
.push_back(back_max
);
5563 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5564 i
->second
.hb_front_min
.push_back(front_min
);
5565 i
->second
.hb_front_max
.push_back(front_max
);
5566 ++i
->second
.hb_index
;
5569 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5570 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5571 i
->second
.hb_back_min
[index
] = back_min
;
5572 i
->second
.hb_back_max
[index
] = back_max
;
5573 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5574 i
->second
.hb_front_min
[index
] = front_min
;
5575 i
->second
.hb_front_max
[index
] = front_max
;
5576 ++i
->second
.hb_index
;
5580 std::lock_guard
l(service
.stat_lock
);
5581 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5582 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5585 uint32_t min
= UINT_MAX
;
5589 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5590 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5592 int index
= (i
->second
.hb_index
+ k
) % size
;
5593 total
+= i
->second
.hb_back_pingtime
[index
];
5594 if (i
->second
.hb_back_min
[index
] < min
)
5595 min
= i
->second
.hb_back_min
[index
];
5596 if (i
->second
.hb_back_max
[index
] > max
)
5597 max
= i
->second
.hb_back_max
[index
];
5598 if (count
== 1 || count
== 5 || count
== 15) {
5599 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5600 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5601 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5608 if (i
->second
.con_front
!= NULL
) {
5609 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5616 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5618 int index
= (i
->second
.hb_index
+ k
) % size
;
5619 total
+= i
->second
.hb_front_pingtime
[index
];
5620 if (i
->second
.hb_front_min
[index
] < min
)
5621 min
= i
->second
.hb_front_min
[index
];
5622 if (i
->second
.hb_front_max
[index
] > max
)
5623 max
= i
->second
.hb_front_max
[index
];
5624 if (count
== 1 || count
== 5 || count
== 15) {
5625 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5626 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5627 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5636 std::lock_guard
l(service
.stat_lock
);
5637 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5638 if (i
->second
.con_front
!= NULL
)
5639 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5641 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5644 if (i
->second
.is_healthy(now
)) {
5645 // Cancel false reports
5646 auto failure_queue_entry
= failure_queue
.find(from
);
5647 if (failure_queue_entry
!= failure_queue
.end()) {
5648 dout(10) << "handle_osd_ping canceling queued "
5649 << "failure report for osd." << from
<< dendl
;
5650 failure_queue
.erase(failure_queue_entry
);
5653 auto failure_pending_entry
= failure_pending
.find(from
);
5654 if (failure_pending_entry
!= failure_pending
.end()) {
5655 dout(10) << "handle_osd_ping canceling in-flight "
5656 << "failure report for osd." << from
<< dendl
;
5657 send_still_alive(curmap
->get_epoch(),
5659 failure_pending_entry
->second
.second
);
5660 failure_pending
.erase(failure_pending_entry
);
5664 // old replies, deprecated by newly sent pings.
5665 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5666 << ") is found, treat as covered by newly sent pings "
5673 curmap
->is_up(from
)) {
5675 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5676 from
, curmap
->get_epoch());
5678 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5683 s
->stamps
->got_ping_reply(
5687 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5691 case MOSDPing::YOU_DIED
:
5692 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5693 << " says i am down in " << m
->map_epoch
<< dendl
;
5694 osdmap_subscribe(curmap
->get_epoch()+1, false);
5698 heartbeat_lock
.unlock();
5702 void OSD::heartbeat_entry()
5704 std::unique_lock
l(heartbeat_lock
);
5707 while (!heartbeat_stop
) {
5711 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5712 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5714 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5716 auto w
= ceph::make_timespan(wait
);
5717 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5718 heartbeat_cond
.wait_for(l
, w
);
5721 dout(30) << "heartbeat_entry woke up" << dendl
;
5725 void OSD::heartbeat_check()
5727 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5728 utime_t now
= ceph_clock_now();
5730 // check for incoming heartbeats (move me elsewhere?)
5731 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5732 p
!= heartbeat_peers
.end();
5735 if (p
->second
.first_tx
== utime_t()) {
5736 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5737 << " yet, skipping" << dendl
;
5741 dout(25) << "heartbeat_check osd." << p
->first
5742 << " first_tx " << p
->second
.first_tx
5743 << " last_tx " << p
->second
.last_tx
5744 << " last_rx_back " << p
->second
.last_rx_back
5745 << " last_rx_front " << p
->second
.last_rx_front
5747 if (p
->second
.is_unhealthy(now
)) {
5748 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5749 if (p
->second
.last_rx_back
== utime_t() ||
5750 p
->second
.last_rx_front
== utime_t()) {
5751 derr
<< "heartbeat_check: no reply from "
5752 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5753 << " osd." << p
->first
5754 << " ever on either front or back, first ping sent "
5755 << p
->second
.first_tx
5756 << " (oldest deadline " << oldest_deadline
<< ")"
5759 failure_queue
[p
->first
] = p
->second
.first_tx
;
5761 derr
<< "heartbeat_check: no reply from "
5762 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5763 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5764 << " front " << p
->second
.last_rx_front
5765 << " (oldest deadline " << oldest_deadline
<< ")"
5768 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5774 void OSD::heartbeat()
5776 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5777 dout(30) << "heartbeat" << dendl
;
5781 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5782 int n_samples
= 86400;
5783 if (hb_interval
> 1) {
5784 n_samples
/= hb_interval
;
5789 if (getloadavg(loadavgs
, 1) == 1) {
5790 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5791 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5792 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5795 dout(30) << "heartbeat checking stats" << dendl
;
5797 // refresh peer list and osd stats
5798 vector
<int> hb_peers
;
5799 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5800 p
!= heartbeat_peers
.end();
5802 hb_peers
.push_back(p
->first
);
5804 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5805 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5806 ceph_assert(new_stat
.statfs
.total
);
5809 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5811 service
.check_full_status(ratio
, pratio
);
5813 utime_t now
= ceph_clock_now();
5814 auto mnow
= service
.get_mnow();
5815 utime_t deadline
= now
;
5816 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5819 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5820 i
!= heartbeat_peers
.end();
5822 int peer
= i
->first
;
5823 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5825 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
5828 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5830 i
->second
.last_tx
= now
;
5831 if (i
->second
.first_tx
== utime_t())
5832 i
->second
.first_tx
= now
;
5833 i
->second
.ping_history
[now
] = make_pair(deadline
,
5834 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5835 if (i
->second
.hb_interval_start
== utime_t())
5836 i
->second
.hb_interval_start
= now
;
5838 std::optional
<ceph::signedspan
> delta_ub
;
5839 s
->stamps
->sent_ping(&delta_ub
);
5841 i
->second
.con_back
->send_message(
5842 new MOSDPing(monc
->get_fsid(),
5843 service
.get_osdmap_epoch(),
5848 service
.get_up_epoch(),
5849 cct
->_conf
->osd_heartbeat_min_size
,
5852 if (i
->second
.con_front
)
5853 i
->second
.con_front
->send_message(
5854 new MOSDPing(monc
->get_fsid(),
5855 service
.get_osdmap_epoch(),
5860 service
.get_up_epoch(),
5861 cct
->_conf
->osd_heartbeat_min_size
,
5865 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5867 // hmm.. am i all alone?
5868 dout(30) << "heartbeat lonely?" << dendl
;
5869 if (heartbeat_peers
.empty()) {
5870 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5871 last_mon_heartbeat
= now
;
5872 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5873 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5877 dout(30) << "heartbeat done" << dendl
;
5880 bool OSD::heartbeat_reset(Connection
*con
)
5882 std::lock_guard
l(heartbeat_lock
);
5883 auto s
= con
->get_priv();
5884 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5885 con
->set_priv(nullptr);
5887 if (is_stopping()) {
5890 auto session
= static_cast<Session
*>(s
.get());
5891 auto p
= heartbeat_peers
.find(session
->peer
);
5892 if (p
!= heartbeat_peers
.end() &&
5893 (p
->second
.con_back
== con
||
5894 p
->second
.con_front
== con
)) {
5895 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5896 << ", reopening" << dendl
;
5897 p
->second
.clear_mark_down(con
);
5898 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5900 p
->second
.con_back
= newcon
.first
.get();
5901 p
->second
.con_back
->set_priv(s
);
5902 if (newcon
.second
) {
5903 p
->second
.con_front
= newcon
.second
.get();
5904 p
->second
.con_front
->set_priv(s
);
5906 p
->second
.ping_history
.clear();
5908 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5909 << ", raced with osdmap update, closing out peer" << dendl
;
5910 heartbeat_peers
.erase(p
);
5913 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5921 // =========================================
5925 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5926 dout(10) << "tick" << dendl
;
5928 utime_t now
= ceph_clock_now();
5929 // throw out any obsolete markdown log
5930 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5931 while (!osd_markdown_log
.empty() &&
5932 osd_markdown_log
.front() + grace
< now
)
5933 osd_markdown_log
.pop_front();
5935 if (is_active() || is_waiting_for_healthy()) {
5936 maybe_update_heartbeat_peers();
5939 if (is_waiting_for_healthy()) {
5943 if (is_waiting_for_healthy() || is_booting()) {
5944 std::lock_guard
l(heartbeat_lock
);
5945 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5946 last_mon_heartbeat
= now
;
5947 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5948 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5954 // scrub purged_snaps every deep scrub interval
5956 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5957 utime_t next
= last
;
5958 next
+= cct
->_conf
->osd_scrub_min_interval
;
5960 // use a seed that is stable for each scrub interval, but varies
5961 // by OSD to avoid any herds.
5962 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5963 double r
= (rng() % 1024) / 1024;
5965 cct
->_conf
->osd_scrub_min_interval
*
5966 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5967 if (next
< ceph_clock_now()) {
5968 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5969 << " next " << next
<< " ... now" << dendl
;
5970 scrub_purged_snaps();
5972 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5973 << " next " << next
<< dendl
;
5977 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5980 void OSD::tick_without_osd_lock()
5982 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5983 dout(10) << "tick_without_osd_lock" << dendl
;
5985 logger
->set(l_osd_cached_crc
, ceph::buffer::get_cached_crc());
5986 logger
->set(l_osd_cached_crc_adjusted
, ceph::buffer::get_cached_crc_adjusted());
5987 logger
->set(l_osd_missed_crc
, ceph::buffer::get_missed_crc());
5989 // refresh osd stats
5990 struct store_statfs_t stbuf
;
5991 osd_alert_list_t alerts
;
5992 int r
= store
->statfs(&stbuf
, &alerts
);
5993 ceph_assert(r
== 0);
5994 service
.set_statfs(stbuf
, alerts
);
5996 // osd_lock is not being held, which means the OSD state
5997 // might change when doing the monitor report
5998 if (is_active() || is_waiting_for_healthy()) {
6000 std::lock_guard l
{heartbeat_lock
};
6003 map_lock
.lock_shared();
6004 std::lock_guard
l(mon_report_lock
);
6007 utime_t now
= ceph_clock_now();
6008 if (service
.need_fullness_update() ||
6009 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
6010 last_mon_report
= now
;
6014 map_lock
.unlock_shared();
6016 epoch_t max_waiting_epoch
= 0;
6017 for (auto s
: shards
) {
6018 max_waiting_epoch
= std::max(max_waiting_epoch
,
6019 s
->get_max_waiting_epoch());
6021 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
6022 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
6023 << ", requesting new map" << dendl
;
6024 osdmap_subscribe(superblock
.newest_map
+ 1, false);
6029 if (!scrub_random_backoff()) {
6032 service
.promote_throttle_recalibrate();
6033 resume_creating_pg();
6034 bool need_send_beacon
= false;
6035 const auto now
= ceph::coarse_mono_clock::now();
6037 // borrow lec lock to pretect last_sent_beacon from changing
6038 std::lock_guard l
{min_last_epoch_clean_lock
};
6039 const auto elapsed
= now
- last_sent_beacon
;
6040 if (std::chrono::duration_cast
<std::chrono::seconds
>(elapsed
).count() >
6041 cct
->_conf
->osd_beacon_report_interval
) {
6042 need_send_beacon
= true;
6045 if (need_send_beacon
) {
6050 mgrc
.update_daemon_health(get_health_metrics());
6051 service
.kick_recovery_queue();
6052 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
6053 new C_Tick_WithoutOSDLock(this));
6057 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6058 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6059 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6060 // getomap <pool> [namespace/]<obj-name>
6061 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6062 // injectmdataerr [namespace/]<obj-name> [shardid]
6063 // injectdataerr [namespace/]<obj-name> [shardid]
6065 // set_recovery_delay [utime]
6066 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
6067 std::string_view command
,
6068 const cmdmap_t
& cmdmap
, ostream
&ss
)
6071 //Support changing the omap on a single osd by using the Admin Socket to
6072 //directly request the osd make a change.
6073 if (command
== "setomapval" || command
== "rmomapkey" ||
6074 command
== "setomapheader" || command
== "getomap" ||
6075 command
== "truncobj" || command
== "injectmdataerr" ||
6076 command
== "injectdataerr"
6080 OSDMapRef curmap
= service
->get_osdmap();
6085 cmd_getval(cmdmap
, "pool", poolstr
);
6086 pool
= curmap
->lookup_pg_pool_name(poolstr
);
6087 //If we can't find it by name then maybe id specified
6088 if (pool
< 0 && isdigit(poolstr
[0]))
6089 pool
= atoll(poolstr
.c_str());
6091 ss
<< "Invalid pool '" << poolstr
<< "''";
6095 string objname
, nspace
;
6096 cmd_getval(cmdmap
, "objname", objname
);
6097 std::size_t found
= objname
.find_first_of('/');
6098 if (found
!= string::npos
) {
6099 nspace
= objname
.substr(0, found
);
6100 objname
= objname
.substr(found
+1);
6102 object_locator_t
oloc(pool
, nspace
);
6103 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
6106 ss
<< "Invalid namespace/objname";
6111 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
6112 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
6113 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
6114 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
6115 if (curmap
->pg_is_ec(rawpg
)) {
6116 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
6117 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
6122 ObjectStore::Transaction t
;
6124 if (command
== "setomapval") {
6125 map
<string
, bufferlist
> newattrs
;
6128 cmd_getval(cmdmap
, "key", key
);
6129 cmd_getval(cmdmap
, "val", valstr
);
6132 newattrs
[key
] = val
;
6133 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
6134 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6136 ss
<< "error=" << r
;
6139 } else if (command
== "rmomapkey") {
6141 cmd_getval(cmdmap
, "key", key
);
6143 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6144 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6146 ss
<< "error=" << r
;
6149 } else if (command
== "setomapheader") {
6150 bufferlist newheader
;
6153 cmd_getval(cmdmap
, "header", headerstr
);
6154 newheader
.append(headerstr
);
6155 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6156 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6158 ss
<< "error=" << r
;
6161 } else if (command
== "getomap") {
6162 //Debug: Output entire omap
6164 map
<string
, bufferlist
> keyvals
;
6165 auto ch
= store
->open_collection(coll_t(pgid
));
6167 ss
<< "unable to open collection for " << pgid
;
6170 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6172 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6173 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6174 it
!= keyvals
.end(); ++it
)
6175 ss
<< " key=" << (*it
).first
<< " val="
6176 << string((*it
).second
.c_str(), (*it
).second
.length());
6178 ss
<< "error=" << r
;
6181 } else if (command
== "truncobj") {
6183 cmd_getval(cmdmap
, "len", trunclen
);
6184 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6185 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6187 ss
<< "error=" << r
;
6190 } else if (command
== "injectdataerr") {
6191 store
->inject_data_error(gobj
);
6193 } else if (command
== "injectmdataerr") {
6194 store
->inject_mdata_error(gobj
);
6199 if (command
== "set_recovery_delay") {
6201 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6204 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6207 ss
<< "set_recovery_delay: error setting "
6208 << "osd_recovery_delay_start to '" << delay
<< "': error "
6212 service
->cct
->_conf
.apply_changes(nullptr);
6213 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6214 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6217 if (command
== "injectfull") {
6220 OSDService::s_names state
;
6221 cmd_getval(cmdmap
, "type", type
, string("full"));
6222 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6223 if (type
== "none" || count
== 0) {
6227 state
= service
->get_full_state(type
);
6228 if (state
== OSDService::s_names::INVALID
) {
6229 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6232 service
->set_injectfull(state
, count
);
6235 ss
<< "Internal error - command=" << command
;
6238 // =========================================
6240 void OSD::ms_handle_connect(Connection
*con
)
6242 dout(10) << __func__
<< " con " << con
<< dendl
;
6243 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6244 std::lock_guard
l(osd_lock
);
6247 dout(10) << __func__
<< " on mon" << dendl
;
6251 } else if (is_booting()) {
6252 _send_boot(); // resend boot message
6254 map_lock
.lock_shared();
6255 std::lock_guard
l2(mon_report_lock
);
6257 utime_t now
= ceph_clock_now();
6258 last_mon_report
= now
;
6260 // resend everything, it's a new session
6263 service
.requeue_pg_temp();
6264 service
.clear_sent_ready_to_merge();
6265 service
.send_pg_temp();
6266 service
.send_ready_to_merge();
6267 service
.send_pg_created();
6271 map_lock
.unlock_shared();
6273 send_beacon(ceph::coarse_mono_clock::now());
6277 // full map requests may happen while active or pre-boot
6278 if (requested_full_first
) {
6279 rerequest_full_maps();
6284 void OSD::ms_handle_fast_connect(Connection
*con
)
6286 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6287 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6288 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6289 s
= ceph::make_ref
<Session
>(cct
, con
);
6291 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6292 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6293 // we don't connect to clients
6294 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6295 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6300 void OSD::ms_handle_fast_accept(Connection
*con
)
6302 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6303 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6304 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6305 s
= ceph::make_ref
<Session
>(cct
, con
);
6307 dout(10) << "new session (incoming)" << s
<< " con=" << con
6308 << " addr=" << con
->get_peer_addr()
6309 << " must have raced with connect" << dendl
;
6310 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6311 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6316 bool OSD::ms_handle_reset(Connection
*con
)
6318 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6319 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6322 session
->wstate
.reset(con
);
6323 session
->con
->set_priv(nullptr);
6324 session
->con
.reset(); // break con <-> session ref cycle
6325 // note that we break session->con *before* the session_handle_reset
6326 // cleanup below. this avoids a race between us and
6327 // PG::add_backoff, Session::check_backoff, etc.
6328 session_handle_reset(session
);
6332 bool OSD::ms_handle_refused(Connection
*con
)
6334 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6337 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6338 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6341 int type
= con
->get_peer_type();
6342 // handle only OSD failures here
6343 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6344 OSDMapRef osdmap
= get_osdmap();
6346 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6347 if (id
>= 0 && osdmap
->is_up(id
)) {
6348 // I'm cheating mon heartbeat grace logic, because we know it's not going
6349 // to respawn alone. +1 so we won't hit any boundary case.
6350 monc
->send_mon_message(
6354 osdmap
->get_addrs(id
),
6355 cct
->_conf
->osd_heartbeat_grace
+ 1,
6356 osdmap
->get_epoch(),
6357 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6365 struct CB_OSD_GetVersion
{
6367 explicit CB_OSD_GetVersion(OSD
*o
) : osd(o
) {}
6368 void operator ()(boost::system::error_code ec
, version_t newest
,
6371 osd
->_got_mon_epochs(oldest
, newest
);
6375 void OSD::start_boot()
6377 if (!_is_healthy()) {
6378 // if we are not healthy, do not mark ourselves up (yet)
6379 dout(1) << "not healthy; waiting to boot" << dendl
;
6380 if (!is_waiting_for_healthy())
6381 start_waiting_for_healthy();
6382 // send pings sooner rather than later
6386 dout(1) << __func__
<< dendl
;
6387 set_state(STATE_PREBOOT
);
6388 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6389 << ".." << superblock
.newest_map
<< dendl
;
6390 monc
->get_version("osdmap", CB_OSD_GetVersion(this));
6393 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6395 std::lock_guard
l(osd_lock
);
6397 _preboot(oldest
, newest
);
6401 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6403 ceph_assert(is_preboot());
6404 dout(10) << __func__
<< " _preboot mon has osdmaps "
6405 << oldest
<< ".." << newest
<< dendl
;
6407 // ensure our local fullness awareness is accurate
6409 std::lock_guard
l(heartbeat_lock
);
6413 const auto& monmap
= monc
->monmap
;
6414 const auto osdmap
= get_osdmap();
6415 // if our map within recent history, try to add ourselves to the osdmap.
6416 if (osdmap
->get_epoch() == 0) {
6417 derr
<< "waiting for initial osdmap" << dendl
;
6418 } else if (osdmap
->is_destroyed(whoami
)) {
6419 derr
<< "osdmap says I am destroyed" << dendl
;
6420 // provide a small margin so we don't livelock seeing if we
6421 // un-destroyed ourselves.
6422 if (osdmap
->get_epoch() > newest
- 1) {
6425 } else if (osdmap
->is_noup(whoami
)) {
6426 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6427 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6428 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6430 } else if (service
.need_fullness_update()) {
6431 derr
<< "osdmap fullness state needs update" << dendl
;
6433 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6434 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6435 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6436 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6437 _get_purged_snaps();
6438 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6439 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6441 // wait for pgs to fully catch up in a different thread, since
6442 // this thread might be required for splitting and merging PGs to
6444 boot_finisher
.queue(
6447 std::unique_lock
l(osd_lock
);
6449 dout(10) << __func__
<< " waiting for peering work to drain"
6452 for (auto shard
: shards
) {
6453 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6464 // get all the latest maps
6465 if (osdmap
->get_epoch() + 1 >= oldest
)
6466 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6468 osdmap_subscribe(oldest
- 1, true);
6471 void OSD::_get_purged_snaps()
6473 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6474 // overlapping requests to the mon, which will be somewhat inefficient, but
6475 // it should be reliable.
6476 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6477 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6478 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6479 superblock
.purged_snaps_last
+ 1,
6480 superblock
.current_epoch
+ 1);
6481 monc
->send_mon_message(m
);
6484 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6486 dout(10) << __func__
<< " " << *m
<< dendl
;
6487 ObjectStore::Transaction t
;
6488 if (!is_preboot() ||
6489 m
->last
< superblock
.purged_snaps_last
) {
6492 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6493 make_purged_snaps_oid(), &t
,
6495 superblock
.purged_snaps_last
= m
->last
;
6496 write_superblock(t
);
6497 store
->queue_transaction(
6500 service
.publish_superblock(superblock
);
6501 if (m
->last
< superblock
.current_epoch
) {
6502 _get_purged_snaps();
6510 void OSD::send_full_update()
6512 if (!service
.need_fullness_update())
6515 if (service
.is_full()) {
6516 state
= CEPH_OSD_FULL
;
6517 } else if (service
.is_backfillfull()) {
6518 state
= CEPH_OSD_BACKFILLFULL
;
6519 } else if (service
.is_nearfull()) {
6520 state
= CEPH_OSD_NEARFULL
;
6523 OSDMap::calc_state_set(state
, s
);
6524 dout(10) << __func__
<< " want state " << s
<< dendl
;
6525 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6528 void OSD::start_waiting_for_healthy()
6530 dout(1) << "start_waiting_for_healthy" << dendl
;
6531 set_state(STATE_WAITING_FOR_HEALTHY
);
6532 last_heartbeat_resample
= utime_t();
6534 // subscribe to osdmap updates, in case our peers really are known to be dead
6535 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6538 bool OSD::_is_healthy()
6540 if (!cct
->get_heartbeat_map()->is_healthy()) {
6541 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6545 if (is_waiting_for_healthy()) {
6546 utime_t now
= ceph_clock_now();
6547 if (osd_markdown_log
.empty()) {
6548 dout(5) << __func__
<< " force returning true since last markdown"
6549 << " was " << cct
->_conf
->osd_max_markdown_period
6550 << "s ago" << dendl
;
6553 std::lock_guard
l(heartbeat_lock
);
6554 int num
= 0, up
= 0;
6555 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6556 p
!= heartbeat_peers
.end();
6558 if (p
->second
.is_healthy(now
))
6562 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6563 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6564 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6572 void OSD::_send_boot()
6574 dout(10) << "_send_boot" << dendl
;
6575 Connection
*local_connection
=
6576 cluster_messenger
->get_loopback_connection().get();
6577 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6578 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6579 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6580 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6582 dout(20) << " initial client_addrs " << client_addrs
6583 << ", cluster_addrs " << cluster_addrs
6584 << ", hb_back_addrs " << hb_back_addrs
6585 << ", hb_front_addrs " << hb_front_addrs
6587 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6588 dout(10) << " assuming cluster_addrs match client_addrs "
6589 << client_addrs
<< dendl
;
6590 cluster_addrs
= cluster_messenger
->get_myaddrs();
6592 if (auto session
= local_connection
->get_priv(); !session
) {
6593 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6596 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6597 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6598 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6599 << cluster_addrs
<< dendl
;
6600 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6602 if (auto session
= local_connection
->get_priv(); !session
) {
6603 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6606 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6607 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6608 dout(10) << " assuming hb_front_addrs match client_addrs "
6609 << client_addrs
<< dendl
;
6610 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6612 if (auto session
= local_connection
->get_priv(); !session
) {
6613 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6616 // we now know what our front and back addrs will be, and we are
6617 // about to tell the mon what our metadata (including numa bindings)
6618 // are, so now is a good time!
6619 set_numa_affinity();
6621 MOSDBoot
*mboot
= new MOSDBoot(
6622 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6623 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6625 dout(10) << " final client_addrs " << client_addrs
6626 << ", cluster_addrs " << cluster_addrs
6627 << ", hb_back_addrs " << hb_back_addrs
6628 << ", hb_front_addrs " << hb_front_addrs
6630 _collect_metadata(&mboot
->metadata
);
6631 monc
->send_mon_message(mboot
);
6632 set_state(STATE_BOOTING
);
6635 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6638 (*pm
)["osd_data"] = dev_path
;
6639 if (store
->get_type() == "filestore") {
6640 // not applicable for bluestore
6641 (*pm
)["osd_journal"] = journal_path
;
6643 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6644 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6645 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6646 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6649 (*pm
)["osd_objectstore"] = store
->get_type();
6650 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6651 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6652 (*pm
)["default_device_class"] = store
->get_default_device_class();
6653 string osdspec_affinity
;
6654 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6655 if (r
< 0 || osdspec_affinity
.empty()) {
6656 osdspec_affinity
= "";
6658 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6659 store
->collect_metadata(pm
);
6661 collect_sys_info(pm
, cct
);
6663 (*pm
)["front_iface"] = pick_iface(
6665 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6666 (*pm
)["back_iface"] = pick_iface(
6668 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6674 set
<string
> unknown
;
6675 for (auto nm
: { "front_iface", "back_iface" }) {
6676 if (!(*pm
)[nm
].size()) {
6681 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6683 unknown
.insert((*pm
)[nm
]);
6691 if (unknown
.size()) {
6692 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6694 if (!nodes
.empty()) {
6695 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6697 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6698 (*pm
)["network_numa_node"] = stringify(node
);
6702 if (numa_node
>= 0) {
6703 (*pm
)["numa_node"] = stringify(numa_node
);
6704 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6708 set
<string
> devnames
;
6709 store
->get_devices(&devnames
);
6710 map
<string
,string
> errs
;
6711 get_device_metadata(devnames
, pm
, &errs
);
6712 for (auto& i
: errs
) {
6713 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6715 dout(10) << __func__
<< " " << *pm
<< dendl
;
6718 void OSD::queue_want_up_thru(epoch_t want
)
6720 std::shared_lock map_locker
{map_lock
};
6721 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6722 std::lock_guard
report_locker(mon_report_lock
);
6723 if (want
> up_thru_wanted
) {
6724 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6725 << ", currently " << cur
6727 up_thru_wanted
= want
;
6730 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6731 << ", currently " << cur
6736 void OSD::send_alive()
6738 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6739 const auto osdmap
= get_osdmap();
6740 if (!osdmap
->exists(whoami
))
6742 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6743 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6744 if (up_thru_wanted
> up_thru
) {
6745 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6746 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6750 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6752 dout(10) << __func__
<< " " << first
<< ".." << last
6753 << ", previously requested "
6754 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6755 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6756 ceph_assert(first
> 0 && last
> 0);
6757 ceph_assert(first
<= last
);
6758 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6759 if (requested_full_first
== 0) {
6761 requested_full_first
= first
;
6762 requested_full_last
= last
;
6763 } else if (last
<= requested_full_last
) {
6767 // additional request
6768 first
= requested_full_last
+ 1;
6769 requested_full_last
= last
;
6771 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6772 req
->request_full(first
, last
);
6773 monc
->send_mon_message(req
);
6776 void OSD::got_full_map(epoch_t e
)
6778 ceph_assert(requested_full_first
<= requested_full_last
);
6779 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6780 if (requested_full_first
== 0) {
6781 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6784 if (e
< requested_full_first
) {
6785 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6786 << ".." << requested_full_last
6787 << ", ignoring" << dendl
;
6790 if (e
>= requested_full_last
) {
6791 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6792 << ".." << requested_full_last
<< ", resetting" << dendl
;
6793 requested_full_first
= requested_full_last
= 0;
6797 requested_full_first
= e
+ 1;
6799 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6800 << ".." << requested_full_last
6801 << ", still need more" << dendl
;
6804 void OSD::requeue_failures()
6806 std::lock_guard
l(heartbeat_lock
);
6807 unsigned old_queue
= failure_queue
.size();
6808 unsigned old_pending
= failure_pending
.size();
6809 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6810 failure_queue
[p
->first
] = p
->second
.first
;
6811 failure_pending
.erase(p
++);
6813 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6814 << failure_queue
.size() << dendl
;
6817 void OSD::send_failures()
6819 ceph_assert(ceph_mutex_is_locked(map_lock
));
6820 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6821 std::lock_guard
l(heartbeat_lock
);
6822 utime_t now
= ceph_clock_now();
6823 const auto osdmap
= get_osdmap();
6824 while (!failure_queue
.empty()) {
6825 int osd
= failure_queue
.begin()->first
;
6826 if (!failure_pending
.count(osd
)) {
6827 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6828 monc
->send_mon_message(
6832 osdmap
->get_addrs(osd
),
6834 osdmap
->get_epoch()));
6835 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6836 osdmap
->get_addrs(osd
));
6838 failure_queue
.erase(osd
);
6842 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6844 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6845 MOSDFailure::FLAG_ALIVE
);
6846 monc
->send_mon_message(m
);
6849 void OSD::cancel_pending_failures()
6851 std::lock_guard
l(heartbeat_lock
);
6852 auto it
= failure_pending
.begin();
6853 while (it
!= failure_pending
.end()) {
6854 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6855 << it
->first
<< dendl
;
6856 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6857 failure_pending
.erase(it
++);
6861 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6863 const auto& monmap
= monc
->monmap
;
6864 // send beacon to mon even if we are just connected, and the monmap is not
6865 // initialized yet by then.
6866 if (monmap
.epoch
> 0 &&
6867 monmap
.get_required_features().contains_all(
6868 ceph::features::mon::FEATURE_LUMINOUS
)) {
6869 dout(20) << __func__
<< " sending" << dendl
;
6870 MOSDBeacon
* beacon
= nullptr;
6872 std::lock_guard l
{min_last_epoch_clean_lock
};
6873 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6874 min_last_epoch_clean
,
6875 superblock
.last_purged_snaps_scrub
,
6876 cct
->_conf
->osd_beacon_report_interval
);
6877 beacon
->pgs
= min_last_epoch_clean_pgs
;
6878 last_sent_beacon
= now
;
6880 monc
->send_mon_message(beacon
);
6882 dout(20) << __func__
<< " not sending" << dendl
;
6886 void OSD::handle_command(MCommand
*m
)
6888 ConnectionRef con
= m
->get_connection();
6889 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6891 con
->send_message(new MCommandReply(m
, -EACCES
));
6895 if (!session
->caps
.allow_all()) {
6896 con
->send_message(new MCommandReply(m
, -EACCES
));
6900 cct
->get_admin_socket()->queue_tell_command(m
);
6905 class unlock_guard
{
6908 explicit unlock_guard(ceph::mutex
& mutex
)
6913 unlock_guard(unlock_guard
&) = delete;
6920 void OSD::scrub_purged_snaps()
6922 dout(10) << __func__
<< dendl
;
6923 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6924 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6925 make_snapmapper_oid(),
6926 make_purged_snaps_oid());
6927 clog
->debug() << "purged_snaps scrub starts";
6930 if (s
.stray
.size()) {
6931 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6933 clog
->debug() << "purged_snaps scrub ok";
6935 set
<pair
<spg_t
,snapid_t
>> queued
;
6936 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6937 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6939 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6942 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6943 spg_t
spgid(pgid
, shard
);
6944 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6945 if (queued
.count(p
)) {
6946 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6947 << " already queued" << dendl
;
6950 PGRef pg
= lookup_lock_pg(spgid
);
6952 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6956 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6958 pg
->queue_snap_retrim(snap
);
6962 if (is_stopping()) {
6965 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6966 ObjectStore::Transaction t
;
6967 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6968 write_superblock(t
);
6969 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6970 ceph_assert(tr
== 0);
6972 send_beacon(ceph::coarse_mono_clock::now());
6974 dout(10) << __func__
<< " done" << dendl
;
6977 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6979 set
<string
> devnames
;
6980 store
->get_devices(&devnames
);
6981 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6982 "osd_smart_report_timeout");
6984 // == typedef std::map<std::string, mValue> mObject;
6985 json_spirit::mObject json_map
;
6987 for (auto dev
: devnames
) {
6988 // smartctl works only on physical devices; filter out any logical device
6989 if (dev
.find("dm-") == 0) {
6994 string devid
= get_device_id(dev
, &err
);
6995 if (devid
.size() == 0) {
6996 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6997 << err
<< "), skipping" << dendl
;
7000 if (only_devid
.size() && devid
!= only_devid
) {
7004 json_spirit::mValue smart_json
;
7005 if (block_device_get_metrics(dev
, smart_timeout
,
7007 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7010 json_map
[devid
] = smart_json
;
7012 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7015 bool OSD::heartbeat_dispatch(Message
*m
)
7017 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7018 switch (m
->get_type()) {
7021 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7026 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7030 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7037 bool OSD::ms_dispatch(Message
*m
)
7039 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7040 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7041 service
.got_stop_ack();
7049 if (is_stopping()) {
7063 void OSDService::maybe_share_map(
7065 const OSDMapRef
& osdmap
,
7066 epoch_t peer_epoch_lb
)
7068 // NOTE: we assume caller hold something that keeps the Connection itself
7069 // pinned (e.g., an OpRequest's MessageRef).
7070 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7075 // assume the peer has the newer of the op's sent_epoch and what
7076 // we think we sent them.
7077 session
->sent_epoch_lock
.lock();
7078 if (peer_epoch_lb
> session
->last_sent_epoch
) {
7079 dout(10) << __func__
<< " con " << con
7080 << " " << con
->get_peer_addr()
7081 << " map epoch " << session
->last_sent_epoch
7082 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
7083 session
->last_sent_epoch
= peer_epoch_lb
;
7085 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
7086 session
->sent_epoch_lock
.unlock();
7088 if (osdmap
->get_epoch() <= last_sent_epoch
) {
7092 send_incremental_map(last_sent_epoch
, con
, osdmap
);
7093 last_sent_epoch
= osdmap
->get_epoch();
7095 session
->sent_epoch_lock
.lock();
7096 if (session
->last_sent_epoch
< last_sent_epoch
) {
7097 dout(10) << __func__
<< " con " << con
7098 << " " << con
->get_peer_addr()
7099 << " map epoch " << session
->last_sent_epoch
7100 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
7101 session
->last_sent_epoch
= last_sent_epoch
;
7103 session
->sent_epoch_lock
.unlock();
7106 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
7108 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
7110 auto i
= session
->waiting_on_map
.begin();
7111 while (i
!= session
->waiting_on_map
.end()) {
7112 OpRequestRef op
= &(*i
);
7113 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7114 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
7115 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7118 session
->waiting_on_map
.erase(i
++);
7122 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7123 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7124 static_cast<const MOSDOp
*>(m
)->get_pg());
7125 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7129 pgid
= m
->get_spg();
7131 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7134 if (session
->waiting_on_map
.empty()) {
7135 clear_session_waiting_on_map(session
);
7137 register_session_waiting_on_map(session
);
7141 void OSD::ms_fast_dispatch(Message
*m
)
7145 jaeger_tracing::init_tracer("osd-services-reinit");
7146 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl
;
7147 auto dispatch_span
= jaeger_tracing::new_span(__func__
);
7150 if (service
.is_stopping()) {
7156 switch (m
->get_type()) {
7158 dout(10) << "ping from " << m
->get_source() << dendl
;
7161 case MSG_OSD_FORCE_RECOVERY
:
7162 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7164 case MSG_OSD_SCRUB2
:
7165 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7168 case MSG_OSD_PG_CREATE2
:
7169 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7170 case MSG_OSD_PG_QUERY
:
7171 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7172 case MSG_OSD_PG_NOTIFY
:
7173 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7174 case MSG_OSD_PG_INFO
:
7175 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7176 case MSG_OSD_PG_REMOVE
:
7177 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7179 // these are single-pg messages that handle themselves
7180 case MSG_OSD_PG_LOG
:
7181 case MSG_OSD_PG_TRIM
:
7182 case MSG_OSD_PG_NOTIFY2
:
7183 case MSG_OSD_PG_QUERY2
:
7184 case MSG_OSD_PG_INFO2
:
7185 case MSG_OSD_BACKFILL_RESERVE
:
7186 case MSG_OSD_RECOVERY_RESERVE
:
7187 case MSG_OSD_PG_LEASE
:
7188 case MSG_OSD_PG_LEASE_ACK
:
7190 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7191 if (require_osd_peer(pm
)) {
7192 enqueue_peering_evt(
7194 PGPeeringEventRef(pm
->get_event()));
7201 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7204 osd_reqid_t reqid
= op
->get_reqid();
7206 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7207 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7210 op
->set_osd_parent_span(dispatch_span
);
7211 if (op
->osd_parent_span
) {
7212 auto op_req_span
= jaeger_tracing::child_span("op-request-created", op
->osd_parent_span
);
7213 op
->set_osd_parent_span(op_req_span
);
7217 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7219 // note sender epoch, min req's epoch
7220 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7221 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7222 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7224 service
.maybe_inject_dispatch_delay();
7226 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7227 m
->get_type() != CEPH_MSG_OSD_OP
) {
7228 // queue it directly
7230 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7232 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7234 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7235 // message that didn't have an explicit spg_t); we need to map
7236 // them to an spg_t while preserving delivery order.
7237 auto priv
= m
->get_connection()->get_priv();
7238 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7239 std::lock_guard l
{session
->session_dispatch_lock
};
7241 session
->waiting_on_map
.push_back(*op
);
7242 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7243 dispatch_session_waiting(session
, nextmap
);
7244 service
.release_map(nextmap
);
7247 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7250 int OSD::ms_handle_authentication(Connection
*con
)
7253 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7255 s
= ceph::make_ref
<Session
>(cct
, con
);
7257 s
->entity_name
= con
->get_peer_entity_name();
7258 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7259 << " entity " << s
->entity_name
7260 << " addr " << con
->get_peer_addrs() << dendl
;
7262 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7263 << " entity " << s
->entity_name
7264 << " addr " << con
->get_peer_addrs() << dendl
;
7267 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7268 if (caps_info
.allow_all
) {
7269 s
->caps
.set_allow_all();
7270 } else if (caps_info
.caps
.length() > 0) {
7271 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7276 catch (ceph::buffer::error
& e
) {
7277 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7278 << " failed to decode caps string" << dendl
;
7282 bool success
= s
->caps
.parse(str
);
7284 dout(10) << __func__
<< " session " << s
7285 << " " << s
->entity_name
7286 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7289 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7290 << " failed to parse caps '" << str
<< "'" << dendl
;
7298 void OSD::do_waiters()
7300 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7302 dout(10) << "do_waiters -- start" << dendl
;
7303 while (!finished
.empty()) {
7304 OpRequestRef next
= finished
.front();
7305 finished
.pop_front();
7308 dout(10) << "do_waiters -- finish" << dendl
;
7311 void OSD::dispatch_op(OpRequestRef op
)
7313 switch (op
->get_req()->get_type()) {
7315 case MSG_OSD_PG_CREATE
:
7316 handle_pg_create(op
);
7321 void OSD::_dispatch(Message
*m
)
7323 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7324 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7326 switch (m
->get_type()) {
7327 // -- don't need OSDMap --
7329 // map and replication
7330 case CEPH_MSG_OSD_MAP
:
7331 handle_osd_map(static_cast<MOSDMap
*>(m
));
7333 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7334 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7339 handle_scrub(static_cast<MOSDScrub
*>(m
));
7343 handle_command(static_cast<MCommand
*>(m
));
7346 // -- need OSDMap --
7348 case MSG_OSD_PG_CREATE
:
7350 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7352 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7353 // no map? starting up?
7354 if (!get_osdmap()) {
7355 dout(7) << "no OSDMap, not booted" << dendl
;
7356 logger
->inc(l_osd_waiting_for_map
);
7357 waiting_for_osdmap
.push_back(op
);
7358 op
->mark_delayed("no osdmap");
7368 // remove me post-nautilus
7369 void OSD::handle_scrub(MOSDScrub
*m
)
7371 dout(10) << "handle_scrub " << *m
<< dendl
;
7372 if (!require_mon_or_mgr_peer(m
)) {
7376 if (m
->fsid
!= monc
->get_fsid()) {
7377 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7386 if (!m
->scrub_pgs
.empty()) {
7388 for (auto pgid
: m
->scrub_pgs
) {
7390 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7391 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7398 for (auto pgid
: spgs
) {
7399 enqueue_peering_evt(
7402 std::make_shared
<PGPeeringEvent
>(
7405 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7411 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7413 dout(10) << __func__
<< " " << *m
<< dendl
;
7414 if (!require_mon_or_mgr_peer(m
)) {
7418 if (m
->fsid
!= monc
->get_fsid()) {
7419 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7424 for (auto pgid
: m
->scrub_pgs
) {
7425 enqueue_peering_evt(
7428 std::make_shared
<PGPeeringEvent
>(
7431 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7436 bool OSD::scrub_random_backoff()
7438 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7439 cct
->_conf
->osd_scrub_backoff_ratio
);
7441 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7447 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7448 const spg_t
& pg
, const utime_t
& timestamp
,
7449 double pool_scrub_min_interval
,
7450 double pool_scrub_max_interval
, bool must
)
7453 sched_time(timestamp
),
7456 // if not explicitly requested, postpone the scrub with a random delay
7458 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7459 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7460 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7461 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7463 sched_time
+= scrub_min_interval
;
7464 double r
= rand() / (double)RAND_MAX
;
7466 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7467 if (scrub_max_interval
== 0) {
7468 deadline
= utime_t();
7470 deadline
+= scrub_max_interval
;
7476 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7477 if (sched_time
< rhs
.sched_time
)
7479 if (sched_time
> rhs
.sched_time
)
7481 return pgid
< rhs
.pgid
;
7484 void OSDService::dumps_scrub(ceph::Formatter
*f
)
7486 ceph_assert(f
!= nullptr);
7487 std::lock_guard
l(sched_scrub_lock
);
7489 f
->open_array_section("scrubs");
7490 for (const auto &i
: sched_scrub_pg
) {
7491 f
->open_object_section("scrub");
7492 f
->dump_stream("pgid") << i
.pgid
;
7493 f
->dump_stream("sched_time") << i
.sched_time
;
7494 f
->dump_stream("deadline") << i
.deadline
;
7495 f
->dump_bool("forced", i
.sched_time
== PgScrubber::scrub_must_stamp());
7501 double OSD::scrub_sleep_time(bool must_scrub
)
7504 return cct
->_conf
->osd_scrub_sleep
;
7506 utime_t now
= ceph_clock_now();
7507 if (scrub_time_permit(now
)) {
7508 return cct
->_conf
->osd_scrub_sleep
;
7510 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7511 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7512 return std::max(extended_sleep
, normal_sleep
);
7515 bool OSD::scrub_time_permit(utime_t now
)
7518 time_t tt
= now
.sec();
7519 localtime_r(&tt
, &bdt
);
7521 bool day_permit
= false;
7522 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7523 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7527 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7533 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7534 << " - " << cct
->_conf
->osd_scrub_end_week_day
7535 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7539 bool time_permit
= false;
7540 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7541 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7545 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7550 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7551 << " - " << cct
->_conf
->osd_scrub_end_hour
7552 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7554 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7555 << " - " << cct
->_conf
->osd_scrub_end_hour
7556 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7561 bool OSD::scrub_load_below_threshold()
7564 if (getloadavg(loadavgs
, 3) != 3) {
7565 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7569 // allow scrub if below configured threshold
7570 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7571 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7572 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7573 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7574 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7575 << " = yes" << dendl
;
7579 // allow scrub if below daily avg and currently decreasing
7580 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7581 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7582 << " < daily_loadavg " << daily_loadavg
7583 << " and < 15m avg " << loadavgs
[2]
7584 << " = yes" << dendl
;
7588 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7589 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7590 << " and ( >= daily_loadavg " << daily_loadavg
7591 << " or >= 15m avg " << loadavgs
[2]
7592 << ") = no" << dendl
;
7596 void OSD::sched_scrub()
7598 dout(20) << __func__
<< " sched_scrub starts" << dendl
;
7600 // if not permitted, fail fast
7601 if (!service
.can_inc_scrubs()) {
7602 dout(20) << __func__
<< ": OSD cannot inc scrubs" << dendl
;
7605 bool allow_requested_repair_only
= false;
7606 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7607 if (!cct
->_conf
->osd_repair_during_recovery
) {
7608 dout(15) << __func__
<< ": not scheduling scrubs due to active recovery" << dendl
;
7611 dout(10) << __func__
7612 << " will only schedule explicitly requested repair due to active recovery"
7614 allow_requested_repair_only
= true;
7617 utime_t now
= ceph_clock_now();
7618 bool time_permit
= scrub_time_permit(now
);
7619 bool load_is_low
= scrub_load_below_threshold();
7620 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7622 OSDService::ScrubJob scrub_job
;
7623 if (service
.first_scrub_stamp(&scrub_job
)) {
7625 dout(30) << "sched_scrub examine " << scrub_job
.pgid
<< " at " << scrub_job
.sched_time
<< dendl
;
7627 if (scrub_job
.sched_time
> now
) {
7628 // save ourselves some effort
7629 dout(20) << "sched_scrub " << scrub_job
.pgid
<< " scheduled at " << scrub_job
.sched_time
7630 << " > " << now
<< dendl
;
7634 if ((scrub_job
.deadline
.is_zero() || scrub_job
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7635 dout(15) << __func__
<< " not scheduling scrub for " << scrub_job
.pgid
<< " due to "
7636 << (!time_permit
? "time not permit" : "high load") << dendl
;
7640 PGRef pg
= _lookup_lock_pg(scrub_job
.pgid
);
7642 dout(20) << __func__
<< " pg " << scrub_job
.pgid
<< " not found" << dendl
;
7646 // This has already started, so go on to the next scrub job
7647 if (pg
->is_scrub_active()) {
7649 dout(20) << __func__
<< ": already in progress pgid " << scrub_job
.pgid
<< dendl
;
7652 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7653 if (allow_requested_repair_only
&& !pg
->m_planned_scrub
.must_repair
) {
7655 dout(10) << __func__
<< " skip " << scrub_job
.pgid
7656 << " because repairing is not explicitly requested on it"
7661 // If it is reserving, let it resolve before going to the next scrub job
7662 if (pg
->m_scrubber
->is_reserving()) {
7664 dout(10) << __func__
<< ": reserve in progress pgid " << scrub_job
.pgid
<< dendl
;
7667 dout(15) << "sched_scrub scrubbing " << scrub_job
.pgid
<< " at " << scrub_job
.sched_time
7668 << (pg
->get_must_scrub() ? ", explicitly requested" :
7669 (load_is_low
? ", load_is_low" : " deadline < now"))
7671 if (pg
->sched_scrub()) {
7673 dout(10) << __func__
<< " scheduled a scrub!" << " (~" << scrub_job
.pgid
<< "~)" << dendl
;
7677 } while (service
.next_scrub_stamp(scrub_job
, &scrub_job
));
7679 dout(20) << "sched_scrub done" << dendl
;
7682 void OSD::resched_all_scrubs()
7684 dout(10) << __func__
<< ": start" << dendl
;
7685 const vector
<spg_t
> pgs
= [this] {
7687 OSDService::ScrubJob job
;
7688 if (service
.first_scrub_stamp(&job
)) {
7690 pgs
.push_back(job
.pgid
);
7691 } while (service
.next_scrub_stamp(job
, &job
));
7695 for (auto& pgid
: pgs
) {
7696 dout(20) << __func__
<< ": examine " << pgid
<< dendl
;
7697 PGRef pg
= _lookup_lock_pg(pgid
);
7700 if (!pg
->m_planned_scrub
.must_scrub
&& !pg
->m_planned_scrub
.need_auto
) {
7701 dout(15) << __func__
<< ": reschedule " << pgid
<< dendl
;
7702 pg
->on_info_history_change();
7706 dout(10) << __func__
<< ": done" << dendl
;
7709 MPGStats
* OSD::collect_pg_stats()
7711 // This implementation unconditionally sends every is_primary PG's
7712 // stats every time we're called. This has equivalent cost to the
7713 // previous implementation's worst case where all PGs are busy and
7714 // their stats are always enqueued for sending.
7715 std::shared_lock l
{map_lock
};
7717 osd_stat_t cur_stat
= service
.get_osd_stat();
7718 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7720 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7721 m
->osd_stat
= cur_stat
;
7723 std::lock_guard lec
{min_last_epoch_clean_lock
};
7724 min_last_epoch_clean
= get_osdmap_epoch();
7725 min_last_epoch_clean_pgs
.clear();
7727 std::set
<int64_t> pool_set
;
7730 for (auto& pg
: pgs
) {
7731 auto pool
= pg
->pg_id
.pgid
.pool();
7732 pool_set
.emplace((int64_t)pool
);
7733 if (!pg
->is_primary()) {
7736 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7737 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7738 min_last_epoch_clean
= std::min(min_last_epoch_clean
, lec
);
7739 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7743 bool per_pool_stats
= false;
7744 bool per_pool_omap_stats
= false;
7745 for (auto p
: pool_set
) {
7746 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7747 if (r
== -ENOTSUP
) {
7751 m
->pool_stat
[p
] = st
;
7752 per_pool_stats
= true;
7756 // indicate whether we are reporting per-pool stats
7757 m
->osd_stat
.num_osds
= 1;
7758 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7759 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7764 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7766 vector
<DaemonHealthMetric
> metrics
;
7768 utime_t oldest_secs
;
7769 const utime_t now
= ceph_clock_now();
7771 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7773 TrackedOpRef oldest_op
;
7774 auto count_slow_ops
= [&](TrackedOp
& op
) {
7775 if (op
.get_initiated() < too_old
) {
7777 ss
<< "slow request " << op
.get_desc()
7779 << op
.get_initiated()
7781 << op
.state_string();
7782 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7783 clog
->warn() << ss
.str();
7785 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7793 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7795 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7796 << oldest_op
->get_desc() << dendl
;
7798 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7800 // no news is not good news.
7801 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7805 std::lock_guard
l(pending_creates_lock
);
7806 auto n_primaries
= pending_creates_from_mon
;
7807 for (const auto& create
: pending_creates_from_osd
) {
7808 if (create
.second
) {
7812 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7817 // =====================================================
7820 void OSD::wait_for_new_map(OpRequestRef op
)
7823 if (waiting_for_osdmap
.empty()) {
7824 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7827 logger
->inc(l_osd_waiting_for_map
);
7828 waiting_for_osdmap
.push_back(op
);
7829 op
->mark_delayed("wait for new map");
7834 * assimilate new OSDMap(s). scan pgs, etc.
7837 void OSD::note_down_osd(int peer
)
7839 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7840 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7842 std::lock_guard l
{heartbeat_lock
};
7843 failure_queue
.erase(peer
);
7844 failure_pending
.erase(peer
);
7845 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7846 if (p
!= heartbeat_peers
.end()) {
7847 p
->second
.clear_mark_down();
7848 heartbeat_peers
.erase(p
);
7852 void OSD::note_up_osd(int peer
)
7854 heartbeat_set_peers_need_update();
7857 struct C_OnMapCommit
: public Context
{
7859 epoch_t first
, last
;
7861 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7862 : osd(o
), first(f
), last(l
), msg(m
) {}
7863 void finish(int r
) override
{
7864 osd
->_committed_osd_maps(first
, last
, msg
);
7869 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7871 std::lock_guard
l(osdmap_subscribe_lock
);
7872 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7875 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7877 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7883 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7885 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7886 if (min
<= superblock
.oldest_map
)
7890 ObjectStore::Transaction t
;
7891 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7892 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7893 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7894 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7895 superblock
.oldest_map
= e
+ 1;
7897 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7898 service
.publish_superblock(superblock
);
7899 write_superblock(t
);
7900 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7901 ceph_assert(tr
== 0);
7904 // skip_maps leaves us with a range of old maps if we fail to remove all
7905 // of them before moving superblock.oldest_map forward to the first map
7906 // in the incoming MOSDMap msg. so we should continue removing them in
7907 // this case, even we could do huge series of delete transactions all at
7914 service
.publish_superblock(superblock
);
7915 write_superblock(t
);
7916 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7917 ceph_assert(tr
== 0);
7919 // we should not remove the cached maps
7920 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7923 void OSD::handle_osd_map(MOSDMap
*m
)
7925 // wait for pgs to catch up
7927 // we extend the map cache pins to accomodate pgs slow to consume maps
7928 // for some period, until we hit the max_lag_factor bound, at which point
7929 // we block here to stop injesting more maps than they are able to keep
7931 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7932 m_osd_pg_epoch_max_lag_factor
;
7933 ceph_assert(max_lag
> 0);
7934 epoch_t osd_min
= 0;
7935 for (auto shard
: shards
) {
7936 epoch_t min
= shard
->get_min_pg_epoch();
7937 if (osd_min
== 0 || min
< osd_min
) {
7941 epoch_t osdmap_epoch
= get_osdmap_epoch();
7943 osdmap_epoch
> max_lag
&&
7944 osdmap_epoch
- max_lag
> osd_min
) {
7945 epoch_t need
= osdmap_epoch
- max_lag
;
7946 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7947 << " max_lag " << max_lag
<< ")" << dendl
;
7948 for (auto shard
: shards
) {
7949 epoch_t min
= shard
->get_min_pg_epoch();
7951 dout(10) << __func__
<< " waiting for pgs to consume " << need
7952 << " (shard " << shard
->shard_id
<< " min " << min
7953 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7954 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7956 unlock_guard unlock
{osd_lock
};
7957 shard
->wait_min_pg_epoch(need
);
7963 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7964 map
<epoch_t
,OSDMapRef
> added_maps
;
7965 map
<epoch_t
,bufferlist
> added_maps_bl
;
7966 if (m
->fsid
!= monc
->get_fsid()) {
7967 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7968 << monc
->get_fsid() << dendl
;
7972 if (is_initializing()) {
7973 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7978 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7979 if (session
&& !(session
->entity_name
.is_mon() ||
7980 session
->entity_name
.is_osd())) {
7982 dout(10) << "got osd map from Session " << session
7983 << " which we can't take maps from (not a mon or osd)" << dendl
;
7988 // share with the objecter
7990 service
.objecter
->handle_osd_map(m
);
7992 epoch_t first
= m
->get_first();
7993 epoch_t last
= m
->get_last();
7994 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7995 << superblock
.newest_map
7996 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7999 logger
->inc(l_osd_map
);
8000 logger
->inc(l_osd_mape
, last
- first
+ 1);
8001 if (first
<= superblock
.newest_map
)
8002 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
8003 if (service
.max_oldest_map
< m
->oldest_map
) {
8004 service
.max_oldest_map
= m
->oldest_map
;
8005 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
8008 // make sure there is something new, here, before we bother flushing
8009 // the queues and such
8010 if (last
<= superblock
.newest_map
) {
8011 dout(10) << " no new maps here, dropping" << dendl
;
8017 bool skip_maps
= false;
8018 if (first
> superblock
.newest_map
+ 1) {
8019 dout(10) << "handle_osd_map message skips epochs "
8020 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
8021 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
8022 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8026 // always try to get the full range of maps--as many as we can. this
8027 // 1- is good to have
8028 // 2- is at present the only way to ensure that we get a *full* map as
8030 if (m
->oldest_map
< first
) {
8031 osdmap_subscribe(m
->oldest_map
- 1, true);
8038 ObjectStore::Transaction t
;
8039 uint64_t txn_size
= 0;
8041 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
8043 // store new maps: queue for disk and put in the osdmap cache
8044 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8045 for (epoch_t e
= start
; e
<= last
; e
++) {
8046 if (txn_size
>= t
.get_num_bytes()) {
8047 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8048 ceph_assert(txn_size
< t
.get_num_bytes());
8050 txn_size
= t
.get_num_bytes();
8051 map
<epoch_t
,bufferlist
>::iterator p
;
8052 p
= m
->maps
.find(e
);
8053 if (p
!= m
->maps
.end()) {
8054 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8055 OSDMap
*o
= new OSDMap
;
8056 bufferlist
& bl
= p
->second
;
8060 purged_snaps
[e
] = o
->get_new_purged_snaps();
8062 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8063 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8064 added_maps
[e
] = add_map(o
);
8065 added_maps_bl
[e
] = bl
;
8070 p
= m
->incremental_maps
.find(e
);
8071 if (p
!= m
->incremental_maps
.end()) {
8072 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8073 bufferlist
& bl
= p
->second
;
8074 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8075 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8077 OSDMap
*o
= new OSDMap
;
8080 bool got
= get_map_bl(e
- 1, obl
);
8082 auto p
= added_maps_bl
.find(e
- 1);
8083 ceph_assert(p
!= added_maps_bl
.end());
8089 OSDMap::Incremental inc
;
8090 auto p
= bl
.cbegin();
8093 if (o
->apply_incremental(inc
) < 0) {
8094 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8095 ceph_abort_msg("bad fsid");
8099 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8101 bool injected_failure
= false;
8102 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8103 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8104 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8105 injected_failure
= true;
8108 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8109 dout(2) << "got incremental " << e
8110 << " but failed to encode full with correct crc; requesting"
8112 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8113 dout(20) << "my encoded map was:\n";
8114 fbl
.hexdump(*_dout
);
8117 request_full_map(e
, last
);
8120 // don't continue committing if we failed to enc the first inc map
8122 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
8129 purged_snaps
[e
] = o
->get_new_purged_snaps();
8131 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8132 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8133 added_maps
[e
] = add_map(o
);
8134 added_maps_bl
[e
] = fbl
;
8138 ceph_abort_msg("MOSDMap lied about what maps it had?");
8141 // even if this map isn't from a mon, we may have satisfied our subscription
8142 monc
->sub_got("osdmap", last
);
8144 if (!m
->maps
.empty() && requested_full_first
) {
8145 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8146 << ".." << requested_full_last
<< dendl
;
8147 rerequest_full_maps();
8150 if (superblock
.oldest_map
) {
8151 // make sure we at least keep pace with incoming maps
8152 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8153 pg_num_history
.prune(superblock
.oldest_map
);
8156 if (!superblock
.oldest_map
|| skip_maps
)
8157 superblock
.oldest_map
= first
;
8158 superblock
.newest_map
= last
;
8159 superblock
.current_epoch
= last
;
8161 // note in the superblock that we were clean thru the prior epoch
8162 epoch_t boot_epoch
= service
.get_boot_epoch();
8163 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8164 superblock
.mounted
= boot_epoch
;
8165 superblock
.clean_thru
= last
;
8168 // check for pg_num changes and deleted pools
8170 for (auto& i
: added_maps
) {
8172 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8173 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8174 << " probably first start of this osd" << dendl
;
8178 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8179 for (auto& j
: lastmap
->get_pools()) {
8180 if (!i
.second
->have_pg_pool(j
.first
)) {
8181 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8182 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8183 << j
.first
<< dendl
;
8184 // this information is needed by _make_pg() if have to restart before
8185 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8186 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8188 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8189 string name
= lastmap
->get_pool_name(j
.first
);
8191 map
<string
,string
> profile
;
8192 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8193 profile
= lastmap
->get_erasure_code_profile(
8194 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8196 encode(profile
, bl
);
8197 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8198 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8199 new_pg_num
!= j
.second
.get_pg_num()) {
8200 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8201 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8202 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8205 for (auto& j
: i
.second
->get_pools()) {
8206 if (!lastmap
->have_pg_pool(j
.first
)) {
8207 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8208 << j
.second
.get_pg_num() << dendl
;
8209 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8210 j
.second
.get_pg_num());
8215 pg_num_history
.epoch
= last
;
8218 ::encode(pg_num_history
, bl
);
8219 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8220 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8223 // record new purged_snaps
8224 if (superblock
.purged_snaps_last
== start
- 1) {
8225 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8226 make_purged_snaps_oid(), &t
,
8228 superblock
.purged_snaps_last
= last
;
8230 dout(10) << __func__
<< " superblock purged_snaps_last is "
8231 << superblock
.purged_snaps_last
8232 << ", not recording new purged_snaps" << dendl
;
8235 // superblock and commit
8236 write_superblock(t
);
8237 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8238 store
->queue_transaction(
8241 service
.publish_superblock(superblock
);
// Called (via C_OnMapCommit) once the transaction persisting maps
// [first..last] has committed.  Advances the in-memory OSDMap epoch by
// epoch, reacts to peers going up/down, and decides whether this OSD
// must restart its boot sequence or shut down entirely.
//
// NOTE(review): this chunk was whitespace-mangled and several physical
// lines were missing; connective code (braces, declarations, returns)
// was restored to match upstream Ceph OSD.cc — verify against the
// canonical source before relying on exact statement order.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have begun while we waited
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blocklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) { // but not the new one
        if (!waited_for_reservations) {
          // wait once, before the first note_down, so reserved maps
          // are not yanked out from under in-flight operations
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // epoch_t _bind_epoch guards against acting on a map older than our
  // last rebind
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // marked down, or one of our advertised addrs no longer matches
      // what we are actually bound to
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
	if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
	  // note that this is best-effort...
	  monc->send_mon_message(
	    new MOSDMarkMeDead(
	      monc->get_fsid(),
	      whoami,
	      osdmap->get_epoch()));
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// we were marked down but are still running: rebind the
	// cluster messenger on new ports and go back to preboot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL, &up_epoch, &bind_epoch);
	do_restart = true;

	// add markdown event to the log; too many in the grace window
	// means we flap and should shut down instead of restarting
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  derr << __func__ << " marked down "
	       << osd_markdown_log.size()
	       << " > osd_max_markdown_count "
	       << cct->_conf->osd_max_markdown_count
	       << " in last " << grace << " seconds, shutting down"
	       << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_meesneger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  derr << __func__ << " marked down:"
	       << " rebind cluster_messenger failed" << dendl;
	}

	hb_back_server_messenger->mark_down_all();
	hb_front_server_messenger->mark_down_all();
	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (do_restart)
    start_boot();

  // NOTE(review): upstream releases the message ref here — confirm
  m->put();
}
// Re-derive messenger feature requirements and on-disk compat flags
// from the current OSDMap (e.g. after a CRUSH change).  Also toggles
// heartbeat authorizer requirements on the nautilus boundary and
// persists require_osd_release when it changes.
//
// NOTE(review): reconstructed from a whitespace-mangled chunk; missing
// braces/declarations restored to match upstream Ceph OSD.cc.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-way upgrade of the on-disk compat set; persisted immediately
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // pre-nautilus peers cannot send authorizers on heartbeat sessions
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
// Completion context queued on a transaction; when the split-related
// transaction commits/applies, it hands the new child PGs back to the
// OSD via _finish_splits().
struct C_FinishSplits : public Context {
  OSD *osd;        // non-owning back-pointer to the owning OSD
  set<PGRef> pgs;  // split children to finish (copied from `in`)
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    // r is ignored; finishing splits is unconditional
    osd->_finish_splits(pgs);
  }
};
// Finish instantiating split-child PGs: initialize each child's
// peering state, queue a null event at its map epoch, and register it
// with its OSDShard so queued work can find it.
//
// NOTE(review): reconstructed from a mangled chunk; lock/unlock calls
// around the per-PG body restored per upstream — verify.
void OSD::_finish_splits(set<PGRef>& pgs)
{
  dout(10) << __func__ << " " << pgs << dendl;
  if (is_stopping())
    return;
  for (set<PGRef>::iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    PG *pg = i->get();

    PeeringCtx rctx = create_context();
    pg->lock();
    dout(10) << __func__ << " " << *pg << dendl;
    epoch_t e = pg->get_osdmap_epoch();
    pg->handle_initialize(rctx);
    // null event pushes the PG through peering at its current epoch
    pg->queue_null(e, e);
    dispatch_context(rctx, pg, service.get_osdmap());
    pg->unlock();

    // route the child to the shard that owns its placement hash
    unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
    shards[shard_index]->register_and_wake_split_child(pg);
  }
};
// Record `src` as a merge source waiting for merge-target `target` at
// the epoch of `nextmap`.  Returns true once all `need` sources for
// that target have been collected (i.e. the merge can proceed).
bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
			   unsigned need)
{
  std::lock_guard l(merge_lock);
  // waiters are keyed by (epoch, target) -> {source pg_id -> PGRef}
  auto& p = merge_waiters[nextmap->get_epoch()][target];
  p[src->pg_id] = src;
  dout(10) << __func__ << " added merge_waiter " << src->pg_id
	   << " for " << target << ", have " << p.size() << "/" << need
	   << dendl;
  return p.size() == need;
}
// Advance a (locked) PG through every map epoch up to osd_epoch,
// handling pool deletion, PG splitting and PG merging along the way.
// Returns true if the caller still holds a live, locked PG; false if
// the PG was consumed (merge source, or deferred merge target) and
// already unlocked.
//
// NOTE(review): reconstructed from a whitespace-mangled chunk with
// many interior lines missing (signature tail, merge-source teardown,
// split/advance plumbing).  Restored to match upstream Ceph OSD.cc —
// verify carefully against the canonical source; conf is low.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;  // already caught up
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;  // gaps are tolerated; we advance over them
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // pg_num changed in this epoch: check for merge participation
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    // all sources present: wake the merge target
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      // all sources have arrived; take them and drop the entry
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split: collect our children in this epoch, if any
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
// Publish the freshly committed OSDMap to the rest of the daemon:
// prime pending splits/merges on each shard, prune bookkeeping, update
// PG counters, and queue null peering events so every PG observes the
// new epoch.  Caller must hold osd_lock.
//
// NOTE(review): reconstructed from a mangled chunk; the _get_pgids /
// _get_pgs fetches and a few braces were missing and were restored per
// upstream Ceph OSD.cc — verify.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // prime_splits consumes its entries; all must be claimed
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
// React to flags in the newly published map that only matter when the
// OSD is active: toggle recovery on the NORECOVER flag and release any
// operations that were parked waiting for this map.  Caller must hold
// osd_lock.
void OSD::activate_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();

  dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;

  // norecover?
  if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
    if (!service.recovery_is_paused()) {
      dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
      service.pause_recovery();
    }
  } else {
    if (service.recovery_is_paused()) {
      dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
      service.unpause_recovery();
    }
  }

  service.activate_map();

  // process waiters
  take_waiters(waiting_for_osdmap);
}
// Return true iff `m` arrived over a connection authenticated as a
// monitor; logs and returns false otherwise.
bool OSD::require_mon_peer(const Message *m)
{
  if (!m->get_connection()->peer_is_mon()) {
    dout(0) << "require_mon_peer received from non-mon "
	    << m->get_connection()->get_peer_addr()
	    << " " << *m << dendl;
    return false;
  }
  return true;
}
// Return true iff `m` arrived from a monitor or a manager; logs and
// returns false otherwise.
bool OSD::require_mon_or_mgr_peer(const Message *m)
{
  if (!m->get_connection()->peer_is_mon() &&
      !m->get_connection()->peer_is_mgr()) {
    dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
	    << m->get_connection()->get_peer_addr()
	    << " " << *m << dendl;
    return false;
  }
  return true;
}
// Return true iff `m` arrived from another OSD; logs and returns
// false otherwise.
bool OSD::require_osd_peer(const Message *m)
{
  if (!m->get_connection()->peer_is_osd()) {
    dout(0) << "require_osd_peer received from non-osd "
	    << m->get_connection()->get_peer_addr()
	    << " " << *m << dendl;
    return false;
  }
  return true;
}
// Return true iff this OSD was already "up" by `epoch` and is active;
// messages referencing an epoch before we came up, or received while
// still booting, are dropped by the caller.
//
// NOTE(review): the active-state check between the two douts was
// missing from the mangled chunk and restored per upstream — verify.
bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
{
  epoch_t up_epoch = service.get_up_epoch();
  if (epoch < up_epoch) {
    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
    return false;
  }

  if (!is_active()) {
    dout(7) << "still in boot state, dropping message " << *m << dendl;
    return false;
  }

  return true;
}
// Verify the sending OSD is still the same daemon instance in `map`
// (up, and with matching cluster addrs).  If not, tear down the
// connection/session and return false.  `is_fast_dispatch` controls
// whether we may take the session dispatch lock (fast dispatch must
// not block on it).
//
// NOTE(review): con->mark_down() / s->con.reset() between the visible
// fragments were missing from the mangled chunk and restored per
// upstream — verify.
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
/*
 * require that we have same (or newer) map, and that
 * the source is the pg primary.
 */
bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
				    bool is_fast_dispatch)
{
  const Message *m = op->get_req();
  const auto osdmap = get_osdmap();
  dout(15) << "require_same_or_newer_map " << epoch
	   << " (i am " << osdmap->get_epoch() << ") " << m << dendl;

  ceph_assert(ceph_mutex_is_locked(osd_lock));

  // do they have a newer map?
  if (epoch > osdmap->get_epoch()) {
    // park the op until the newer map arrives
    dout(7) << "waiting for newer map epoch " << epoch
	    << " > my " << osdmap->get_epoch() << " with " << m << dendl;
    wait_for_new_map(op);
    return false;
  }

  if (!require_self_aliveness(op->get_req(), epoch)) {
    return false;
  }

  // ok, our map is same or newer.. do they still exist?
  if (m->get_connection()->get_messenger() == cluster_messenger &&
      !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
    return false;
  }

  return true;
}
// ----------------------------------------

// Split `parent` into the given child spg_t's under `nextmap`: create
// each child PG and its collection, split the parent's collection and
// in-memory state into it, and distribute the parent's stats.  New
// children are returned through *out_pgs.
//
// NOTE(review): reconstructed from a mangled chunk; the signature tail,
// child lock/unlock and split_colls/split_into argument lists were
// partially missing and restored per upstream Ceph OSD.cc — verify.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child collection's commit callbacks to its shard
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the final entry holds the parent's post-split stats
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
// Handle a legacy (pre-octopus) MOSDPGCreate from a monitor: for each
// requested pg we are acting primary for, build its initial history and
// queue a creating peering event.
//
// NOTE(review): reconstructed from a mangled chunk; loop bounds,
// `continue`s and the PGCreateInfo argument list were partially missing
// and restored per upstream Ceph OSD.cc — verify.
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    // mkpg and ctimes are parallel maps keyed identically
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
// ----------------------------------------
// peering and recovery

// Build a fresh PeeringCtx bound to the current map's require_osd_release,
// which controls the message encodings used during peering.
PeeringCtx OSD::create_context()
{
  return PeeringCtx(get_osdmap()->require_osd_release);
}
// Deliver the side effects accumulated in a PeeringCtx: send queued
// peering messages to up peers (skipped entirely if we are not up /
// active), then queue the accumulated transaction on `pg`'s collection.
//
// NOTE(review): reconstructed from a mangled chunk; `continue`s, the
// per-connection message loop and the pg->ch transaction argument were
// partially missing and restored per upstream — verify.
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // make sure the peer has at least our map before the messages
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      ls.clear();
    }
  }
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9290 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9292 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9293 if (!require_mon_peer(m
)) {
9297 for (auto& p
: m
->pgs
) {
9298 spg_t pgid
= p
.first
;
9299 epoch_t created
= p
.second
.first
;
9300 utime_t created_stamp
= p
.second
.second
;
9301 auto q
= m
->pg_extra
.find(pgid
);
9302 if (q
== m
->pg_extra
.end()) {
9303 dout(20) << __func__
<< " " << pgid
<< " e" << created
9304 << "@" << created_stamp
9305 << " (no history or past_intervals)" << dendl
;
9306 // pre-octopus ... no pg history. this can be removed in Q release.
9307 enqueue_peering_evt(
9310 std::make_shared
<PGPeeringEvent
>(
9318 pg_history_t(created
, created_stamp
),
9323 dout(20) << __func__
<< " " << pgid
<< " e" << created
9324 << "@" << created_stamp
9325 << " history " << q
->second
.first
9326 << " pi " << q
->second
.second
<< dendl
;
9327 if (!q
->second
.second
.empty() &&
9328 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9329 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9330 << " and unmatched past_intervals " << q
->second
.second
9331 << " (history " << q
->second
.first
<< ")";
9333 enqueue_peering_evt(
9336 std::make_shared
<PGPeeringEvent
>(
9353 std::lock_guard
l(pending_creates_lock
);
9354 if (pending_creates_from_mon
== 0) {
9355 last_pg_create_epoch
= m
->epoch
;
9362 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9364 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9365 if (!require_osd_peer(m
)) {
9369 int from
= m
->get_source().num();
9370 for (auto& p
: m
->pg_list
) {
9371 enqueue_peering_evt(
9374 std::make_shared
<PGPeeringEvent
>(
9375 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9378 pg_shard_t(from
, p
.second
.from
),
9380 p
.second
.epoch_sent
),
9387 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9389 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9390 if (!require_osd_peer(m
)) {
9394 int from
= m
->get_source().num();
9395 for (auto& p
: m
->get_pg_list()) {
9396 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9397 enqueue_peering_evt(
9400 std::make_shared
<PGPeeringEvent
>(
9404 pgid
, pg_shard_t(from
, p
.from
),
9406 m
->get_connection()->get_features()),
9419 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9421 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9422 if (!require_osd_peer(m
)) {
9426 int from
= m
->get_source().num();
9427 for (auto& p
: m
->pg_list
) {
9428 enqueue_peering_evt(
9429 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9431 std::make_shared
<PGPeeringEvent
>(
9432 p
.epoch_sent
, p
.query_epoch
,
9434 pg_shard_t(from
, p
.from
),
9442 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9444 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9445 if (!require_osd_peer(m
)) {
9449 for (auto& pgid
: m
->pg_list
) {
9450 enqueue_peering_evt(
9453 std::make_shared
<PGPeeringEvent
>(
9454 m
->get_epoch(), m
->get_epoch(),
9455 PeeringState::DeleteStart())));
9460 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9462 dout(10) << __func__
<< " " << *m
<< dendl
;
9463 if (!require_mon_or_mgr_peer(m
)) {
9467 epoch_t epoch
= get_osdmap_epoch();
9468 for (auto pgid
: m
->forced_pgs
) {
9469 if (m
->options
& OFR_BACKFILL
) {
9470 if (m
->options
& OFR_CANCEL
) {
9471 enqueue_peering_evt(
9474 std::make_shared
<PGPeeringEvent
>(
9476 PeeringState::UnsetForceBackfill())));
9478 enqueue_peering_evt(
9481 std::make_shared
<PGPeeringEvent
>(
9483 PeeringState::SetForceBackfill())));
9485 } else if (m
->options
& OFR_RECOVERY
) {
9486 if (m
->options
& OFR_CANCEL
) {
9487 enqueue_peering_evt(
9490 std::make_shared
<PGPeeringEvent
>(
9492 PeeringState::UnsetForceRecovery())));
9494 enqueue_peering_evt(
9497 std::make_shared
<PGPeeringEvent
>(
9499 PeeringState::SetForceRecovery())));
9506 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9508 spg_t pgid
= q
.pgid
;
9509 dout(10) << __func__
<< " " << pgid
<< dendl
;
9511 OSDMapRef osdmap
= get_osdmap();
9512 if (!osdmap
->have_pg_pool(pgid
.pool()))
9515 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9516 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9517 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9520 if (q
.query
.type
== pg_query_t::LOG
||
9521 q
.query
.type
== pg_query_t::FULLLOG
) {
9523 q
.query
.from
, q
.query
.to
,
9524 osdmap
->get_epoch(), empty
,
9525 q
.query
.epoch_sent
);
9527 vector
<pg_notify_t
> ls
;
9530 q
.query
.from
, q
.query
.to
,
9532 osdmap
->get_epoch(),
9535 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9537 service
.maybe_share_map(con
.get(), osdmap
);
9538 con
->send_message(m
);
9542 void OSDService::queue_check_readable(spg_t spgid
,
9544 ceph::signedspan delay
)
9546 if (delay
== ceph::signedspan::zero()) {
9547 osd
->enqueue_peering_evt(
9550 std::make_shared
<PGPeeringEvent
>(
9552 PeeringState::CheckReadable())));
9554 mono_timer
.add_event(
9556 [this, spgid
, lpr
]() {
9557 queue_check_readable(spgid
, lpr
);
9563 // =========================================================
9566 void OSDService::_maybe_queue_recovery() {
9567 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9568 uint64_t available_pushes
;
9569 while (!awaiting_throttle
.empty() &&
9570 _recover_now(&available_pushes
)) {
9571 uint64_t to_start
= std::min(
9573 cct
->_conf
->osd_recovery_max_single_start
);
9574 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9575 awaiting_throttle
.pop_front();
9576 dout(10) << __func__
<< " starting " << to_start
9577 << ", recovery_ops_reserved " << recovery_ops_reserved
9578 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9579 recovery_ops_reserved
+= to_start
;
9583 bool OSDService::_recover_now(uint64_t *available_pushes
)
9585 if (available_pushes
)
9586 *available_pushes
= 0;
9588 if (ceph_clock_now() < defer_recovery_until
) {
9589 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9593 if (recovery_paused
) {
9594 dout(15) << __func__
<< " paused" << dendl
;
9598 uint64_t max
= osd
->get_recovery_max_active();
9599 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9600 dout(15) << __func__
<< " active " << recovery_ops_active
9601 << " + reserved " << recovery_ops_reserved
9602 << " >= max " << max
<< dendl
;
9606 if (available_pushes
)
9607 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9612 unsigned OSDService::get_target_pg_log_entries() const
9614 auto num_pgs
= osd
->get_num_pgs();
9615 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9616 if (num_pgs
> 0 && target
> 0) {
9617 // target an even spread of our budgeted log entries across all
9618 // PGs. note that while we only get to control the entry count
9619 // for primary PGs, we'll normally be responsible for a mix of
9620 // primary and replica PGs (for the same pool(s) even), so this
9622 return std::max
<unsigned>(
9623 std::min
<unsigned>(target
/ num_pgs
,
9624 cct
->_conf
->osd_max_pg_log_entries
),
9625 cct
->_conf
->osd_min_pg_log_entries
);
9627 // fall back to a per-pg value.
9628 return cct
->_conf
->osd_min_pg_log_entries
;
9632 void OSD::do_recovery(
9633 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9634 ThreadPool::TPHandle
&handle
)
9636 uint64_t started
= 0;
9639 * When the value of osd_recovery_sleep is set greater than zero, recovery
9640 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9641 * recovery event's schedule time. This is done by adding a
9642 * recovery_requeue_callback event, which re-queues the recovery op using
9643 * queue_recovery_after_sleep.
9645 float recovery_sleep
= get_osd_recovery_sleep();
9647 std::lock_guard
l(service
.sleep_lock
);
9648 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9650 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9651 dout(20) << "do_recovery wake up at "
9653 << ", re-queuing recovery" << dendl
;
9654 std::lock_guard
l(service
.sleep_lock
);
9655 service
.recovery_needs_sleep
= false;
9656 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9659 // This is true for the first recovery op and when the previous recovery op
9660 // has been scheduled in the past. The next recovery op is scheduled after
9661 // completing the sleep from now.
9663 if (auto now
= ceph::real_clock::now();
9664 service
.recovery_schedule_time
< now
) {
9665 service
.recovery_schedule_time
= now
;
9667 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9668 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9669 recovery_requeue_callback
);
9670 dout(20) << "Recovery event scheduled at "
9671 << service
.recovery_schedule_time
<< dendl
;
9678 std::lock_guard
l(service
.sleep_lock
);
9679 service
.recovery_needs_sleep
= true;
9682 if (pg
->pg_has_reset_since(queued
)) {
9686 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9687 #ifdef DEBUG_RECOVERY_OIDS
9688 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9691 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9692 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9693 << " on " << *pg
<< dendl
;
9696 PeeringCtx rctx
= create_context();
9697 rctx
.handle
= &handle
;
9698 pg
->find_unfound(queued
, rctx
);
9699 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9704 ceph_assert(started
<= reserved_pushes
);
9705 service
.release_reserved_pushes(reserved_pushes
);
9708 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9710 std::lock_guard
l(recovery_lock
);
9711 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9712 << " (" << recovery_ops_active
<< "/"
9713 << osd
->get_recovery_max_active() << " rops)"
9715 recovery_ops_active
++;
9717 #ifdef DEBUG_RECOVERY_OIDS
9718 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9719 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9720 recovery_oids
[pg
->pg_id
].insert(soid
);
9724 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9726 std::lock_guard
l(recovery_lock
);
9727 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9728 << " dequeue=" << dequeue
9729 << " (" << recovery_ops_active
<< "/"
9730 << osd
->get_recovery_max_active() << " rops)"
9734 ceph_assert(recovery_ops_active
> 0);
9735 recovery_ops_active
--;
9737 #ifdef DEBUG_RECOVERY_OIDS
9738 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9739 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9740 recovery_oids
[pg
->pg_id
].erase(soid
);
9743 _maybe_queue_recovery();
9746 bool OSDService::is_recovery_active()
9748 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9751 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9754 void OSDService::release_reserved_pushes(uint64_t pushes
)
9756 std::lock_guard
l(recovery_lock
);
9757 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9758 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9760 ceph_assert(recovery_ops_reserved
>= pushes
);
9761 recovery_ops_reserved
-= pushes
;
9762 _maybe_queue_recovery();
9765 // =========================================================
9768 bool OSD::op_is_discardable(const MOSDOp
*op
)
9770 // drop client request if they are not connected and can't get the
9772 if (!op
->get_connection()->is_connected()) {
9778 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9780 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9781 const utime_t latency
= ceph_clock_now() - stamp
;
9782 const unsigned priority
= op
->get_req()->get_priority();
9783 const int cost
= op
->get_req()->get_cost();
9784 const uint64_t owner
= op
->get_req()->get_source().num();
9785 const int type
= op
->get_req()->get_type();
9787 dout(15) << "enqueue_op " << op
<< " prio " << priority
9790 << " latency " << latency
9791 << " epoch " << epoch
9792 << " " << *(op
->get_req()) << dendl
;
9793 op
->osd_trace
.event("enqueue op");
9794 op
->osd_trace
.keyval("priority", priority
);
9795 op
->osd_trace
.keyval("cost", cost
);
9797 if (op
->osd_parent_span
) {
9798 auto enqueue_span
= jaeger_tracing::child_span(__func__
, op
->osd_parent_span
);
9800 {"priority", priority
},
9808 op
->mark_queued_for_pg();
9809 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9810 if (type
== MSG_OSD_PG_PUSH
||
9811 type
== MSG_OSD_PG_PUSH_REPLY
) {
9814 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGRecoveryMsg(pg
, std::move(op
))),
9815 cost
, priority
, stamp
, owner
, epoch
));
9819 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9820 cost
, priority
, stamp
, owner
, epoch
));
9824 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9826 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9829 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9831 cct
->_conf
->osd_peering_op_priority
,
9834 evt
->get_epoch_sent()));
9838 * NOTE: dequeue called in worker thread, with pg lock
9840 void OSD::dequeue_op(
9841 PGRef pg
, OpRequestRef op
,
9842 ThreadPool::TPHandle
&handle
)
9844 const Message
*m
= op
->get_req();
9847 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9849 utime_t now
= ceph_clock_now();
9850 op
->set_dequeued_time(now
);
9852 utime_t latency
= now
- m
->get_recv_stamp();
9853 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9854 << " cost " << m
->get_cost()
9855 << " latency " << latency
9857 << " pg " << *pg
<< dendl
;
9859 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9861 service
.maybe_share_map(m
->get_connection().get(),
9865 if (pg
->is_deleting())
9868 op
->mark_reached_pg();
9869 op
->osd_trace
.event("dequeue_op");
9871 pg
->do_request(op
, handle
);
9874 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9875 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9879 void OSD::dequeue_peering_evt(
9882 PGPeeringEventRef evt
,
9883 ThreadPool::TPHandle
& handle
)
9885 PeeringCtx rctx
= create_context();
9886 auto curmap
= sdata
->get_osdmap();
9887 bool need_up_thru
= false;
9888 epoch_t same_interval_since
= 0;
9890 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9891 handle_pg_query_nopg(*q
);
9893 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9896 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9897 pg
->do_peering_event(evt
, rctx
);
9898 if (pg
->is_deleted()) {
9902 dispatch_context(rctx
, pg
, curmap
, &handle
);
9903 need_up_thru
= pg
->get_need_up_thru();
9904 same_interval_since
= pg
->get_same_interval_since();
9909 queue_want_up_thru(same_interval_since
);
9912 service
.send_pg_temp();
9915 void OSD::dequeue_delete(
9919 ThreadPool::TPHandle
& handle
)
9921 dequeue_peering_evt(
9925 std::make_shared
<PGPeeringEvent
>(
9927 PeeringState::DeleteSome())),
9933 // --------------------------------
9935 const char** OSD::get_tracked_conf_keys() const
9937 static const char* KEYS
[] = {
9938 "osd_max_backfills",
9939 "osd_min_recovery_priority",
9940 "osd_max_trimming_pgs",
9941 "osd_op_complaint_time",
9942 "osd_op_log_threshold",
9943 "osd_op_history_size",
9944 "osd_op_history_duration",
9945 "osd_op_history_slow_op_size",
9946 "osd_op_history_slow_op_threshold",
9947 "osd_enable_op_tracker",
9948 "osd_map_cache_size",
9949 "osd_pg_epoch_max_lag_factor",
9950 "osd_pg_epoch_persisted_max_stale",
9951 "osd_recovery_sleep",
9952 "osd_recovery_sleep_hdd",
9953 "osd_recovery_sleep_ssd",
9954 "osd_recovery_sleep_hybrid",
9956 "osd_delete_sleep_hdd",
9957 "osd_delete_sleep_ssd",
9958 "osd_delete_sleep_hybrid",
9959 "osd_snap_trim_sleep",
9960 "osd_snap_trim_sleep_hdd",
9961 "osd_snap_trim_sleep_ssd",
9962 "osd_snap_trim_sleep_hybrid"
9964 "osd_recovery_max_active",
9965 "osd_recovery_max_active_hdd",
9966 "osd_recovery_max_active_ssd",
9967 // clog & admin clog
9970 "clog_to_syslog_facility",
9971 "clog_to_syslog_level",
9972 "osd_objectstore_fuse",
9974 "clog_to_graylog_host",
9975 "clog_to_graylog_port",
9978 "osd_recovery_delay_start",
9979 "osd_client_message_size_cap",
9980 "osd_client_message_cap",
9981 "osd_heartbeat_min_size",
9982 "osd_heartbeat_interval",
9983 "osd_object_clean_region_max_num_intervals",
9984 "osd_scrub_min_interval",
9985 "osd_scrub_max_interval",
9991 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9992 const std::set
<std::string
> &changed
)
9994 std::lock_guard l
{osd_lock
};
9996 if (changed
.count("osd_max_backfills") ||
9997 changed
.count("osd_delete_sleep") ||
9998 changed
.count("osd_delete_sleep_hdd") ||
9999 changed
.count("osd_delete_sleep_ssd") ||
10000 changed
.count("osd_delete_sleep_hybrid") ||
10001 changed
.count("osd_snap_trim_sleep") ||
10002 changed
.count("osd_snap_trim_sleep_hdd") ||
10003 changed
.count("osd_snap_trim_sleep_ssd") ||
10004 changed
.count("osd_snap_trim_sleep_hybrid") ||
10005 changed
.count("osd_scrub_sleep") ||
10006 changed
.count("osd_recovery_sleep") ||
10007 changed
.count("osd_recovery_sleep_hdd") ||
10008 changed
.count("osd_recovery_sleep_ssd") ||
10009 changed
.count("osd_recovery_sleep_hybrid") ||
10010 changed
.count("osd_recovery_max_active") ||
10011 changed
.count("osd_recovery_max_active_hdd") ||
10012 changed
.count("osd_recovery_max_active_ssd")) {
10013 if (!maybe_override_options_for_qos() &&
10014 changed
.count("osd_max_backfills")) {
10015 // Scheduler is not "mclock". Fallback to earlier behavior
10016 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10017 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10020 if (changed
.count("osd_min_recovery_priority")) {
10021 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10022 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10024 if (changed
.count("osd_max_trimming_pgs")) {
10025 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
10027 if (changed
.count("osd_op_complaint_time") ||
10028 changed
.count("osd_op_log_threshold")) {
10029 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10030 cct
->_conf
->osd_op_log_threshold
);
10032 if (changed
.count("osd_op_history_size") ||
10033 changed
.count("osd_op_history_duration")) {
10034 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10035 cct
->_conf
->osd_op_history_duration
);
10037 if (changed
.count("osd_op_history_slow_op_size") ||
10038 changed
.count("osd_op_history_slow_op_threshold")) {
10039 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10040 cct
->_conf
->osd_op_history_slow_op_threshold
);
10042 if (changed
.count("osd_enable_op_tracker")) {
10043 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
10045 if (changed
.count("osd_map_cache_size")) {
10046 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10047 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10048 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10050 if (changed
.count("clog_to_monitors") ||
10051 changed
.count("clog_to_syslog") ||
10052 changed
.count("clog_to_syslog_level") ||
10053 changed
.count("clog_to_syslog_facility") ||
10054 changed
.count("clog_to_graylog") ||
10055 changed
.count("clog_to_graylog_host") ||
10056 changed
.count("clog_to_graylog_port") ||
10057 changed
.count("host") ||
10058 changed
.count("fsid")) {
10059 update_log_config();
10061 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10062 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10063 "osd_pg_epoch_max_lag_factor");
10066 #ifdef HAVE_LIBFUSE
10067 if (changed
.count("osd_objectstore_fuse")) {
10069 enable_disable_fuse(false);
10074 if (changed
.count("osd_recovery_delay_start")) {
10075 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10076 service
.kick_recovery_queue();
10079 if (changed
.count("osd_client_message_cap")) {
10080 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10081 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10082 if (pol
.throttler_messages
&& newval
> 0) {
10083 pol
.throttler_messages
->reset_max(newval
);
10086 if (changed
.count("osd_client_message_size_cap")) {
10087 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10088 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10089 if (pol
.throttler_bytes
&& newval
> 0) {
10090 pol
.throttler_bytes
->reset_max(newval
);
10093 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
10094 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
10097 if (changed
.count("osd_scrub_min_interval") ||
10098 changed
.count("osd_scrub_max_interval")) {
10099 resched_all_scrubs();
10100 dout(0) << __func__
<< ": scrub interval change" << dendl
;
10103 if (changed
.count("osd_asio_thread_count")) {
10104 service
.poolctx
.stop();
10105 service
.poolctx
.start(conf
.get_val
<std::uint64_t>("osd_asio_thread_count"));
10109 void OSD::maybe_override_max_osd_capacity_for_qos()
10111 // If the scheduler enabled is mclock, override the default
10112 // osd capacity with the value obtained from running the
10113 // osd bench test. This is later used to setup mclock.
10114 if ((cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") &&
10115 (cct
->_conf
.get_val
<bool>("osd_mclock_skip_benchmark") == false)) {
10116 std::string max_capacity_iops_config
;
10117 bool force_run_benchmark
=
10118 cct
->_conf
.get_val
<bool>("osd_mclock_force_run_benchmark_on_init");
10120 if (store_is_rotational
) {
10121 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_hdd";
10123 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_ssd";
10126 if (!force_run_benchmark
) {
10127 double default_iops
= 0.0;
10129 // Get the current osd iops capacity
10130 double cur_iops
= cct
->_conf
.get_val
<double>(max_capacity_iops_config
);
10132 // Get the default max iops capacity
10133 auto val
= cct
->_conf
.get_val_default(max_capacity_iops_config
);
10134 if (!val
.has_value()) {
10135 derr
<< __func__
<< " Unable to determine default value of "
10136 << max_capacity_iops_config
<< dendl
;
10137 // Cannot determine default iops. Force a run of the OSD benchmark.
10138 force_run_benchmark
= true;
10141 default_iops
= std::stod(val
.value());
10144 // Determine if we really need to run the osd benchmark
10145 if (!force_run_benchmark
&& (default_iops
!= cur_iops
)) {
10146 dout(1) << __func__
<< std::fixed
<< std::setprecision(2)
10147 << " default_iops: " << default_iops
10148 << " cur_iops: " << cur_iops
10149 << ". Skip OSD benchmark test." << dendl
;
10154 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
10155 int64_t count
= 12288000; // Count of bytes to write
10156 int64_t bsize
= 4096; // Block size
10157 int64_t osize
= 4194304; // Object size
10158 int64_t onum
= 100; // Count of objects to write
10159 double elapsed
= 0.0; // Time taken to complete the test
10162 int ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
10165 << " osd bench err: " << ret
10166 << " osd bench errstr: " << ss
.str()
10171 double rate
= count
/ elapsed
;
10172 iops
= rate
/ bsize
;
10173 dout(1) << __func__
10174 << " osd bench result -"
10175 << std::fixed
<< std::setprecision(3)
10176 << " bandwidth (MiB/sec): " << rate
/ (1024 * 1024)
10177 << " iops: " << iops
10178 << " elapsed_sec: " << elapsed
10181 // Persist iops to the MON store
10182 ret
= mon_cmd_set_config(max_capacity_iops_config
, std::to_string(iops
));
10184 // Fallback to setting the config within the in-memory "values" map.
10185 cct
->_conf
.set_val(max_capacity_iops_config
, std::to_string(iops
));
10188 // Override the max osd capacity for all shards
10189 for (auto& shard
: shards
) {
10190 shard
->update_scheduler_config();
10195 bool OSD::maybe_override_options_for_qos()
10197 // If the scheduler enabled is mclock, override the recovery, backfill
10198 // and sleep options so that mclock can meet the QoS goals.
10199 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") {
10200 dout(1) << __func__
10201 << ": Changing recovery/backfill/sleep settings for QoS" << dendl
;
10203 // Set high value for recovery max active
10204 uint32_t rec_max_active
= 1000;
10205 cct
->_conf
.set_val(
10206 "osd_recovery_max_active", std::to_string(rec_max_active
));
10207 cct
->_conf
.set_val(
10208 "osd_recovery_max_active_hdd", std::to_string(rec_max_active
));
10209 cct
->_conf
.set_val(
10210 "osd_recovery_max_active_ssd", std::to_string(rec_max_active
));
10212 // Set high value for osd_max_backfill
10213 uint32_t max_backfills
= 1000;
10214 cct
->_conf
.set_val("osd_max_backfills", std::to_string(max_backfills
));
10215 service
.local_reserver
.set_max(max_backfills
);
10216 service
.remote_reserver
.set_max(max_backfills
);
10218 // Disable recovery sleep
10219 cct
->_conf
.set_val("osd_recovery_sleep", std::to_string(0));
10220 cct
->_conf
.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10221 cct
->_conf
.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10222 cct
->_conf
.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10224 // Disable delete sleep
10225 cct
->_conf
.set_val("osd_delete_sleep", std::to_string(0));
10226 cct
->_conf
.set_val("osd_delete_sleep_hdd", std::to_string(0));
10227 cct
->_conf
.set_val("osd_delete_sleep_ssd", std::to_string(0));
10228 cct
->_conf
.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10230 // Disable snap trim sleep
10231 cct
->_conf
.set_val("osd_snap_trim_sleep", std::to_string(0));
10232 cct
->_conf
.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10233 cct
->_conf
.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10234 cct
->_conf
.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10236 // Disable scrub sleep
10237 cct
->_conf
.set_val("osd_scrub_sleep", std::to_string(0));
10243 int OSD::mon_cmd_set_config(const std::string
&key
, const std::string
&val
)
10247 "\"prefix\": \"config set\", "
10248 "\"who\": \"osd." + std::to_string(whoami
) + "\", "
10249 "\"name\": \"" + key
+ "\", "
10250 "\"value\": \"" + val
+ "\""
10253 vector
<std::string
> vcmd
{cmd
};
10257 monc
->start_mon_command(vcmd
, inbl
, nullptr, &outs
, &cond
);
10258 int r
= cond
.wait();
10260 derr
<< __func__
<< " Failed to set config key " << key
10261 << " err: " << cpp_strerror(r
)
10262 << " errstr: " << outs
<< dendl
;
10269 void OSD::update_log_config()
10271 map
<string
,string
> log_to_monitors
;
10272 map
<string
,string
> log_to_syslog
;
10273 map
<string
,string
> log_channel
;
10274 map
<string
,string
> log_prio
;
10275 map
<string
,string
> log_to_graylog
;
10276 map
<string
,string
> log_to_graylog_host
;
10277 map
<string
,string
> log_to_graylog_port
;
10281 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10282 log_channel
, log_prio
, log_to_graylog
,
10283 log_to_graylog_host
, log_to_graylog_port
,
10285 clog
->update_config(log_to_monitors
, log_to_syslog
,
10286 log_channel
, log_prio
, log_to_graylog
,
10287 log_to_graylog_host
, log_to_graylog_port
,
10289 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
10292 void OSD::check_config()
10294 // some sanity checks
10295 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10296 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10297 << " is not > osd_pg_epoch_persisted_max_stale ("
10298 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10300 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
10301 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
10302 << cct
->_conf
->osd_object_clean_region_max_num_intervals
10307 // --------------------------------
10309 void OSD::get_latest_osdmap()
10311 dout(10) << __func__
<< " -- start" << dendl
;
10313 boost::system::error_code ec
;
10314 service
.objecter
->wait_for_latest_osdmap(ceph::async::use_blocked
[ec
]);
10316 dout(10) << __func__
<< " -- finish" << dendl
;
10319 // --------------------------------
10321 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
10322 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
10323 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
10324 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10326 std::list
<OSDPerfMetricQuery
> supported_queries
;
10327 for (auto &it
: queries
) {
10328 auto &query
= it
.first
;
10329 if (!query
.key_descriptor
.empty()) {
10330 supported_queries
.push_back(query
);
10333 if (supported_queries
.size() < queries
.size()) {
10334 dout(1) << queries
.size() - supported_queries
.size()
10335 << " unsupported queries" << dendl
;
10338 std::lock_guard locker
{m_perf_queries_lock
};
10339 m_perf_queries
= supported_queries
;
10340 m_perf_limits
= queries
;
10342 std::vector
<PGRef
> pgs
;
10344 for (auto& pg
: pgs
) {
10345 std::scoped_lock l
{*pg
};
10346 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10350 MetricPayload
OSD::get_perf_reports() {
10351 OSDMetricPayload payload
;
10352 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
10354 std::vector
<PGRef
> pgs
;
10356 DynamicPerfStats dps
;
10357 for (auto& pg
: pgs
) {
10358 // m_perf_queries can be modified only in set_perf_queries by mgr client
10359 // request, and it is protected by by mgr client's lock, which is held
10360 // when set_perf_queries/get_perf_reports are called, so we may not hold
10361 // m_perf_queries_lock here.
10362 DynamicPerfStats
pg_dps(m_perf_queries
);
10364 pg
->get_dynamic_perf_stats(&pg_dps
);
10368 dps
.add_to_reports(m_perf_limits
, &reports
);
10369 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
10374 // =============================================================
10376 #undef dout_context
10377 #define dout_context cct
10379 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10381 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10383 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10385 pg
->osd_shard
= this;
10386 pg
->pg_slot
= slot
;
10387 osd
->inc_num_pgs();
10389 slot
->epoch
= pg
->get_osdmap_epoch();
10390 pg_slots_by_epoch
.insert(*slot
);
10393 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10395 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10396 slot
->pg
->osd_shard
= nullptr;
10397 slot
->pg
->pg_slot
= nullptr;
10398 slot
->pg
= nullptr;
10399 osd
->dec_num_pgs();
10401 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10403 if (waiting_for_min_pg_epoch
) {
10404 min_pg_epoch_cond
.notify_all();
// Move a slot to a new epoch position in pg_slots_by_epoch: erase, update,
// re-insert, then wake wait_min_pg_epoch() waiters in case the minimum moved.
// NOTE(review): the statement assigning the new epoch to the slot (between
// the erase at 10413 and the insert at 10416) appears elided by extraction --
// confirm `slot->epoch = e;` exists upstream.
10408 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10410 std::lock_guard
l(shard_lock
);
10411 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10412 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
// Intrusive-set members must be removed before their key (epoch) changes.
10413 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10414 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10416 pg_slots_by_epoch
.insert(*slot
);
10417 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10418 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
// Re-insertion may have raised the minimum epoch; notify waiters.
10419 if (waiting_for_min_pg_epoch
) {
10420 min_pg_epoch_cond
.notify_all();
// Return the smallest osdmap epoch among this shard's attached PG slots
// (front of the epoch-ordered pg_slots_by_epoch set), under shard_lock.
// NOTE(review): the return statements (empty-set case and the normal
// `return p->epoch;`) are elided by extraction -- confirm upstream.
10424 epoch_t
OSDShard::get_min_pg_epoch()
10426 std::lock_guard
l(shard_lock
);
10427 auto p
= pg_slots_by_epoch
.begin();
// Empty set means no PGs are attached; upstream presumably returns 0 here.
10428 if (p
== pg_slots_by_epoch
.end()) {
// Block until every PG slot on this shard has advanced to at least `need`,
// i.e. until the minimum slot epoch >= need (or there are no slots at all).
// Uses a predicate wait on min_pg_epoch_cond; _detach_pg/update_pg_epoch
// notify it. The waiting_for_min_pg_epoch counter gates those notifies.
// NOTE(review): the lambda's `return true/false` lines are elided by
// extraction; the visible branches imply empty-set and epoch>=need both
// satisfy the wait -- confirm upstream.
10434 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10436 std::unique_lock l
{shard_lock
};
10437 ++waiting_for_min_pg_epoch
;
10438 min_pg_epoch_cond
.wait(l
, [need
, this] {
10439 if (pg_slots_by_epoch
.empty()) {
10441 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10444 dout(10) << need
<< " waiting on "
10445 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10449 --waiting_for_min_pg_epoch
;
// Return the largest epoch any slot on this shard is waiting for in its
// waiting_peering map (rbegin()->first is the max key of each std::map).
// NOTE(review): the declaration/initialization of `r` and the final
// `return r;` are elided by extraction -- confirm upstream.
10452 epoch_t
OSDShard::get_max_waiting_epoch()
10454 std::lock_guard
l(shard_lock
);
// Scan every slot; waiting_peering is keyed by epoch, so its last key is
// the highest epoch that slot is blocked on.
10456 for (auto& i
: pg_slots
) {
10457 if (!i
.second
->waiting_peering
.empty()) {
10458 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
// Install a new osdmap on this shard and reconcile every PG slot with it:
//  - requeue waiting_peering items whose epoch the new map now satisfies,
//  - for slots whose pgid still maps to this OSD, keep waiting items,
//  - otherwise drop waiting items at or below the new epoch, crediting their
//    reserved recovery pushes back via *pushes_to_free,
//  - prune slots that are completely idle,
// then poke one worker thread. Slots mid-split or mid-merge are left alone.
// NOTE(review): extraction elided several lines here (e.g. `continue`s,
// closing braces, part of the "stale/misdirected" ternary at 10526); compare
// with upstream before editing.
10464 void OSDShard::consume_map(
10465 const OSDMapRef
& new_osdmap
,
10466 unsigned *pushes_to_free
)
10468 std::lock_guard
l(shard_lock
);
10469 OSDMapRef old_osdmap
;
// Swap in the new map under the dedicated osdmap_lock (readers take only
// that lock); keep the old ref so its epoch can be logged below.
10471 std::lock_guard
l(osdmap_lock
);
10472 old_osdmap
= std::move(shard_osdmap
);
10473 shard_osdmap
= new_osdmap
;
10475 dout(10) << new_osdmap
->get_epoch()
10476 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10478 bool queued
= false;
// Walk all slots; the loop erases as it goes, so it uses iterator style.
10481 auto p
= pg_slots
.begin();
10482 while (p
!= pg_slots
.end()) {
10483 OSDShardPGSlot
*slot
= p
->second
.get();
10484 const spg_t
& pgid
= p
->first
;
10485 dout(20) << __func__
<< " " << pgid
<< dendl
;
// Slots still waiting on a split are skipped entirely.
10486 if (!slot
->waiting_for_split
.empty()) {
10487 dout(20) << __func__
<< " " << pgid
10488 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
// Likewise slots waiting on a merge that the new map hasn't reached yet.
10492 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10493 dout(20) << __func__
<< " " << pgid
10494 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
// Requeue peering waiters whose first wanted epoch the new map satisfies.
10499 if (!slot
->waiting_peering
.empty()) {
10500 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10501 if (first
<= new_osdmap
->get_epoch()) {
10502 dout(20) << __func__
<< " " << pgid
10503 << " pending_peering first epoch " << first
10504 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10505 _wake_pg_slot(pgid
, slot
);
10511 if (!slot
->waiting
.empty()) {
// If the pgid still maps to this OSD shard, the waiters stay queued.
10512 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10513 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
// Otherwise drop every waiter already covered by the new epoch, returning
// its reserved pushes to the caller's tally.
10518 while (!slot
->waiting
.empty() &&
10519 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10520 auto& qi
= slot
->waiting
.front();
10521 dout(20) << __func__
<< " " << pgid
10522 << " waiting item " << qi
10523 << " epoch " << qi
.get_map_epoch()
10524 << " <= " << new_osdmap
->get_epoch()
10526 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10528 << ", dropping" << dendl
;
10529 *pushes_to_free
+= qi
.get_reserved_pushes();
10530 slot
->waiting
.pop_front();
// Slot is fully idle (no waiters, nothing running, no split pending):
// erase it to keep pg_slots tight.
10533 if (slot
->waiting
.empty() &&
10534 slot
->num_running
== 0 &&
10535 slot
->waiting_for_split
.empty() &&
10537 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10538 p
= pg_slots
.erase(p
);
// Wake one worker (presumably only if something was requeued -- the `queued`
// check line appears elided); notify_one suffices for a single wake-up.
10545 std::lock_guard l
{sdata_wait_lock
};
10546 sdata_cond
.notify_one();
// Requeue everything parked on a slot back onto the shard's scheduler:
// to_process, waiting, and waiting_peering are all drained via
// enqueue_front. Iteration is in reverse (rbegin/rend) so that after the
// repeated push-to-front the original ordering is preserved. Finally bumps
// requeue_seq so racing _process() calls can detect they were preempted.
// NOTE(review): extraction-mangled (loop increments/braces elided); the
// pgid parameter line is also missing from this view.
10550 void OSDShard::_wake_pg_slot(
10552 OSDShardPGSlot
*slot
)
10554 dout(20) << __func__
<< " " << pgid
10555 << " to_process " << slot
->to_process
10556 << " waiting " << slot
->waiting
10557 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
// Drain to_process first (reverse order + enqueue_front keeps FIFO order).
10558 for (auto i
= slot
->to_process
.rbegin();
10559 i
!= slot
->to_process
.rend();
10561 scheduler
->enqueue_front(std::move(*i
));
10563 slot
->to_process
.clear();
// Then the map-epoch waiters.
10564 for (auto i
= slot
->waiting
.rbegin();
10565 i
!= slot
->waiting
.rend();
10567 scheduler
->enqueue_front(std::move(*i
));
10569 slot
->waiting
.clear();
// Then all peering waiters, for every epoch bucket.
10570 for (auto i
= slot
->waiting_peering
.rbegin();
10571 i
!= slot
->waiting_peering
.rend();
10573 // this is overkill; we requeue everything, even if some of these
10574 // items are waiting for maps we don't have yet. FIXME, maybe,
10575 // someday, if we decide this inefficiency matters
10576 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10577 scheduler
->enqueue_front(std::move(*j
));
10580 slot
->waiting_peering
.clear();
// Invalidate any in-flight _process() snapshot of this slot.
10581 ++slot
->requeue_seq
;
// For every slot on this shard, ask the OSD service which PGs will split or
// merge between the shard's current osdmap and as_of_osdmap. Slots with an
// attached PG report both splits and merges; slots that only exist because
// a split is pending report splits only (merge_pgs passed as nullptr).
// NOTE(review): the `if (slot->pg)` branch condition at original line 10594
// is elided by this extraction -- the else-if chain implies it; confirm
// upstream.
10584 void OSDShard::identify_splits_and_merges(
10585 const OSDMapRef
& as_of_osdmap
,
10586 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10587 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10589 std::lock_guard
l(shard_lock
);
// Nothing to compare against until the shard has consumed a map.
10590 if (shard_osdmap
) {
10591 for (auto& i
: pg_slots
) {
10592 const spg_t
& pgid
= i
.first
;
10593 auto *slot
= i
.second
.get();
// Attached PG: report both split and merge candidates for this pgid.
10595 osd
->service
.identify_splits_and_merges(
10596 shard_osdmap
, as_of_osdmap
, pgid
,
10597 split_pgs
, merge_pgs
);
// Slot primed for a split but with no PG yet: splits only.
10598 } else if (!slot
->waiting_for_split
.empty()) {
10599 osd
->service
.identify_splits_and_merges(
10600 shard_osdmap
, as_of_osdmap
, pgid
,
10601 split_pgs
, nullptr);
10603 dout(20) << __func__
<< " slot " << pgid
10604 << " has no pg and waiting_for_split " << dendl
;
// Public entry for priming split children on this shard. Takes shard_lock,
// primes the given children, and -- if this shard's osdmap is already ahead
// of as_of_osdmap -- additionally computes grandchildren that appear between
// the two epochs and primes those too, so no split generation is missed.
// NOTE(review): extraction-mangled; trailing lines of the explanatory
// comment (after 10633) are cut off in this view.
10610 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10611 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10613 std::lock_guard
l(shard_lock
);
10614 _prime_splits(pgids
);
// Shard map newer than the caller's reference map: children may themselves
// have split again in the interim; find and prime those as well.
10615 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10616 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10617 for (auto i
: *pgids
) {
10618 osd
->service
.identify_splits_and_merges(
10619 as_of_osdmap
, shard_osdmap
, i
.first
,
10620 &newer_children
, nullptr);
// Merge the original children in so one _prime_splits pass covers both.
10622 newer_children
.insert(pgids
->begin(), pgids
->end());
10623 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10624 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10626 _prime_splits(&newer_children
);
10627 // note: we don't care what is left over here for other shards.
10628 // if this shard is ahead of us and one isn't, e.g., one thread is
10629 // calling into prime_splits via _process (due to a newly created
10630 // pg) and this shard has a newer map due to a racing consume_map,
10631 // then any grandchildren left here will be identified (or were
10632 // identified) when the slower shard's osdmap is advanced.
10633 // _prime_splits() will tolerate the case where the pgid is
// Prime the subset of split children that hash to THIS shard: create (or
// reuse) a pg slot for each and record the split epoch in its
// waiting_for_split set, then erase the entry from *pgids. Entries that
// hash to other shards are left in *pgids for the caller to distribute.
// Presumably requires shard_lock held (prime_splits takes it before
// calling) -- confirm. NOTE(review): extraction elided the emplace-failure
// branch around 10650-10651 (where `q` is obtained) and loop braces.
10638 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10640 dout(10) << *pgids
<< dendl
;
10641 auto p
= pgids
->begin();
10642 while (p
!= pgids
->end()) {
// Route by hash: only children belonging to this shard are handled here.
10643 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10644 if (shard_index
== shard_id
) {
10645 auto r
= pg_slots
.emplace(p
->first
, nullptr);
// New slot inserted: allocate it and mark the pending split epoch.
10647 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10648 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10649 r
.first
->second
->waiting_for_split
.insert(p
->second
);
// Existing slot: just add this split epoch to its pending set.
10652 ceph_assert(q
!= pg_slots
.end());
10653 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10655 q
->second
->waiting_for_split
.insert(p
->second
);
// Consumed: remove from the caller's set.
10657 p
= pgids
->erase(p
);
// For each (pgid, merge-epoch) pair that hashes to this shard, ensure there
// is a merge participant: reuse an existing attached PG, defer if a split
// for an earlier epoch is still pending, or fabricate an empty PG (history
// left zeroed for PG::merge_from() to fill) so the merge can proceed. The
// slot is stamped with waiting_for_merge_epoch and the entry is erased from
// *merge_pgs; pairs belonging to other shards are skipped for the caller.
// NOTE(review): extraction-mangled -- the branch conditions around
// 10686/10689 (`if (slot->pg)` etc.) and several braces are elided; confirm
// the control flow upstream before editing.
10664 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10665 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10667 std::lock_guard
l(shard_lock
);
10668 dout(20) << __func__
<< " checking shard " << shard_id
10669 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10670 auto p
= merge_pgs
->begin();
10671 while (p
!= merge_pgs
->end()) {
10672 spg_t pgid
= p
->first
;
10673 epoch_t epoch
= p
->second
;
// Entries for other shards are left in the set for their owners.
10674 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10675 if (shard_index
!= shard_id
) {
10679 OSDShardPGSlot
*slot
;
10680 auto r
= pg_slots
.emplace(pgid
, nullptr);
10682 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10684 slot
= r
.first
->second
.get();
// Case 1: a PG is already attached -- it participates as-is.
10687 dout(20) << __func__
<< " have merge participant pg " << pgid
10688 << " " << slot
->pg
<< dendl
;
// Case 2: a split older than the merge epoch is still pending; wait for it.
10689 } else if (!slot
->waiting_for_split
.empty() &&
10690 *slot
->waiting_for_split
.begin() < epoch
) {
10691 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10692 << " " << slot
->waiting_for_split
<< dendl
;
// Case 3: no PG -- create an empty placeholder so the merge has a target.
10694 dout(20) << __func__
<< " creating empty merge participant " << pgid
10695 << " for merge in " << epoch
<< dendl
;
10696 // leave history zeroed; PG::merge_from() will fill it in.
10697 pg_history_t history
;
10698 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10699 history
, PastIntervals(), false);
10700 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10701 _attach_pg(r
.first
->second
.get(), pg
.get());
10702 _wake_pg_slot(pgid
, slot
);
10705 // mark slot for merge
10706 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10707 slot
->waiting_for_merge_epoch
= epoch
;
10708 p
= merge_pgs
->erase(p
);
// Bind a freshly split child PG to the slot that _prime_splits() created for
// it. The slot must exist, have no PG attached, and be expecting a split at
// exactly this PG's osdmap epoch. Once the last expected split epoch is
// erased, the slot's parked work is requeued, a peering event is enqueued so
// the child catches up to the latest osdmap, and a worker is poked.
// NOTE(review): extraction elided several lines (the `epoch` declaration at
// 10726, the NullEvt payload of the PGPeeringEvent around 10740-10746, and
// assorted braces); compare with upstream.
10712 void OSDShard::register_and_wake_split_child(PG
*pg
)
10716 std::lock_guard
l(shard_lock
);
10717 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
// The slot was pre-created by split priming; it must be found.
10718 auto p
= pg_slots
.find(pg
->pg_id
);
10719 ceph_assert(p
!= pg_slots
.end());
10720 auto *slot
= p
->second
.get();
10721 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
// Invariants: no PG attached yet, and a split genuinely pending.
10723 ceph_assert(!slot
->pg
);
10724 ceph_assert(!slot
->waiting_for_split
.empty());
10725 _attach_pg(slot
, pg
);
// This child's creation epoch must be one of the expected split epochs.
10727 epoch
= pg
->get_osdmap_epoch();
10728 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10729 slot
->waiting_for_split
.erase(epoch
);
// All expected splits done: release the slot's parked work.
10730 if (slot
->waiting_for_split
.empty()) {
10731 _wake_pg_slot(pg
->pg_id
, slot
);
10733 dout(10) << __func__
<< " still waiting for split on "
10734 << slot
->waiting_for_split
<< dendl
;
10738 // kick child to ensure it pulls up to the latest osdmap
10739 osd
->enqueue_peering_evt(
10742 std::make_shared
<PGPeeringEvent
>(
// Wake one worker to pick up the requeued/enqueued work.
10747 std::lock_guard l
{sdata_wait_lock
};
10748 sdata_cond
.notify_one();
// Undo split priming for all children of `parent` (as computed against
// old_pg_num): requeue whatever was parked on each child slot, then erase
// the slots. Deletion is deferred to a second loop because _wake_pg_slot is
// called while iterating pg_slots and erasing in-loop would invalidate the
// range-for iterator.
// NOTE(review): extraction-mangled (braces elided); verify upstream.
10751 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10753 std::lock_guard
l(shard_lock
);
10754 vector
<spg_t
> to_delete
;
10755 for (auto& i
: pg_slots
) {
// A child is any pgid (other than the parent itself) whose ancestor at
// old_pg_num is the parent.
10756 if (i
.first
!= parent
&&
10757 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10758 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10760 _wake_pg_slot(i
.first
, i
.second
.get());
10761 to_delete
.push_back(i
.first
);
// Second pass: safe to erase now that iteration is done.
10764 for (auto pgid
: to_delete
) {
10765 pg_slots
.erase(pgid
);
// Propagate a runtime configuration change to this shard's op scheduler,
// serialized against the shard's other scheduler users via shard_lock.
10769 void OSDShard::update_scheduler_config()
10771 std::lock_guard
l(shard_lock
);
10772 scheduler
->update_configuration();
// OSDShard constructor: derives the shard's lock/queue names from its id,
// builds the named mutexes, constructs the op scheduler (tuned by shard
// count and whether the backing store is rotational), and wires the context
// queue to this shard's wait lock/condvar. Logs the chosen scheduler at
// level 0 so the active scheduler is always visible in the log.
// NOTE(review): the parameter list (original lines 10776-10781) is elided by
// this extraction -- the initializer list references id, cct, osd; confirm
// the signature upstream.
10775 OSDShard::OSDShard(
10782 shard_name(string("OSDShard.") + stringify(id
)),
10783 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10784 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10785 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10786 shard_lock_name(shard_name
+ "::shard_lock"),
10787 shard_lock
{make_mutex(shard_lock_name
)},
10788 scheduler(ceph::osd::scheduler::make_scheduler(
10789 cct
, osd
->num_shards
, osd
->store
->is_rotational())),
10790 context_queue(sdata_wait_lock
, sdata_cond
)
10792 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10796 // =============================================================
10798 #undef dout_context
10799 #define dout_context osd->cct
10801 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
// Park a queued item on its pg slot until a newer osdmap arrives: peering
// items go into waiting_peering bucketed by the epoch they need (so
// consume_map can release exactly the ready buckets); everything else goes
// onto the slot's plain `waiting` list.
// NOTE(review): the pgid parameter line (original 10804) and the else/brace
// lines are elided by this extraction.
10803 void OSD::ShardedOpWQ::_add_slot_waiter(
10805 OSDShardPGSlot
*slot
,
10806 OpSchedulerItem
&& qi
)
10808 if (qi
.is_peering()) {
10809 dout(20) << __func__
<< " " << pgid
10810 << " peering, item epoch is "
10811 << qi
.get_map_epoch()
10812 << ", will wait on " << qi
<< dendl
;
// Bucket by required epoch so release can be per-epoch.
10813 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
// Non-peering items wait on the flat list.
10815 dout(20) << __func__
<< " " << pgid
10816 << " item epoch is "
10817 << qi
.get_map_epoch()
10818 << ", will wait on " << qi
<< dendl
;
10819 slot
->waiting
.push_back(std::move(qi
));
10824 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread main step for the sharded op queue. One invocation:
//  1. maps the thread to its shard and (for the lowest-indexed thread per
//     shard only, to keep oncommit ordering) drains the context queue;
//  2. sleeps on sdata_cond while the scheduler is empty, managing the
//     heartbeat timeout around the wait;
//  3. dequeues a WorkItem, handling the "scheduled in the future" variant
//     (a double wake-up time) by waiting until that time;
//  4. creates/looks up the pg slot for the item's ordering token, records
//     requeue_seq, releases shard_lock, (re)locks, and re-validates the slot
//     against races with pg removal / _wake_pg_slot / consume_map;
//  5. if the slot has no PG: either waits (split pending, newer map needed,
//     or pg should exist here), creates the PG for a peering create event
//     (then primes split children across all shards), or drops the item and
//     returns its reserved pushes / shares the map with the client;
//  6. otherwise runs the item under a TPHandle with tracepoints around it,
//     then processes any accumulated oncommit contexts.
// NOTE(review): this extraction has dropped many lines (braces, `return`s,
// `continue`s, pg->lock() around 10955, the `while(slot->to_process.empty())
// `/requeue handling, the OSDMapRef/reqid declarations, parts of the debug
// formatter dump). Treat the flow description above as reconstructed from
// what is visible -- verify against the pristine file before editing.
10826 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10828 uint32_t shard_index
= thread_index
% osd
->num_shards
;
10829 auto& sdata
= osd
->shards
[shard_index
];
10830 ceph_assert(sdata
);
10832 // If all threads of shards do oncommits, there is a out-of-order
10833 // problem. So we choose the thread which has the smallest
10834 // thread_index(thread_index < num_shards) of shard to do oncommit
10836 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
// Idle path: nothing schedulable and (for the designated thread) no
// contexts either -- wait on the shard's condvar.
10839 sdata
->shard_lock
.lock();
10840 if (sdata
->scheduler
->empty() &&
10841 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
10842 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10843 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10844 // we raced with a context_queue addition, don't wait
10845 wait_lock
.unlock();
10846 } else if (!sdata
->stop_waiting
) {
10847 dout(20) << __func__
<< " empty q, waiting" << dendl
;
// Suspend the heartbeat timeout while blocked -- an idle wait is not a hang.
10848 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10849 sdata
->shard_lock
.unlock();
10850 sdata
->sdata_cond
.wait(wait_lock
);
10851 wait_lock
.unlock();
10852 sdata
->shard_lock
.lock();
10853 if (sdata
->scheduler
->empty() &&
10854 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
10855 sdata
->shard_lock
.unlock();
10858 // found a work item; reapply default wq timeouts
10859 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10860 timeout_interval
, suicide_interval
);
// stop_waiting set (shutdown/drain): bail out of this pass immediately.
10862 dout(20) << __func__
<< " need return immediately" << dendl
;
10863 wait_lock
.unlock();
10864 sdata
->shard_lock
.unlock();
// Only the designated lowest-index thread drains oncommit contexts, which
// preserves commit ordering across the shard.
10869 list
<Context
*> oncommits
;
10870 if (is_smallest_thread_index
) {
10871 sdata
->context_queue
.move_to(oncommits
);
// Dequeue until we hold a real OpSchedulerItem (the variant may instead
// carry a future-ready time as a double).
10874 WorkItem work_item
;
10875 while (!std::get_if
<OpSchedulerItem
>(&work_item
)) {
10876 if (sdata
->scheduler
->empty()) {
10877 if (osd
->is_stopping()) {
10878 sdata
->shard_lock
.unlock();
10879 for (auto c
: oncommits
) {
10880 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10883 return; // OSD shutdown, discard.
10885 sdata
->shard_lock
.unlock();
10886 handle_oncommits(oncommits
);
10890 work_item
= sdata
->scheduler
->dequeue();
10891 if (osd
->is_stopping()) {
10892 sdata
->shard_lock
.unlock();
10893 for (auto c
: oncommits
) {
10894 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10897 return; // OSD shutdown, discard.
10900 // If the work item is scheduled in the future, wait until
10901 // the time returned in the dequeue response before retrying.
10902 if (auto when_ready
= std::get_if
<double>(&work_item
)) {
10903 if (is_smallest_thread_index
) {
10904 sdata
->shard_lock
.unlock();
10905 handle_oncommits(oncommits
);
10908 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10909 auto future_time
= ceph::real_clock::from_double(*when_ready
);
10910 dout(10) << __func__
<< " dequeue future request at " << future_time
<< dendl
;
10911 // Disable heartbeat timeout until we find a non-future work item to process.
10912 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10913 sdata
->shard_lock
.unlock();
// waiting_threads lets _enqueue know someone can be woken early.
10914 ++sdata
->waiting_threads
;
10915 sdata
->sdata_cond
.wait_until(wait_lock
, future_time
);
10916 --sdata
->waiting_threads
;
10917 wait_lock
.unlock();
10918 sdata
->shard_lock
.lock();
10919 // Reapply default wq timeouts
10920 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10921 timeout_interval
, suicide_interval
);
10925 // Access the stored item
10926 auto item
= std::move(std::get
<OpSchedulerItem
>(work_item
));
10927 if (osd
->is_stopping()) {
10928 sdata
->shard_lock
.unlock();
10929 for (auto c
: oncommits
) {
10930 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10933 return; // OSD shutdown, discard.
// All items with the same ordering token (pgid) serialize through one slot.
10936 const auto token
= item
.get_ordering_token();
10937 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
10939 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10941 OSDShardPGSlot
*slot
= r
.first
->second
.get();
10942 dout(20) << __func__
<< " " << token
10943 << (r
.second
? " (new)" : "")
10944 << " to_process " << slot
->to_process
10945 << " waiting " << slot
->waiting
10946 << " waiting_peering " << slot
->waiting_peering
10948 slot
->to_process
.push_back(std::move(item
));
10949 dout(20) << __func__
<< " " << slot
->to_process
.back()
10950 << " queued" << dendl
;
10953 PGRef pg
= slot
->pg
;
10955 // lock pg (if we have it)
// Snapshot requeue_seq before dropping shard_lock: if _wake_pg_slot runs
// while we hold only the pg lock, the seq changes and we must restart.
10957 // note the requeue seq now...
10958 uint64_t requeue_seq
= slot
->requeue_seq
;
10959 ++slot
->num_running
;
10961 sdata
->shard_lock
.unlock();
10962 osd
->service
.maybe_inject_dispatch_delay();
10964 osd
->service
.maybe_inject_dispatch_delay();
10965 sdata
->shard_lock
.lock();
// Re-validate everything after re-acquiring shard_lock.
10967 auto q
= sdata
->pg_slots
.find(token
);
10968 if (q
== sdata
->pg_slots
.end()) {
10969 // this can happen if we race with pg removal.
10970 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
10972 sdata
->shard_lock
.unlock();
10973 handle_oncommits(oncommits
);
10976 slot
= q
->second
.get();
10977 --slot
->num_running
;
10979 if (slot
->to_process
.empty()) {
10980 // raced with _wake_pg_slot or consume_map
10981 dout(20) << __func__
<< " " << token
10982 << " nothing queued" << dendl
;
10984 sdata
->shard_lock
.unlock();
10985 handle_oncommits(oncommits
);
10988 if (requeue_seq
!= slot
->requeue_seq
) {
10989 dout(20) << __func__
<< " " << token
10990 << " requeue_seq " << slot
->requeue_seq
<< " > our "
10991 << requeue_seq
<< ", we raced with _wake_pg_slot"
10994 sdata
->shard_lock
.unlock();
10995 handle_oncommits(oncommits
);
10998 if (slot
->pg
!= pg
) {
10999 // this can happen if we race with pg removal.
11000 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
11007 dout(20) << __func__
<< " " << token
11008 << " to_process " << slot
->to_process
11009 << " waiting " << slot
->waiting
11010 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
11012 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
11016 auto qi
= std::move(slot
->to_process
.front());
11017 slot
->to_process
.pop_front();
11018 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
11019 set
<pair
<spg_t
,epoch_t
>> new_children
;
// ----- no PG attached: decide wait / create / drop -----
11023 // should this pg shard exist on this osd in this (or a later) epoch?
11024 osdmap
= sdata
->shard_osdmap
;
11025 const PGCreateInfo
*create_info
= qi
.creates_pg();
11026 if (!slot
->waiting_for_split
.empty()) {
11027 dout(20) << __func__
<< " " << token
11028 << " splitting " << slot
->waiting_for_split
<< dendl
;
11029 _add_slot_waiter(token
, slot
, std::move(qi
));
11030 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11031 dout(20) << __func__
<< " " << token
11032 << " map " << qi
.get_map_epoch() << " > "
11033 << osdmap
->get_epoch() << dendl
;
11034 _add_slot_waiter(token
, slot
, std::move(qi
));
11035 } else if (qi
.is_peering()) {
11036 if (!qi
.peering_requires_pg()) {
11037 // for pg-less events, we run them under the ordering lock, since
11038 // we don't have the pg lock to keep them ordered.
11039 qi
.run(osd
, sdata
, pg
, tp_handle
);
11040 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
// Mon-initiated creates are only honored while we are still primary.
11042 if (create_info
->by_mon
&&
11043 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
11044 dout(20) << __func__
<< " " << token
11045 << " no pg, no longer primary, ignoring mon create on "
11048 dout(20) << __func__
<< " " << token
11049 << " no pg, should create on " << qi
<< dendl
;
11050 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
11052 // we created the pg! drop out and continue "normally"!
11053 sdata
->_attach_pg(slot
, pg
.get());
11054 sdata
->_wake_pg_slot(token
, slot
);
11056 // identify split children between create epoch and shard epoch.
11057 osd
->service
.identify_splits_and_merges(
11058 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
11059 sdata
->_prime_splits(&new_children
);
11060 // distribute remaining split children to other shards below!
11063 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
11066 dout(20) << __func__
<< " " << token
11067 << " no pg, peering, !create, discarding " << qi
<< dendl
;
11070 dout(20) << __func__
<< " " << token
11071 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
11072 << ", discarding " << qi
11075 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11076 dout(20) << __func__
<< " " << token
11077 << " no pg, should exist e" << osdmap
->get_epoch()
11078 << ", will wait on " << qi
<< dendl
;
11079 _add_slot_waiter(token
, slot
, std::move(qi
));
11081 dout(20) << __func__
<< " " << token
11082 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
11083 << ", dropping " << qi
<< dendl
;
11084 // share map with client?
11085 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11086 osd
->service
.maybe_share_map((*_op
)->get_req()->get_connection().get(),
11087 sdata
->shard_osdmap
,
11088 (*_op
)->sent_epoch
);
// Dropped item: return its reserved recovery pushes to the service.
11090 unsigned pushes_to_free
= qi
.get_reserved_pushes();
11091 if (pushes_to_free
> 0) {
11092 sdata
->shard_lock
.unlock();
11093 osd
->service
.release_reserved_pushes(pushes_to_free
);
11094 handle_oncommits(oncommits
);
11098 sdata
->shard_lock
.unlock();
11099 handle_oncommits(oncommits
);
// ----- PG attached: re-check peering items against the current map -----
11102 if (qi
.is_peering()) {
11103 OSDMapRef osdmap
= sdata
->shard_osdmap
;
11104 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11105 _add_slot_waiter(token
, slot
, std::move(qi
));
11106 sdata
->shard_lock
.unlock();
11108 handle_oncommits(oncommits
);
11112 sdata
->shard_lock
.unlock();
// Hand any split children discovered during pg creation to their shards.
11114 if (!new_children
.empty()) {
11115 for (auto shard
: osd
->shards
) {
11116 shard
->prime_splits(osdmap
, &new_children
);
11118 ceph_assert(new_children
.empty());
11121 // osd_opwq_process marks the point at which an operation has been dequeued
11122 // and will begin to be handled by a worker thread.
11126 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11127 reqid
= (*_op
)->get_reqid();
11130 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
11131 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
// Level-30 diagnostic dump of queue state as JSON.
11134 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
11135 Formatter
*f
= Formatter::create("json");
11136 f
->open_object_section("q");
11138 f
->close_section();
// Execute the item (op, peering event, recovery work, ...).
11143 qi
.run(osd
, sdata
, pg
, tp_handle
);
11148 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11149 reqid
= (*_op
)->get_reqid();
11152 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
11153 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11156 handle_oncommits(oncommits
);
// Route an item to its shard (by hashing the ordering token over the shard
// count) and enqueue it on that shard's scheduler. If the queue was empty,
// every sleeping worker is woken (notify_all); otherwise only one is poked,
// and only if some thread is actually parked (waiting_threads > 0).
// NOTE(review): the `empty` declaration and the `if (empty)` line around the
// notify choice are elided by this extraction; the visible else-if implies
// them -- confirm upstream.
11159 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
11160 uint32_t shard_index
=
11161 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11163 dout(20) << __func__
<< " " << item
<< dendl
;
11165 OSDShard
* sdata
= osd
->shards
[shard_index
];
11166 assert (NULL
!= sdata
);
// Record emptiness before enqueueing so the wake-up policy below can tell
// whether workers might all be asleep.
11170 std::lock_guard l
{sdata
->shard_lock
};
11171 empty
= sdata
->scheduler
->empty();
11172 sdata
->scheduler
->enqueue(std::move(item
));
11176 std::lock_guard l
{sdata
->sdata_wait_lock
};
11178 sdata
->sdata_cond
.notify_all();
11179 } else if (sdata
->waiting_threads
) {
11180 sdata
->sdata_cond
.notify_one();
// Requeue an item at the FRONT of its shard's queue (used for retries that
// must not lose their place). If the slot already has items in to_process
// (a racing _process snapshot), swap with the back of that list so this
// older item is ordered before the newer one, then push the displaced item
// to the scheduler front. Always wakes exactly one worker.
11185 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem
&& item
)
11187 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11188 auto& sdata
= osd
->shards
[shard_index
];
11189 ceph_assert(sdata
);
11190 sdata
->shard_lock
.lock();
11191 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
11192 if (p
!= sdata
->pg_slots
.end() &&
11193 !p
->second
->to_process
.empty()) {
11194 // we may be racing with _process, which has dequeued a new item
11195 // from scheduler, put it on to_process, and is now busy taking the
11196 // pg lock. ensure this old requeued item is ordered before any
11197 // such newer item in to_process.
11198 p
->second
->to_process
.push_front(std::move(item
));
11199 item
= std::move(p
->second
->to_process
.back());
11200 p
->second
->to_process
.pop_back();
11201 dout(20) << __func__
11202 << " " << p
->second
->to_process
.front()
11203 << " shuffled w/ " << item
<< dendl
;
11205 dout(20) << __func__
<< " " << item
<< dendl
;
// `item` is now either the original (no race) or the displaced newer item.
11207 sdata
->scheduler
->enqueue_front(std::move(item
));
11208 sdata
->shard_lock
.unlock();
11209 std::lock_guard l
{sdata
->sdata_wait_lock
};
11210 sdata
->sdata_cond
.notify_one();
// Admin-socket helper namespace: `heap` dispatches tcmalloc heap-profiler
// commands. Rejects with -EOPNOTSUPP when not built/running with tcmalloc,
// extracts "heapcmd" (and optional "value") from the cmdmap, and hands the
// resulting argv to ceph_heap_profiler_handle_command.
// NOTE(review): the trailing signature parameter(s) (after `Formatter& f,`),
// the `cmd`/`val` declarations, and the return statements after the error
// branches and at the end are elided by this extraction -- confirm upstream.
11213 namespace ceph::osd_cmds
{
11215 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
// Heap profiling is a tcmalloc feature; bail out cleanly otherwise.
11218 if (!ceph_using_tcmalloc()) {
11219 os
<< "could not issue heap profiler command -- not using tcmalloc!";
11220 return -EOPNOTSUPP
;
11224 if (!cmd_getval(cmdmap
, "heapcmd", cmd
)) {
11225 os
<< "unable to get value for command \"" << cmd
<< "\"";
// Split the command string into an argv-style vector.
11229 std::vector
<std::string
> cmd_vec
;
11230 get_str_vec(cmd
, cmd_vec
);
// Optional extra argument (e.g. a dump filename).
11233 if (cmd_getval(cmdmap
, "value", val
)) {
11234 cmd_vec
.push_back(val
);
11237 ceph_heap_profiler_handle_command(cmd_vec
, os
);
11242 } // namespace ceph::osd_cmds