// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/scoped_ptr.hpp>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrub_machine.h"
#include "osd/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif
#ifdef HAVE_JAEGER
#include "common/tracer.h"
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}
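
// Usage sketch for the compat sets above (illustrative only, not part of
// this file; the exact sign convention of CompatSet::compare is assumed):
// on startup the OSD validates the on-disk superblock against what this
// binary supports, roughly
//
//   CompatSet supported = OSD::get_osd_compat_set();
//   if (supported.compare(superblock.compat_features) == -1) {
//     // the store requires features this binary lacks; refuse to start
//   }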

OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge).  note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}
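
// Worked example for the traversal above (assuming the usual hash-split
// rule, where a child of seed ps under a pg_num increase old -> new is
// ps + k*old for k >= 1 while ps + k*old < new): growing a pool from
// pg_num 8 to 16 splits pg 3.4 into {3.4, 3.c} (0x4 + 8 = 0xc), so 3.c is
// recorded in *split_children with the epoch of the change.  Shrinking
// 16 -> 8 makes 3.c a merge source whose target is 3.4; both the target
// and the source(s) land in *merge_pgs so each can be primed before the
// merge epoch.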

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
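
// Quota arithmetic for the loop above, using the (assumed) defaults
// osd_agent_max_ops=4 and osd_agent_max_low_ops=2 purely for illustration:
// with agent_ops=1 in flight, a pg in high-speed flush mode
// (flush_mode_high_count > 0) gets agent_flush_quota = 4 - 1 = 3,
// otherwise the low-speed budget applies and agent_flush_quota = 2 - 1 = 1.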

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
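
// Worked example of the recalibration above (illustrative numbers only):
// with target_obj_sec=25, dur=1.0s and attempts=100,
// po = 25 * 1.0 * 1000 / 100 = 250 millis, i.e. promote roughly 25% of
// attempts.  If the measured rate ran hot -- say actual=500 against
// prob=250, so ratio=2.0 -- new_prob is halved to compensate before
// being averaged into prob and clamped to [min_prob, 1000].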

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
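
// Example with commonly used ratios (these are configurable; the values
// here are only illustrative): nearfull .85 <= backfillfull .90 <=
// full .95 <= failsafe .97, an ordering the std::max() clamping above
// guarantees.  A logical usage ratio of .92 then maps to BACKFILLFULL,
// while a physical ratio of .98 trips FAILSAFE regardless of the logical
// ratio.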

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing, fake the statfs values so it doesn't matter whether
  // all OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing we don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i : osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}
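
// Note on the pruning loop above: at most one stale entry is erased per
// call (the break keeps the range-for iteration valid).  set_osd_stat()
// runs often enough that, assuming the default osd_mon_heartbeat_stat_stale
// of one hour, cleanup amortizes across calls instead of scanning the
// whole map at once.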

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
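
// Example for the batching above: pg_temp_wanted = { 1.0 -> [0,1,2],
// 1.1 -> [2,3,4] (forced) } produces two MOSDPGTemp messages, ms[0] with
// forced=false carrying 1.0 and ms[1] with forced=true carrying 1.1, so
// the mon can tell forced mappings apart; both entries then move to
// pg_temp_pending via _sent_pg_temp().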

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

bool OSDService::can_inc_scrubs()
{
  bool can_inc = false;
  std::lock_guard l(sched_scrub_lock);

  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
             << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
             << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_local()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
    result = true;
    ++scrubs_local;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);
}

bool OSDService::inc_scrubs_remote()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
    result = true;
    ++scrubs_remote;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);
}

void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true  // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
                              [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      // account for the bytes before std::move() empties bl
      max_bytes -= bl.length();
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      max_bytes -= bl.length();
      m->maps[e] = std::move(bl);
    }
    max--;
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}
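
// Budget sketch for the builder above, assuming the defaults
// osd_map_message_max=40 and osd_map_message_max_bytes=10 MiB (values
// shown only for illustration): a peer 100 epochs behind gets at most 40
// incrementals per message, or fewer if the byte budget trips first, and
// is expected to ask again for the remainder.  Because the budget is
// checked only after an epoch is added, a single oversized map is still
// sent rather than producing an empty message.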

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}
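
// Dedup note: OSDMap::dedup() is assumed to share unchanged bulky members
// (e.g. the CRUSH map) between the two epochs, so caching
// osd_map_cache_size nearby maps costs far less memory than that many
// fully independent copies.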

OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
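
// Cache note: misses below map_cache.cached_key_lower_bound() are counted
// separately (l_osd_map_cache_miss_low, plus the _avg counter recording
// the distance) because they ask for maps older than anything the LRU
// still holds -- the slow path, a full read and decode from the store.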
1627
1628 // ops
1629
1630
1631 void OSDService::reply_op_error(OpRequestRef op, int err)
1632 {
1633 reply_op_error(op, err, eversion_t(), 0, {});
1634 }
1635
1636 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1637 version_t uv,
1638 vector<pg_log_op_return_item_t> op_returns)
1639 {
1640 auto m = op->get_req<MOSDOp>();
1641 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1642 int flags;
1643 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1644
1645 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1646 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1647 reply->set_reply_versions(v, uv);
1648 reply->set_op_returns(op_returns);
1649 m->get_connection()->send_message(reply);
1650 }
1651
1652 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1653 {
1654 if (!cct->_conf->osd_debug_misdirected_ops) {
1655 return;
1656 }
1657
1658 auto m = op->get_req<MOSDOp>();
1659 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1660
1661 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
1662
1663 if (pg->is_ec_pg()) {
1664 /**
1665 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1666 * can get this result:
1667 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1668 * [CRUSH_ITEM_NONE, 2, 3]/3
1669 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1670 * [3, 2, 3]/3
1671 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1672 * -- misdirected op
1673 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1674 * it and fulfils it
1675 *
1676 * We can't compute the op target based on the sending map epoch due to
1677 * splitting. The simplest thing is to detect such cases here and drop
1678 * them without an error (the client will resend anyway).
1679 */
1680 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
1681 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1682 if (!opmap) {
1683 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1684 << m->get_map_epoch() << ", dropping" << dendl;
1685 return;
1686 }
1687 pg_t _pgid = m->get_raw_pg();
1688 spg_t pgid;
1689 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1690 _pgid = opmap->raw_pg_to_pg(_pgid);
1691 if (opmap->get_primary_shard(_pgid, &pgid) &&
1692 pgid.shard != pg->pg_id.shard) {
1693 dout(7) << __func__ << ": " << *pg << " primary changed since "
1694 << m->get_map_epoch() << ", dropping" << dendl;
1695 return;
1696 }
1697 }
1698
1699 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1700 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1701 << " pg " << m->get_raw_pg()
1702 << " to osd." << whoami
1703 << " not " << pg->get_acting()
1704 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1705 }
1706
1707 void OSDService::enqueue_back(OpSchedulerItem&& qi)
1708 {
1709 osd->op_shardedwq.queue(std::move(qi));
1710 }
1711
1712 void OSDService::enqueue_front(OpSchedulerItem&& qi)
1713 {
1714 osd->op_shardedwq.queue_front(std::move(qi));
1715 }
1716
1717 void OSDService::queue_recovery_context(
1718 PG *pg,
1719 GenContext<ThreadPool::TPHandle&> *c)
1720 {
1721 epoch_t e = get_osdmap_epoch();
1722 enqueue_back(
1723 OpSchedulerItem(
1724 unique_ptr<OpSchedulerItem::OpQueueable>(
1725 new PGRecoveryContext(pg->get_pgid(), c, e)),
1726 cct->_conf->osd_recovery_cost,
1727 cct->_conf->osd_recovery_priority,
1728 ceph_clock_now(),
1729 0,
1730 e));
1731 }
1732
1733 void OSDService::queue_for_snap_trim(PG *pg)
1734 {
1735 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1736 enqueue_back(
1737 OpSchedulerItem(
1738 unique_ptr<OpSchedulerItem::OpQueueable>(
1739 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1740 cct->_conf->osd_snap_trim_cost,
1741 cct->_conf->osd_snap_trim_priority,
1742 ceph_clock_now(),
1743 0,
1744 pg->get_osdmap_epoch()));
1745 }
1746
1747 template <class MSG_TYPE>
1748 void OSDService::queue_scrub_event_msg(PG* pg,
1749 Scrub::scrub_prio_t with_priority,
1750 unsigned int qu_priority)
1751 {
1752 const auto epoch = pg->get_osdmap_epoch();
1753 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1754 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1755
1756 enqueue_back(OpSchedulerItem(
1757 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1758 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1759 }
1760
1761 template <class MSG_TYPE>
1762 void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority)
1763 {
1764 const auto epoch = pg->get_osdmap_epoch();
1765 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1766 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1767
1768 enqueue_back(OpSchedulerItem(
1769 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1770 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1771 }
1772
1773 void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1774 {
1775 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1776 }
1777
1778 void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1779 {
1780 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1781 }
1782
1783 void OSDService::queue_for_rep_scrub(PG* pg,
1784 Scrub::scrub_prio_t with_priority,
1785 unsigned int qu_priority)
1786 {
1787 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
1788 }
1789
1790 void OSDService::queue_for_rep_scrub_resched(PG* pg,
1791 Scrub::scrub_prio_t with_priority,
1792 unsigned int qu_priority)
1793 {
1794 // Resulting scrub event: 'SchedReplica'
1795 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
1796 }
1797
1798 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1799 {
1800 // Resulting scrub event: 'RemotesReserved'
1801 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1802 }
1803
1804 void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1805 {
1806 // Resulting scrub event: 'ReservationFailure'
1807 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1808 }
1809
1810 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1811 {
1812 // Resulting scrub event: 'InternalSchedScrub'
1813 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1814 }
1815
1816 void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1817 {
1818 // Resulting scrub event: 'ActivePushesUpd'
1819 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1820 }
1821
1822 void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1823 {
1824 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1825 }
1826
1827 void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1828 {
1829 // Resulting scrub event: 'Unblocked'
1830 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1831 }
1832
1833 void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1834 {
1835 // Resulting scrub event: 'DigestUpdate'
1836 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1837 }
1838
1839 void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1840 {
1841 // Resulting scrub event: 'GotReplicas'
1842 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1843 }
1844
1845 void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1846 {
1847 // Resulting scrub event: 'ReplicaPushesUpd'
1848 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1849 }
1850
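// Queue asynchronous deletion of a PG at the configured PG-delete cost and
// priority.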
1851 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1852 {
1853 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1854 enqueue_back(
1855 OpSchedulerItem(
1856 unique_ptr<OpSchedulerItem::OpQueueable>(
1857 new PGDelete(pgid, e)),
1858 cct->_conf->osd_pg_delete_cost,
1859 cct->_conf->osd_pg_delete_priority,
1860 ceph_clock_now(),
1861 0,
1862 e));
1863 }
1864
1865 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1866 {
1867 return osd->try_finish_pg_delete(pg, old_pg_num);
1868 }
1869
1870 // ---
1871
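// PG merge readiness bookkeeping: source and target PGs register (or
// retract) readiness under merge_lock, and every state change may trigger a
// MOSDPGReadyToMerge message to the monitors via _send_ready_to_merge().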
1872 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1873 {
1874 std::lock_guard l(merge_lock);
1875 dout(10) << __func__ << " " << pg->pg_id << dendl;
1876 ready_to_merge_source[pg->pg_id.pgid] = version;
1877 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1878 _send_ready_to_merge();
1879 }
1880
1881 void OSDService::set_ready_to_merge_target(PG *pg,
1882 eversion_t version,
1883 epoch_t last_epoch_started,
1884 epoch_t last_epoch_clean)
1885 {
1886 std::lock_guard l(merge_lock);
1887 dout(10) << __func__ << " " << pg->pg_id << dendl;
1888 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1889 make_tuple(version,
1890 last_epoch_started,
1891 last_epoch_clean)));
1892 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1893 _send_ready_to_merge();
1894 }
1895
1896 void OSDService::set_not_ready_to_merge_source(pg_t source)
1897 {
1898 std::lock_guard l(merge_lock);
1899 dout(10) << __func__ << " " << source << dendl;
1900 not_ready_to_merge_source.insert(source);
1901 assert(ready_to_merge_source.count(source) == 0);
1902 _send_ready_to_merge();
1903 }
1904
1905 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1906 {
1907 std::lock_guard l(merge_lock);
1908 dout(10) << __func__ << " " << target << " source " << source << dendl;
1909 not_ready_to_merge_target[target] = source;
1910 assert(ready_to_merge_target.count(target) == 0);
1911 _send_ready_to_merge();
1912 }
1913
1914 void OSDService::send_ready_to_merge()
1915 {
1916 std::lock_guard l(merge_lock);
1917 _send_ready_to_merge();
1918 }
1919
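// Expects merge_lock to be held. Sends at most one MOSDPGReadyToMerge per
// source pgid: a negative report for any source (or source of a target)
// that is not ready, and a positive one, carrying the source/target
// versions and epochs, once both halves of a merge have reported ready.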
1920 void OSDService::_send_ready_to_merge()
1921 {
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1928 << dendl;
1929 for (auto src : not_ready_to_merge_source) {
1930 if (sent_ready_to_merge_source.count(src) == 0) {
1931 monc->send_mon_message(new MOSDPGReadyToMerge(
1932 src,
1933 {}, {}, 0, 0,
1934 false,
1935 osdmap->get_epoch()));
1936 sent_ready_to_merge_source.insert(src);
1937 }
1938 }
1939 for (auto p : not_ready_to_merge_target) {
1940 if (sent_ready_to_merge_source.count(p.second) == 0) {
1941 monc->send_mon_message(new MOSDPGReadyToMerge(
1942 p.second,
1943 {}, {}, 0, 0,
1944 false,
1945 osdmap->get_epoch()));
1946 sent_ready_to_merge_source.insert(p.second);
1947 }
1948 }
1949 for (auto src : ready_to_merge_source) {
1950 if (not_ready_to_merge_source.count(src.first) ||
1951 not_ready_to_merge_target.count(src.first.get_parent())) {
1952 continue;
1953 }
1954 auto p = ready_to_merge_target.find(src.first.get_parent());
1955 if (p != ready_to_merge_target.end() &&
1956 sent_ready_to_merge_source.count(src.first) == 0) {
1957 monc->send_mon_message(new MOSDPGReadyToMerge(
1958 src.first, // source pgid
1959 src.second, // src version
1960 std::get<0>(p->second), // target version
1961 std::get<1>(p->second), // PG's last_epoch_started
1962 std::get<2>(p->second), // PG's last_epoch_clean
1963 true,
1964 osdmap->get_epoch()));
1965 sent_ready_to_merge_source.insert(src.first);
1966 }
1967 }
1968 }
1969
1970 void OSDService::clear_ready_to_merge(PG *pg)
1971 {
1972 std::lock_guard l(merge_lock);
1973 dout(10) << __func__ << " " << pg->pg_id << dendl;
1974 ready_to_merge_source.erase(pg->pg_id.pgid);
1975 ready_to_merge_target.erase(pg->pg_id.pgid);
1976 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1977 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1978 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1979 }
1980
1981 void OSDService::clear_sent_ready_to_merge()
1982 {
1983 std::lock_guard l(merge_lock);
1984 sent_ready_to_merge_source.clear();
1985 }
1986
1987 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1988 {
1989 std::lock_guard l(merge_lock);
1990 auto i = sent_ready_to_merge_source.begin();
1991 while (i != sent_ready_to_merge_source.end()) {
1992 if (!osdmap->pg_exists(*i)) {
1993 dout(10) << __func__ << " " << *i << dendl;
1994 i = sent_ready_to_merge_source.erase(i);
1995 } else {
1996 ++i;
1997 }
1998 }
1999 }
2000
2001 // ---
2002
2003 void OSDService::_queue_for_recovery(
2004 std::pair<epoch_t, PGRef> p,
2005 uint64_t reserved_pushes)
2006 {
2007 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
2008 enqueue_back(
2009 OpSchedulerItem(
2010 unique_ptr<OpSchedulerItem::OpQueueable>(
2011 new PGRecovery(
2012 p.second->get_pgid(), p.first, reserved_pushes)),
2013 cct->_conf->osd_recovery_cost,
2014 cct->_conf->osd_recovery_priority,
2015 ceph_clock_now(),
2016 0,
2017 p.first));
2018 }
2019
2020 // ====================================================================
2021 // OSD
2022
2023 #undef dout_prefix
2024 #define dout_prefix *_dout
2025
2026 // Commands shared between OSD's console and admin console:
2027 namespace ceph::osd_cmds {
2028
2029 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
2030
2031 } // namespace ceph::osd_cmds
2032
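// One-time store initialization: mkfs and mount the ObjectStore, create or
// validate the OSD superblock, then persist the identifying metadata via
// write_meta(). Consumes (deletes) 'store' on every path out.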
2033 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
2034 {
2035 int ret;
2036
2037 OSDSuperblock sb;
2038 bufferlist sbbl;
2039 ObjectStore::CollectionHandle ch;
2040
2041 // if we are fed a uuid for this osd, use it.
2042 store->set_fsid(cct->_conf->osd_uuid);
2043
2044 ret = store->mkfs();
2045 if (ret) {
2046 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2047 << cpp_strerror(ret) << dendl;
2048 goto free_store;
2049 }
2050
2051 store->set_cache_shards(1); // doesn't matter for mkfs!
2052
2053 ret = store->mount();
2054 if (ret) {
2055 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2056 << cpp_strerror(ret) << dendl;
2057 goto free_store;
2058 }
2059
2060 ch = store->open_collection(coll_t::meta());
2061 if (ch) {
2062 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2063 if (ret < 0) {
2064 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2065 goto free_store;
2066 }
2067 /* if we already have superblock, check content of superblock */
2068 dout(0) << " have superblock" << dendl;
2069 auto p = sbbl.cbegin();
2070 decode(sb, p);
2071 if (whoami != sb.whoami) {
2072 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2073 << dendl;
2074 ret = -EINVAL;
2075 goto umount_store;
2076 }
2077 if (fsid != sb.cluster_fsid) {
2078 derr << "provided cluster fsid " << fsid
2079 << " != superblock's " << sb.cluster_fsid << dendl;
2080 ret = -EINVAL;
2081 goto umount_store;
2082 }
2083 } else {
2084 // create superblock
2085 sb.cluster_fsid = fsid;
2086 sb.osd_fsid = store->get_fsid();
2087 sb.whoami = whoami;
2088 sb.compat_features = get_osd_initial_compat_set();
2089
2090 bufferlist bl;
2091 encode(sb, bl);
2092
2093 ObjectStore::CollectionHandle ch = store->create_new_collection(
2094 coll_t::meta());
2095 ObjectStore::Transaction t;
2096 t.create_collection(coll_t::meta(), 0);
2097 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2098 ret = store->queue_transaction(ch, std::move(t));
2099 if (ret) {
2100 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2101 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2102 goto umount_store;
2103 }
2104 }
2105
2106 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2107 if (ret) {
2108 derr << "OSD::mkfs: failed to write fsid file: error "
2109 << cpp_strerror(ret) << dendl;
2110 goto umount_store;
2111 }
2112
2113 umount_store:
2114 if (ch) {
2115 ch.reset();
2116 }
2117 store->umount();
2118 free_store:
2119 delete store;
2120 return ret;
2121 }
2122
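// Persist the small key/value items that identify this OSD (magic, whoami,
// ceph_fsid, optional key material, osdspec affinity). "ready" is written
// only after everything else succeeds, so its presence marks a complete set.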
2123 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2124 {
2125 char val[80];
2126 int r;
2127
2128 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2129 r = store->write_meta("magic", val);
2130 if (r < 0)
2131 return r;
2132
2133 snprintf(val, sizeof(val), "%d", whoami);
2134 r = store->write_meta("whoami", val);
2135 if (r < 0)
2136 return r;
2137
2138 cluster_fsid.print(val);
2139 r = store->write_meta("ceph_fsid", val);
2140 if (r < 0)
2141 return r;
2142
2143 string key = cct->_conf.get_val<string>("key");
2144 if (key.size()) {
2145 r = store->write_meta("osd_key", key);
2146 if (r < 0)
2147 return r;
2148 } else {
2149 string keyfile = cct->_conf.get_val<string>("keyfile");
2150 if (!keyfile.empty()) {
2151 bufferlist keybl;
2152 string err;
2153 r = keybl.read_file(keyfile.c_str(), &err);
2154 if (r < 0) {
2155 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2156 << err << ": " << cpp_strerror(r) << dendl;
2157 return r;
2158 }
2159 r = store->write_meta("osd_key", keybl.to_str());
2160 if (r < 0)
2161 return r;
2162 }
2163 }
2164 if (!osdspec_affinity.empty()) {
2165 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2166 if (r < 0)
2167 return r;
2168 }
2169
2170 r = store->write_meta("ready", "ready");
2171 if (r < 0)
2172 return r;
2173
2174 return 0;
2175 }
2176
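// Read back the metadata written by write_meta() without mounting the
// store; a missing "fsid" key yields a zeroed osd_fsid rather than an
// error.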
2177 int OSD::peek_meta(ObjectStore *store,
2178 std::string *magic,
2179 uuid_d *cluster_fsid,
2180 uuid_d *osd_fsid,
2181 int *whoami,
2182 ceph_release_t *require_osd_release)
2183 {
2184 string val;
2185
2186 int r = store->read_meta("magic", &val);
2187 if (r < 0)
2188 return r;
2189 *magic = val;
2190
2191 r = store->read_meta("whoami", &val);
2192 if (r < 0)
2193 return r;
2194 *whoami = atoi(val.c_str());
2195
2196 r = store->read_meta("ceph_fsid", &val);
2197 if (r < 0)
2198 return r;
2199 r = cluster_fsid->parse(val.c_str());
2200 if (!r)
2201 return -EINVAL;
2202
2203 r = store->read_meta("fsid", &val);
2204 if (r < 0) {
2205 *osd_fsid = uuid_d();
2206 } else {
2207 r = osd_fsid->parse(val.c_str());
2208 if (!r)
2209 return -EINVAL;
2210 }
2211
2212 r = store->read_meta("require_osd_release", &val);
2213 if (r >= 0) {
2214 *require_osd_release = ceph_release_from_name(val);
2215 }
2216
2217 return 0;
2218 }
2219
2220
2221 #undef dout_prefix
2222 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2223
2224 // cons/des
2225
2226 OSD::OSD(CephContext *cct_, ObjectStore *store_,
2227 int id,
2228 Messenger *internal_messenger,
2229 Messenger *external_messenger,
2230 Messenger *hb_client_front,
2231 Messenger *hb_client_back,
2232 Messenger *hb_front_serverm,
2233 Messenger *hb_back_serverm,
2234 Messenger *osdc_messenger,
2235 MonClient *mc,
2236 const std::string &dev, const std::string &jdev,
2237 ceph::async::io_context_pool& poolctx) :
2238 Dispatcher(cct_),
2239 tick_timer(cct, osd_lock),
2240 tick_timer_without_osd_lock(cct, tick_timer_lock),
2241 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
2242 cluster_messenger(internal_messenger),
2243 client_messenger(external_messenger),
2244 objecter_messenger(osdc_messenger),
2245 monc(mc),
2246 mgrc(cct_, client_messenger, &mc->monmap),
2247 logger(create_logger()),
2248 recoverystate_perf(create_recoverystate_perf()),
2249 store(store_),
2250 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2251 clog(log_client.create_channel()),
2252 whoami(id),
2253 dev_path(dev), journal_path(jdev),
2254 store_is_rotational(store->is_rotational()),
2255 trace_endpoint("0.0.0.0", 0, "osd"),
2256 asok_hook(NULL),
2257 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2258 "osd_pg_epoch_max_lag_factor")),
2259 osd_compat(get_osd_compat_set()),
2260 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
2261 get_num_op_threads()),
2262 heartbeat_stop(false),
2263 heartbeat_need_update(true),
2264 hb_front_client_messenger(hb_client_front),
2265 hb_back_client_messenger(hb_client_back),
2266 hb_front_server_messenger(hb_front_serverm),
2267 hb_back_server_messenger(hb_back_serverm),
2268 daily_loadavg(0.0),
2269 heartbeat_thread(this),
2270 heartbeat_dispatcher(this),
2271 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2272 cct->_conf->osd_num_op_tracker_shard),
2273 test_ops_hook(NULL),
2274 op_shardedwq(
2275 this,
2276 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2277 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
2278 &osd_op_tp),
2279 last_pg_create_epoch(0),
2280 boot_finisher(cct),
2281 up_thru_wanted(0),
2282 requested_full_first(0),
2283 requested_full_last(0),
2284 service(this, poolctx)
2285 {
2286
2287 if (!gss_ktfile_client.empty()) {
2288 // Assert we can export environment variable
2289 /*
2290 The default client keytab is used, if it is present and readable,
2291 to automatically obtain initial credentials for GSSAPI client
2292 applications. The principal name of the first entry in the client
2293 keytab is used by default when obtaining initial credentials.
2294 1. The KRB5_CLIENT_KTNAME environment variable.
2295 2. The default_client_keytab_name profile variable in [libdefaults].
2296 3. The hardcoded default, DEFCKTNAME.
2297 */
2298 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2299 gss_ktfile_client.c_str(), 1));
2300 ceph_assert(set_result == 0);
2301 }
2302
2303 monc->set_messenger(client_messenger);
2304 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2305 cct->_conf->osd_op_log_threshold);
2306 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2307 cct->_conf->osd_op_history_duration);
2308 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2309 cct->_conf->osd_op_history_slow_op_threshold);
2310 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
2311 #ifdef WITH_BLKIN
2312 std::stringstream ss;
2313 ss << "osd." << whoami;
2314 trace_endpoint.copy_name(ss.str());
2315 #endif
2316
2317 // initialize shards
2318 num_shards = get_num_op_shards();
2319 for (uint32_t i = 0; i < num_shards; i++) {
2320 OSDShard *one_shard = new OSDShard(
2321 i,
2322 cct,
2323 this);
2324 shards.push_back(one_shard);
2325 }
2326
2327 // override some config options if mclock is enabled on all the shards
2328 maybe_override_options_for_qos();
2329 }
2330
2331 OSD::~OSD()
2332 {
2333 while (!shards.empty()) {
2334 delete shards.back();
2335 shards.pop_back();
2336 }
2337 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2338 cct->get_perfcounters_collection()->remove(logger);
2339 delete recoverystate_perf;
2340 delete logger;
2341 delete store;
2342 }
2343
2344 double OSD::get_tick_interval() const
2345 {
2346 // vary +/- 5% to avoid scrub scheduling livelocks
2347 constexpr auto delta = 0.05;
2348 return (OSD_TICK_INTERVAL *
2349 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2350 }
2351
2352 void OSD::handle_signal(int signum)
2353 {
2354 ceph_assert(signum == SIGINT || signum == SIGTERM);
2355 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2356 shutdown();
2357 }
2358
2359 int OSD::pre_init()
2360 {
2361 std::lock_guard lock(osd_lock);
2362 if (is_stopping())
2363 return 0;
2364
2365 if (store->test_mount_in_use()) {
2366 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2367 << "currently in use. (Is ceph-osd already running?)" << dendl;
2368 return -EBUSY;
2369 }
2370
2371 cct->_conf.add_observer(this);
2372 return 0;
2373 }
2374
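// Best-effort NUMA pinning: if the storage device and both network
// interfaces agree on one node (or osd_numa_node is set explicitly), pin
// all OSD threads to that node's CPUs. Any failure merely disables the
// pinning; this function always returns 0.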
2375 int OSD::set_numa_affinity()
2376 {
2377 // storage numa node
2378 int store_node = -1;
2379 store->get_numa_node(&store_node, nullptr, nullptr);
2380 if (store_node >= 0) {
2381 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2382 }
2383
2384 // check network numa node(s)
2385 int front_node = -1, back_node = -1;
2386 string front_iface = pick_iface(
2387 cct,
2388 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2389 string back_iface = pick_iface(
2390 cct,
2391 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2392 int r = get_iface_numa_node(front_iface, &front_node);
2393 if (r >= 0 && front_node >= 0) {
2394 dout(1) << __func__ << " public network " << front_iface << " numa node "
2395 << front_node << dendl;
2396 r = get_iface_numa_node(back_iface, &back_node);
2397 if (r >= 0 && back_node >= 0) {
2398 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2399 << back_node << dendl;
2400 if (front_node == back_node &&
2401 front_node == store_node) {
2402 dout(1) << " objectstore and network numa nodes all match" << dendl;
2403 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2404 numa_node = front_node;
2405 }
2406 } else if (front_node != back_node) {
2407 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2408 << dendl;
2409 } else {
2410 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2411 << dendl;
2412 }
2413 } else if (back_node == -2) {
2414 dout(1) << __func__ << " cluster network " << back_iface
2415 << " ports numa nodes do not match" << dendl;
2416 } else {
2417 derr << __func__ << " unable to identify cluster interface '" << back_iface
2418 << "' numa node: " << cpp_strerror(r) << dendl;
2419 }
2420 } else if (front_node == -2) {
2421 dout(1) << __func__ << " public network " << front_iface
2422 << " ports numa nodes do not match" << dendl;
2423 } else {
2424 derr << __func__ << " unable to identify public interface '" << front_iface
2425 << "' numa node: " << cpp_strerror(r) << dendl;
2426 }
2427 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2428 // this takes precedence over the automagic logic above
2429 numa_node = node;
2430 }
2431 if (numa_node >= 0) {
2432 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2433 if (r < 0) {
2434 dout(1) << __func__ << " unable to determine numa node " << numa_node
2435 << " CPUs" << dendl;
2436 numa_node = -1;
2437 } else {
2438 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2439 << " cpus "
2440 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2441 << dendl;
2442 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
2443 if (r < 0) {
2444 r = -errno;
2445 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2446 << dendl;
2447 numa_node = -1;
2448 }
2449 }
2450 } else {
2451 dout(1) << __func__ << " not setting numa affinity" << dendl;
2452 }
2453 return 0;
2454 }
2455
2456 // asok
2457
2458 class OSDSocketHook : public AdminSocketHook {
2459 OSD *osd;
2460 public:
2461 explicit OSDSocketHook(OSD *o) : osd(o) {}
2462 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2463 Formatter *f,
2464 std::ostream& ss,
2465 bufferlist& out) override {
2466 ceph_abort("should use async hook");
2467 }
2468 void call_async(
2469 std::string_view prefix,
2470 const cmdmap_t& cmdmap,
2471 Formatter *f,
2472 const bufferlist& inbl,
2473 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2474 try {
2475 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2476 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2477 bufferlist empty;
2478 on_finish(-EINVAL, e.what(), empty);
2479 }
2480 }
2481 };
2482
2483 std::set<int64_t> OSD::get_mapped_pools()
2484 {
2485 std::set<int64_t> pools;
2486 std::vector<spg_t> pgids;
2487 _get_pgids(&pgids);
2488 for (const auto &pgid : pgids) {
2489 pools.insert(pgid.pool());
2490 }
2491 return pools;
2492 }
2493
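// Admin-socket command dispatcher. PG-scoped commands are forwarded to the
// owning primary PG; everything else is handled inline and answered through
// on_finish(ret, error string, output).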
2494 void OSD::asok_command(
2495 std::string_view prefix, const cmdmap_t& cmdmap,
2496 Formatter *f,
2497 const bufferlist& inbl,
2498 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2499 {
2500 int ret = 0;
2501 stringstream ss; // stderr error message stream
2502 bufferlist outbl; // if empty at end, we'll dump formatter as output
2503
2504 // --- PG commands are routed here to PG::do_command ---
2505 if (prefix == "pg" ||
2506 prefix == "query" ||
2507 prefix == "mark_unfound_lost" ||
2508 prefix == "list_unfound" ||
2509 prefix == "scrub" ||
2510 prefix == "deep_scrub"
2511 ) {
2512 string pgidstr;
2513 pg_t pgid;
2514 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2515 ss << "no pgid specified";
2516 ret = -EINVAL;
2517 goto out;
2518 }
2519 if (!pgid.parse(pgidstr.c_str())) {
2520 ss << "couldn't parse pgid '" << pgidstr << "'";
2521 ret = -EINVAL;
2522 goto out;
2523 }
2524 spg_t pcand;
2525 PGRef pg;
2526 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2527 (pg = _lookup_lock_pg(pcand))) {
2528 if (pg->is_primary()) {
2529 cmdmap_t new_cmdmap = cmdmap;
2530 try {
2531 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2532 pg->unlock();
2533 return; // the pg handler calls on_finish directly
2534 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2535 pg->unlock();
2536 ss << e.what();
2537 ret = -EINVAL;
2538 goto out;
2539 }
2540 } else {
2541 ss << "not primary for pgid " << pgid;
2542 // do not reply; they will get newer maps and realize they
2543 // need to resend.
2544 pg->unlock();
2545 ret = -EAGAIN;
2546 goto out;
2547 }
2548 } else {
2549 ss << "i don't have pgid " << pgid;
2550 ret = -ENOENT;
2551 }
2552 }
2553
2554 // --- OSD commands follow ---
2555
2556 else if (prefix == "status") {
2557 lock_guard l(osd_lock);
2558 f->open_object_section("status");
2559 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2560 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2561 f->dump_unsigned("whoami", superblock.whoami);
2562 f->dump_string("state", get_state_name(get_state()));
2563 f->dump_unsigned("oldest_map", superblock.oldest_map);
2564 f->dump_unsigned("newest_map", superblock.newest_map);
2565 f->dump_unsigned("num_pgs", num_pgs);
2566 f->close_section();
2567 } else if (prefix == "flush_journal") {
2568 store->flush_journal();
2569 } else if (prefix == "dump_ops_in_flight" ||
2570 prefix == "ops" ||
2571 prefix == "dump_blocked_ops" ||
2572 prefix == "dump_historic_ops" ||
2573 prefix == "dump_historic_ops_by_duration" ||
2574 prefix == "dump_historic_slow_ops") {
2575
2576 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2577 not even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2578 will start to track new ops received afterwards.";
2579
2580 set<string> filters;
2581 vector<string> filter_str;
2582 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2583 copy(filter_str.begin(), filter_str.end(),
2584 inserter(filters, filters.end()));
2585 }
2586
2587 if (prefix == "dump_ops_in_flight" ||
2588 prefix == "ops") {
2589 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2590 ss << error_str;
2591 ret = -EINVAL;
2592 goto out;
2593 }
2594 }
2595 if (prefix == "dump_blocked_ops") {
2596 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2597 ss << error_str;
2598 ret = -EINVAL;
2599 goto out;
2600 }
2601 }
2602 if (prefix == "dump_historic_ops") {
2603 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2604 ss << error_str;
2605 ret = -EINVAL;
2606 goto out;
2607 }
2608 }
2609 if (prefix == "dump_historic_ops_by_duration") {
2610 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2611 ss << error_str;
2612 ret = -EINVAL;
2613 goto out;
2614 }
2615 }
2616 if (prefix == "dump_historic_slow_ops") {
2617 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2618 ss << error_str;
2619 ret = -EINVAL;
2620 goto out;
2621 }
2622 }
2623 } else if (prefix == "dump_op_pq_state") {
2624 f->open_object_section("pq");
2625 op_shardedwq.dump(f);
2626 f->close_section();
2627 } else if (prefix == "dump_blocklist") {
2628 list<pair<entity_addr_t,utime_t> > bl;
2629 OSDMapRef curmap = service.get_osdmap();
2630
2631 f->open_array_section("blocklist");
2632 curmap->get_blocklist(&bl);
2633 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2634 it != bl.end(); ++it) {
2635 f->open_object_section("entry");
2636 f->open_object_section("entity_addr_t");
2637 it->first.dump(f);
2638 f->close_section(); //entity_addr_t
2639 it->second.localtime(f->dump_stream("expire_time"));
2640 f->close_section(); //entry
2641 }
2642 f->close_section(); //blocklist
2643 } else if (prefix == "dump_watchers") {
2644 list<obj_watch_item_t> watchers;
2645 // scan pg's
2646 vector<PGRef> pgs;
2647 _get_pgs(&pgs);
2648 for (auto& pg : pgs) {
2649 list<obj_watch_item_t> pg_watchers;
2650 pg->get_watchers(&pg_watchers);
2651 watchers.splice(watchers.end(), pg_watchers);
2652 }
2653
2654 f->open_array_section("watchers");
2655 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2656 it != watchers.end(); ++it) {
2657
2658 f->open_object_section("watch");
2659
2660 f->dump_string("namespace", it->obj.nspace);
2661 f->dump_string("object", it->obj.oid.name);
2662
2663 f->open_object_section("entity_name");
2664 it->wi.name.dump(f);
2665 f->close_section(); //entity_name_t
2666
2667 f->dump_unsigned("cookie", it->wi.cookie);
2668 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2669
2670 f->open_object_section("entity_addr_t");
2671 it->wi.addr.dump(f);
2672 f->close_section(); //entity_addr_t
2673
2674 f->close_section(); //watch
2675 }
2676
2677 f->close_section(); //watchers
2678 } else if (prefix == "dump_recovery_reservations") {
2679 f->open_object_section("reservations");
2680 f->open_object_section("local_reservations");
2681 service.local_reserver.dump(f);
2682 f->close_section();
2683 f->open_object_section("remote_reservations");
2684 service.remote_reserver.dump(f);
2685 f->close_section();
2686 f->close_section();
2687 } else if (prefix == "dump_scrub_reservations") {
2688 f->open_object_section("scrub_reservations");
2689 service.dump_scrub_reservations(f);
2690 f->close_section();
2691 } else if (prefix == "get_latest_osdmap") {
2692 get_latest_osdmap();
2693 } else if (prefix == "set_heap_property") {
2694 string property;
2695 int64_t value = 0;
2696 string error;
2697 bool success = false;
2698 if (!cmd_getval(cmdmap, "property", property)) {
2699 error = "unable to get property";
2700 success = false;
2701 } else if (!cmd_getval(cmdmap, "value", value)) {
2702 error = "unable to get value";
2703 success = false;
2704 } else if (value < 0) {
2705 error = "negative value not allowed";
2706 success = false;
2707 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2708 error = "invalid property";
2709 success = false;
2710 } else {
2711 success = true;
2712 }
2713 f->open_object_section("result");
2714 f->dump_string("error", error);
2715 f->dump_bool("success", success);
2716 f->close_section();
2717 } else if (prefix == "get_heap_property") {
2718 string property;
2719 size_t value = 0;
2720 string error;
2721 bool success = false;
2722 if (!cmd_getval(cmdmap, "property", property)) {
2723 error = "unable to get property";
2724 success = false;
2725 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2726 error = "invalid property";
2727 success = false;
2728 } else {
2729 success = true;
2730 }
2731 f->open_object_section("result");
2732 f->dump_string("error", error);
2733 f->dump_bool("success", success);
2734 f->dump_int("value", value);
2735 f->close_section();
2736 } else if (prefix == "dump_objectstore_kv_stats") {
2737 store->get_db_statistics(f);
2738 } else if (prefix == "dump_scrubs") {
2739 service.dumps_scrub(f);
2740 } else if (prefix == "calc_objectstore_db_histogram") {
2741 store->generate_db_histogram(f);
2742 } else if (prefix == "flush_store_cache") {
2743 store->flush_cache(&ss);
2744 } else if (prefix == "dump_pgstate_history") {
2745 f->open_object_section("pgstate_history");
2746 f->open_array_section("pgs");
2747 vector<PGRef> pgs;
2748 _get_pgs(&pgs);
2749 for (auto& pg : pgs) {
2750 f->open_object_section("pg");
2751 f->dump_stream("pg") << pg->pg_id;
2752 f->dump_string("currently", pg->get_current_state());
2753 pg->dump_pgstate_history(f);
2754 f->close_section();
2755 }
2756 f->close_section();
2757 f->close_section();
2758 } else if (prefix == "compact") {
2759 dout(1) << "triggering manual compaction" << dendl;
2760 auto start = ceph::coarse_mono_clock::now();
2761 store->compact();
2762 auto end = ceph::coarse_mono_clock::now();
2763 double duration = std::chrono::duration<double>(end-start).count();
2764 dout(1) << "finished manual compaction in "
2765 << duration
2766 << " seconds" << dendl;
2767 f->open_object_section("compact_result");
2768 f->dump_float("elapsed_time", duration);
2769 f->close_section();
2770 } else if (prefix == "get_mapped_pools") {
2771 f->open_array_section("mapped_pools");
2772 set<int64_t> poollist = get_mapped_pools();
2773 for (auto pool : poollist) {
2774 f->dump_int("pool_id", pool);
2775 }
2776 f->close_section();
2777 } else if (prefix == "smart") {
2778 string devid;
2779 cmd_getval(cmdmap, "devid", devid);
2780 ostringstream out;
2781 probe_smart(devid, out);
2782 outbl.append(out.str());
2783 } else if (prefix == "list_devices") {
2784 set<string> devnames;
2785 store->get_devices(&devnames);
2786 f->open_array_section("list_devices");
2787 for (auto dev : devnames) {
2788 if (dev.find("dm-") == 0) {
2789 continue;
2790 }
2791 string err;
2792 f->open_object_section("device");
2793 f->dump_string("device", "/dev/" + dev);
2794 f->dump_string("device_id", get_device_id(dev, &err));
2795 f->close_section();
2796 }
2797 f->close_section();
2798 } else if (prefix == "send_beacon") {
2799 lock_guard l(osd_lock);
2800 if (is_active()) {
2801 send_beacon(ceph::coarse_mono_clock::now());
2802 }
2803 }
2804
2805 else if (prefix == "cluster_log") {
2806 vector<string> msg;
2807 cmd_getval(cmdmap, "message", msg);
2808 if (msg.empty()) {
2809 ret = -EINVAL;
2810 ss << "ignoring empty log message";
2811 goto out;
2812 }
2813 string message = msg.front();
2814 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2815 message += " " + *a;
2816 string lvl;
2817 cmd_getval(cmdmap, "level", lvl);
2818 clog_type level = string_to_clog_type(lvl);
2819 if (level < 0) {
2820 ret = -EINVAL;
2821 ss << "unknown level '" << lvl << "'";
2822 goto out;
2823 }
2824 clog->do_log(level, message);
2825 }
2826
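// 'bench' writes 'count' bytes in 'bsize' blocks through the meta
// collection to estimate raw ObjectStore throughput (typically invoked as
// "ceph tell osd.<id> bench"). The checks below cap 'count' so that a slow
// device cannot keep the OSD busy long enough to trip its internal
// timeouts.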
2827 else if (prefix == "bench") {
2828 int64_t count;
2829 int64_t bsize;
2830 int64_t osize, onum;
2831 // default count 1G, size 4MB
2832 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2833 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2834 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2835 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2836
2837 uint32_t duration = cct->_conf->osd_bench_duration;
2838
2839 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2840 // let us limit the block size because the next checks rely on it
2841 // having a sane value. If we allow any block size to be set things
2842 // can still go sideways.
2843 ss << "block 'size' values are capped at "
2844 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2845 << " a higher value, please adjust 'osd_bench_max_block_size'";
2846 ret = -EINVAL;
2847 goto out;
2848 } else if (bsize < (int64_t) (1 << 20)) {
2849 // entering the realm of small block sizes.
2850 // limit the count to a sane value, assuming a configurable amount of
2851 // IOPS and duration, so that the OSD doesn't get hung up on this,
2852 // preventing timeouts from going off
2853 int64_t max_count =
2854 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2855 if (count > max_count) {
2856 ss << "'count' values greater than " << max_count
2857 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2858 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2859 << " for " << duration << " seconds,"
2860 << " can cause ill effects on osd. "
2861 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2862 << " value if you wish to use a higher 'count'.";
2863 ret = -EINVAL;
2864 goto out;
2865 }
2866 } else {
2867 // 1MB block sizes are big enough so that we get more stuff done.
2868 // However, to keep the osd from getting hung up on this and having
2869 // timers triggered, we are going to limit the count assuming
2870 // a configurable throughput and duration.
2871 // NOTE: max_count is the total amount of bytes that we believe we
2872 // will be able to write during 'duration' for the given
2873 // throughput. The block size hardly impacts this unless it's
2874 // way too big. Given we already check how big the block size
2875 // is, it's safe to assume everything will check out.
2876 int64_t max_count =
2877 cct->_conf->osd_bench_large_size_max_throughput * duration;
2878 if (count > max_count) {
2879 ss << "'count' values greater than " << max_count
2880 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2881 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2882 << " for " << duration << " seconds,"
2883 << " can cause ill effects on osd. "
2884 << " Please adjust 'osd_bench_large_size_max_throughput'"
2885 << " with a higher value if you wish to use a higher 'count'.";
2886 ret = -EINVAL;
2887 goto out;
2888 }
2889 }
2890
2891 if (osize && bsize > osize)
2892 bsize = osize;
2893
2894 dout(1) << " bench count " << count
2895 << " bsize " << byte_u_t(bsize) << dendl;
2896
2897 ObjectStore::Transaction cleanupt;
2898
2899 if (osize && onum) {
2900 bufferlist bl;
2901 bufferptr bp(osize);
2902 bp.zero();
2903 bl.push_back(std::move(bp));
2904 bl.rebuild_page_aligned();
2905 for (int i=0; i<onum; ++i) {
2906 char nm[30];
2907 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2908 object_t oid(nm);
2909 hobject_t soid(sobject_t(oid, 0));
2910 ObjectStore::Transaction t;
2911 t.write(coll_t::meta(), ghobject_t(soid), 0, osize, bl);
2912 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2913 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2914 }
2915 }
2916
2917 bufferlist bl;
2918 bufferptr bp(bsize);
2919 bp.zero();
2920 bl.push_back(std::move(bp));
2921 bl.rebuild_page_aligned();
2922
2923 {
2924 C_SaferCond waiter;
2925 if (!service.meta_ch->flush_commit(&waiter)) {
2926 waiter.wait();
2927 }
2928 }
2929
2930 utime_t start = ceph_clock_now();
2931 for (int64_t pos = 0; pos < count; pos += bsize) {
2932 char nm[30];
2933 unsigned offset = 0;
2934 if (onum && osize) {
2935 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2936 offset = rand() % (osize / bsize) * bsize;
2937 } else {
2938 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2939 }
2940 object_t oid(nm);
2941 hobject_t soid(sobject_t(oid, 0));
2942 ObjectStore::Transaction t;
2943 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2944 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2945 if (!onum || !osize)
2946 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2947 }
2948
2949 {
2950 C_SaferCond waiter;
2951 if (!service.meta_ch->flush_commit(&waiter)) {
2952 waiter.wait();
2953 }
2954 }
2955 utime_t end = ceph_clock_now();
2956
2957 // clean up
2958 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2959 {
2960 C_SaferCond waiter;
2961 if (!service.meta_ch->flush_commit(&waiter)) {
2962 waiter.wait();
2963 }
2964 }
2965
2966 double elapsed = end - start;
2967 double rate = count / elapsed;
2968 double iops = rate / bsize;
2969 f->open_object_section("osd_bench_results");
2970 f->dump_int("bytes_written", count);
2971 f->dump_int("blocksize", bsize);
2972 f->dump_float("elapsed_sec", elapsed);
2973 f->dump_float("bytes_per_sec", rate);
2974 f->dump_float("iops", iops);
2975 f->close_section();
2976 }
2977
2978 else if (prefix == "flush_pg_stats") {
2979 mgrc.send_pgstats();
2980 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2981 }
2982
2983 else if (prefix == "heap") {
2984 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2985 }
2986
2987 else if (prefix == "debug dump_missing") {
2988 f->open_array_section("pgs");
2989 vector<PGRef> pgs;
2990 _get_pgs(&pgs);
2991 for (auto& pg : pgs) {
2992 string s = stringify(pg->pg_id);
2993 f->open_array_section(s.c_str());
2994 pg->lock();
2995 pg->dump_missing(f);
2996 pg->unlock();
2997 f->close_section();
2998 }
2999 f->close_section();
3000 }
3001
3002 else if (prefix == "debug kick_recovery_wq") {
3003 int64_t delay;
3004 cmd_getval(cmdmap, "delay", delay);
3005 ostringstream oss;
3006 oss << delay;
3007 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
3008 if (ret != 0) {
3009 ss << "kick_recovery_wq: error setting "
3010 << "osd_recovery_delay_start to '" << delay << "': error "
3011 << ret;
3012 goto out;
3013 }
3014 cct->_conf.apply_changes(nullptr);
3015 ss << "kicking recovery queue. set osd_recovery_delay_start "
3016 << "to " << cct->_conf->osd_recovery_delay_start;
3017 }
3018
3019 else if (prefix == "cpu_profiler") {
3020 ostringstream ds;
3021 string arg;
3022 cmd_getval(cmdmap, "arg", arg);
3023 vector<string> argvec;
3024 get_str_vec(arg, argvec);
3025 cpu_profiler_handle_command(argvec, ds);
3026 outbl.append(ds.str());
3027 }
3028
3029 else if (prefix == "dump_pg_recovery_stats") {
3030 lock_guard l(osd_lock);
3031 pg_recovery_stats.dump_formatted(f);
3032 }
3033
3034 else if (prefix == "reset_pg_recovery_stats") {
3035 lock_guard l(osd_lock);
3036 pg_recovery_stats.reset();
3037 }
3038
3039 else if (prefix == "perf histogram dump") {
3040 std::string logger;
3041 std::string counter;
3042 cmd_getval(cmdmap, "logger", logger);
3043 cmd_getval(cmdmap, "counter", counter);
3044 cct->get_perfcounters_collection()->dump_formatted_histograms(
3045 f, false, logger, counter);
3046 }
3047
3048 else if (prefix == "cache drop") {
3049 lock_guard l(osd_lock);
3050 dout(20) << "clearing all caches" << dendl;
3051 // Clear the objectstore's cache - onode and buffer for Bluestore,
3052 // system's pagecache for Filestore
3053 ret = store->flush_cache(&ss);
3054 if (ret < 0) {
3055 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
3056 goto out;
3057 }
3058 // Clear the objectcontext cache (per PG)
3059 vector<PGRef> pgs;
3060 _get_pgs(&pgs);
3061 for (auto& pg: pgs) {
3062 pg->clear_cache();
3063 }
3064 }
3065
3066 else if (prefix == "cache status") {
3067 lock_guard l(osd_lock);
3068 int obj_ctx_count = 0;
3069 vector<PGRef> pgs;
3070 _get_pgs(&pgs);
3071 for (auto& pg: pgs) {
3072 obj_ctx_count += pg->get_cache_obj_count();
3073 }
3074 f->open_object_section("cache_status");
3075 f->dump_int("object_ctx", obj_ctx_count);
3076 store->dump_cache_stats(f);
3077 f->close_section();
3078 }
3079
3080 else if (prefix == "scrub_purged_snaps") {
3081 lock_guard l(osd_lock);
3082 scrub_purged_snaps();
3083 }
3084
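// Dump heartbeat ping times at or above a threshold. The threshold comes
// from the command argument (milliseconds) or, by default, is derived from
// mon_warn_on_slow_ping_time (falling back to osd_heartbeat_grace scaled by
// mon_warn_on_slow_ping_ratio); entries are emitted worst-first for both
// the back and front interfaces.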
3085 else if (prefix == "dump_osd_network") {
3086 lock_guard l(osd_lock);
3087 int64_t value = 0;
3088 if (!(cmd_getval(cmdmap, "value", value))) {
3089 // Convert milliseconds to microseconds
3090 value = static_cast<double>(g_conf().get_val<double>(
3091 "mon_warn_on_slow_ping_time")) * 1000;
3092 if (value == 0) {
3093 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3094 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3095 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3096 }
3097 } else {
3098 // Convert user input to microseconds
3099 value *= 1000;
3100 }
3101 if (value < 0) value = 0;
3102
3103 struct osd_ping_time_t {
3104 uint32_t pingtime;
3105 int to;
3106 bool back;
3107 std::array<uint32_t,3> times;
3108 std::array<uint32_t,3> min;
3109 std::array<uint32_t,3> max;
3110 uint32_t last;
3111 uint32_t last_update;
3112
3113 bool operator<(const osd_ping_time_t& rhs) const {
3114 if (pingtime < rhs.pingtime)
3115 return true;
3116 if (pingtime > rhs.pingtime)
3117 return false;
3118 if (to < rhs.to)
3119 return true;
3120 if (to > rhs.to)
3121 return false;
3122 return back;
3123 }
3124 };
3125
3126 set<osd_ping_time_t> sorted;
3127 // Get pingtimes under lock and not on the stack
3128 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3129 service.get_hb_pingtime(pingtimes);
3130 for (auto j : *pingtimes) {
3131 if (j.second.last_update == 0)
3132 continue;
3133 osd_ping_time_t item;
3134 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3135 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3136 if (item.pingtime >= value) {
3137 item.to = j.first;
3138 item.times[0] = j.second.back_pingtime[0];
3139 item.times[1] = j.second.back_pingtime[1];
3140 item.times[2] = j.second.back_pingtime[2];
3141 item.min[0] = j.second.back_min[0];
3142 item.min[1] = j.second.back_min[1];
3143 item.min[2] = j.second.back_min[2];
3144 item.max[0] = j.second.back_max[0];
3145 item.max[1] = j.second.back_max[1];
3146 item.max[2] = j.second.back_max[2];
3147 item.last = j.second.back_last;
3148 item.back = true;
3149 item.last_update = j.second.last_update;
3150 sorted.emplace(item);
3151 }
3152 if (j.second.front_last == 0)
3153 continue;
3154 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3155 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3156 if (item.pingtime >= value) {
3157 item.to = j.first;
3158 item.times[0] = j.second.front_pingtime[0];
3159 item.times[1] = j.second.front_pingtime[1];
3160 item.times[2] = j.second.front_pingtime[2];
3161 item.min[0] = j.second.front_min[0];
3162 item.min[1] = j.second.front_min[1];
3163 item.min[2] = j.second.front_min[2];
3164 item.max[0] = j.second.front_max[0];
3165 item.max[1] = j.second.front_max[1];
3166 item.max[2] = j.second.front_max[2];
3167 item.last = j.second.front_last;
3168 item.last_update = j.second.last_update;
3169 item.back = false;
3170 sorted.emplace(item);
3171 }
3172 }
3173 delete pingtimes;
3174 //
3175 // Network ping times (1min 5min 15min)
3176 f->open_object_section("network_ping_times");
3177 f->dump_int("threshold", value / 1000);
3178 f->open_array_section("entries");
3179 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3180 ceph_assert(sitem.pingtime >= value);
3181 f->open_object_section("entry");
3182
3183 const time_t lu(sitem.last_update);
3184 char buffer[26];
3185 string lustr(ctime_r(&lu, buffer));
3186 lustr.pop_back(); // Remove trailing \n
3187 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3188 f->dump_string("last update", lustr);
3189 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3190 f->dump_int("from osd", whoami);
3191 f->dump_int("to osd", sitem.to);
3192 f->dump_string("interface", (sitem.back ? "back" : "front"));
3193 f->open_object_section("average");
3194 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3195 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3196 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3197 f->close_section(); // average
3198 f->open_object_section("min");
3199 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3200 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3201 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3202 f->close_section(); // min
3203 f->open_object_section("max");
3204 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3205 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3206 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3207 f->close_section(); // max
3208 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3209 f->close_section(); // entry
3210 }
3211 f->close_section(); // entries
3212 f->close_section(); // network_ping_times
3213 } else {
3214 ceph_abort_msg("broken asok registration");
3215 }
3216
3217 out:
3218 on_finish(ret, ss.str(), outbl);
3219 }
3220
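// Synchronous admin-socket hook for debug/test operations; it forwards the
// command to test_ops() and maps bad_cmd_get to -EINVAL.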
3221 class TestOpsSocketHook : public AdminSocketHook {
3222 OSDService *service;
3223 ObjectStore *store;
3224 public:
3225 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3226 int call(std::string_view command, const cmdmap_t& cmdmap,
3227 Formatter *f,
3228 std::ostream& errss,
3229 bufferlist& out) override {
3230 int r = 0;
3231 stringstream outss;
3232 try {
3233 test_ops(service, store, command, cmdmap, outss);
3234 out.append(outss);
3235 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3236 errss << e.what();
3237 r = -EINVAL;
3238 }
3239 return r;
3240 }
3241 void test_ops(OSDService *service, ObjectStore *store,
3242 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3243
3244 };
3245
3246 class OSD::C_Tick : public Context {
3247 OSD *osd;
3248 public:
3249 explicit C_Tick(OSD *o) : osd(o) {}
3250 void finish(int r) override {
3251 osd->tick();
3252 }
3253 };
3254
3255 class OSD::C_Tick_WithoutOSDLock : public Context {
3256 OSD *osd;
3257 public:
3258 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3259 void finish(int r) override {
3260 osd->tick_without_osd_lock();
3261 }
3262 };
3263
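// Mount or unmount a FUSE view of the ObjectStore under <osd_data>/fuse,
// driven by the osd_objectstore_fuse option ('stop' forces teardown). The
// body is a no-op when built without libfuse.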
3264 int OSD::enable_disable_fuse(bool stop)
3265 {
3266 #ifdef HAVE_LIBFUSE
3267 int r;
3268 string mntpath = cct->_conf->osd_data + "/fuse";
3269 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3270 dout(1) << __func__ << " disabling" << dendl;
3271 fuse_store->stop();
3272 delete fuse_store;
3273 fuse_store = NULL;
3274 r = ::rmdir(mntpath.c_str());
3275 if (r < 0) {
3276 r = -errno;
3277 derr << __func__ << " failed to rmdir " << mntpath << ": "
3278 << cpp_strerror(r) << dendl;
3279 return r;
3280 }
3281 return 0;
3282 }
3283 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3284 dout(1) << __func__ << " enabling" << dendl;
3285 r = ::mkdir(mntpath.c_str(), 0700);
3286 if (r < 0)
3287 r = -errno;
3288 if (r < 0 && r != -EEXIST) {
3289 derr << __func__ << " unable to create " << mntpath << ": "
3290 << cpp_strerror(r) << dendl;
3291 return r;
3292 }
3293 fuse_store = new FuseStore(store, mntpath);
3294 r = fuse_store->start();
3295 if (r < 0) {
3296 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3297 delete fuse_store;
3298 fuse_store = NULL;
3299 return r;
3300 }
3301 }
3302 #endif // HAVE_LIBFUSE
3303 return 0;
3304 }
3305
3306 size_t OSD::get_num_cache_shards()
3307 {
3308 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3309 }
3310
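// Shard and thread sizing: an explicit osd_op_num_* setting wins;
// otherwise the hdd or ssd variant is chosen by whether the store is
// rotational.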
3311 int OSD::get_num_op_shards()
3312 {
3313 if (cct->_conf->osd_op_num_shards)
3314 return cct->_conf->osd_op_num_shards;
3315 if (store_is_rotational)
3316 return cct->_conf->osd_op_num_shards_hdd;
3317 else
3318 return cct->_conf->osd_op_num_shards_ssd;
3319 }
3320
3321 int OSD::get_num_op_threads()
3322 {
3323 if (cct->_conf->osd_op_num_threads_per_shard)
3324 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3325 if (store_is_rotational)
3326 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3327 else
3328 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3329 }
3330
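// The *_sleep throttle getters below choose between the ssd, hybrid, and
// hdd tunables based on whether the main store and the journal are
// rotational.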
3331 float OSD::get_osd_recovery_sleep()
3332 {
3333 if (cct->_conf->osd_recovery_sleep)
3334 return cct->_conf->osd_recovery_sleep;
3335 if (!store_is_rotational && !journal_is_rotational)
3336 return cct->_conf->osd_recovery_sleep_ssd;
3337 else if (store_is_rotational && !journal_is_rotational)
3338 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3339 else
3340 return cct->_conf->osd_recovery_sleep_hdd;
3341 }
3342
3343 float OSD::get_osd_delete_sleep()
3344 {
3345 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3346 if (osd_delete_sleep > 0)
3347 return osd_delete_sleep;
3348 if (!store_is_rotational && !journal_is_rotational)
3349 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3350 if (store_is_rotational && !journal_is_rotational)
3351 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3352 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3353 }
3354
3355 int OSD::get_recovery_max_active()
3356 {
3357 if (cct->_conf->osd_recovery_max_active)
3358 return cct->_conf->osd_recovery_max_active;
3359 if (store_is_rotational)
3360 return cct->_conf->osd_recovery_max_active_hdd;
3361 else
3362 return cct->_conf->osd_recovery_max_active_ssd;
3363 }
3364
3365 float OSD::get_osd_snap_trim_sleep()
3366 {
3367 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3368 if (osd_snap_trim_sleep > 0)
3369 return osd_snap_trim_sleep;
3370 if (!store_is_rotational && !journal_is_rotational)
3371 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3372 if (store_is_rotational && !journal_is_rotational)
3373 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3374 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3375 }
3376
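// Bring the OSD up to the point of booting: mount the store, validate the
// superblock and compat features, load PGs and the current OSDMap, and
// initialize the messengers, mon/mgr clients, and service state.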
3377 int OSD::init()
3378 {
3379 OSDMapRef osdmap;
3380 CompatSet initial, diff;
3381 std::lock_guard lock(osd_lock);
3382 if (is_stopping())
3383 return 0;
3384
3385 tick_timer.init();
3386 tick_timer_without_osd_lock.init();
3387 service.recovery_request_timer.init();
3388 service.sleep_timer.init();
3389
3390 boot_finisher.start();
3391
3392 {
3393 string val;
3394 store->read_meta("require_osd_release", &val);
3395 last_require_osd_release = ceph_release_from_name(val);
3396 }
3397
3398 // mount.
3399 dout(2) << "init " << dev_path
3400 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3401 << dendl;
3402 dout(2) << "journal " << journal_path << dendl;
3403 ceph_assert(store); // call pre_init() first!
3404
3405 store->set_cache_shards(get_num_cache_shards());
3406
3407 int r = store->mount();
3408 if (r < 0) {
3409 derr << "OSD:init: unable to mount object store" << dendl;
3410 return r;
3411 }
3412 journal_is_rotational = store->is_journal_rotational();
3413 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3414 << dendl;
3415
3416 enable_disable_fuse(false);
3417
3418 dout(2) << "boot" << dendl;
3419
3420 service.meta_ch = store->open_collection(coll_t::meta());
3421
3422 // initialize the daily loadavg with current 15min loadavg
3423 double loadavgs[3];
3424 if (getloadavg(loadavgs, 3) == 3) {
3425 daily_loadavg = loadavgs[2];
3426 } else {
3427 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3428 daily_loadavg = 1.0;
3429 }
3430
3431 int rotating_auth_attempts = 0;
3432 auto rotating_auth_timeout =
3433 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3434
3435 // sanity check long object name handling
3436 {
3437 hobject_t l;
3438 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3439 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3440 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3441 r = store->validate_hobject_key(l);
3442 if (r < 0) {
3443 derr << "backend (" << store->get_type() << ") is unable to support max "
3444 << "object name[space] len" << dendl;
3445 derr << " osd max object name len = "
3446 << cct->_conf->osd_max_object_name_len << dendl;
3447 derr << " osd max object namespace len = "
3448 << cct->_conf->osd_max_object_namespace_len << dendl;
3449 derr << cpp_strerror(r) << dendl;
3450 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3451 goto out;
3452 }
3453 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3454 << dendl;
3455 } else {
3456 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3457 }
3458 }
3459
3460 // read superblock
3461 r = read_superblock();
3462 if (r < 0) {
3463 derr << "OSD::init() : unable to read osd superblock" << dendl;
3464 r = -EINVAL;
3465 goto out;
3466 }
3467
3468 if (osd_compat.compare(superblock.compat_features) < 0) {
3469 derr << "The disk uses features unsupported by the executable." << dendl;
3470 derr << " ondisk features " << superblock.compat_features << dendl;
3471 derr << " daemon features " << osd_compat << dendl;
3472
3473 if (osd_compat.writeable(superblock.compat_features)) {
3474 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3475 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3476 r = -EOPNOTSUPP;
3477 goto out;
3478 }
3479 else {
3480 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3481 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3482 r = -EOPNOTSUPP;
3483 goto out;
3484 }
3485 }
3486
3487 assert_warn(whoami == superblock.whoami);
3488 if (whoami != superblock.whoami) {
3489 derr << "OSD::init: superblock says osd"
3490 << superblock.whoami << " but I am osd." << whoami << dendl;
3491 r = -EINVAL;
3492 goto out;
3493 }
3494
3495 startup_time = ceph::mono_clock::now();
3496
3497 // load up "current" osdmap
3498 assert_warn(!get_osdmap());
3499 if (get_osdmap()) {
3500 derr << "OSD::init: unable to read current osdmap" << dendl;
3501 r = -EINVAL;
3502 goto out;
3503 }
3504 osdmap = get_map(superblock.current_epoch);
3505 set_osdmap(osdmap);
3506
3507 // make sure we don't have legacy pgs deleting
3508 {
3509 vector<coll_t> ls;
3510 int r = store->list_collections(ls);
3511 ceph_assert(r >= 0);
3512 for (auto c : ls) {
3513 spg_t pgid;
3514 if (c.is_pg(&pgid) &&
3515 !osdmap->have_pg_pool(pgid.pool())) {
3516 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3517 if (!store->exists(service.meta_ch, oid)) {
3518 derr << __func__ << " missing pg_pool_t for deleted pool "
3519 << pgid.pool() << " for pg " << pgid
3520 << "; please downgrade to luminous and allow "
3521 << "pg deletion to complete before upgrading" << dendl;
3522 ceph_abort();
3523 }
3524 }
3525 }
3526 }
3527
3528 initial = get_osd_initial_compat_set();
3529 diff = superblock.compat_features.unsupported(initial);
3530 if (superblock.compat_features.merge(initial)) {
3531 // Are we adding SNAPMAPPER2?
3532 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3533 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3534 << dendl;
3535 auto ch = service.meta_ch;
3536 auto hoid = make_snapmapper_oid();
3537 unsigned max = cct->_conf->osd_target_transaction_size;
3538 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3539 if (r < 0)
3540 goto out;
3541 }
3542 // We need to persist the new compat_set before we
3543 // do anything else
3544 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3545 ObjectStore::Transaction t;
3546 write_superblock(t);
3547 r = store->queue_transaction(service.meta_ch, std::move(t));
3548 if (r < 0)
3549 goto out;
3550 }
3551
3552 // make sure snap mapper object exists
3553 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3554 dout(10) << "init creating/touching snapmapper object" << dendl;
3555 ObjectStore::Transaction t;
3556 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3557 r = store->queue_transaction(service.meta_ch, std::move(t));
3558 if (r < 0)
3559 goto out;
3560 }
3561 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3562 dout(10) << "init creating/touching purged_snaps object" << dendl;
3563 ObjectStore::Transaction t;
3564 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3565 r = store->queue_transaction(service.meta_ch, std::move(t));
3566 if (r < 0)
3567 goto out;
3568 }
3569
3570 if (cct->_conf->osd_open_classes_on_start) {
3571 int r = ClassHandler::get_instance().open_all_classes();
3572 if (r)
3573 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3574 }
3575
3576 check_osdmap_features();
3577
3578 {
3579 epoch_t bind_epoch = osdmap->get_epoch();
3580 service.set_epochs(NULL, NULL, &bind_epoch);
3581 }
3582
3583 clear_temp_objects();
3584
3585 // initialize osdmap references in sharded wq
3586 for (auto& shard : shards) {
3587 std::lock_guard l(shard->osdmap_lock);
3588 shard->shard_osdmap = osdmap;
3589 }
3590
3591 // load up pgs (as they previously existed)
3592 load_pgs();
3593
3594 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3595
3596 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3597 dout(2) << "compacting object store's omap" << dendl;
3598 store->compact();
3599 }
3600
3601 // prime osd stats
3602 {
3603 struct store_statfs_t stbuf;
3604 osd_alert_list_t alerts;
3605 int r = store->statfs(&stbuf, &alerts);
3606 ceph_assert(r == 0);
3607 service.set_statfs(stbuf, alerts);
3608 }
3609
3610 // client_messenger's auth_client will be set up by monc->init() later.
3611 for (auto m : { cluster_messenger,
3612 objecter_messenger,
3613 hb_front_client_messenger,
3614 hb_back_client_messenger,
3615 hb_front_server_messenger,
3616 hb_back_server_messenger } ) {
3617 m->set_auth_client(monc);
3618 }
3619 for (auto m : { client_messenger,
3620 cluster_messenger,
3621 hb_front_server_messenger,
3622 hb_back_server_messenger }) {
3623 m->set_auth_server(monc);
3624 }
3625 monc->set_handle_authentication_dispatcher(this);
3626
3627 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3628 | CEPH_ENTITY_TYPE_MGR);
3629 r = monc->init();
3630 if (r < 0)
3631 goto out;
3632
3633 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
3634 mgrc.set_perf_metric_query_cb(
3635 [this](const ConfigPayload &config_payload) {
3636 set_perf_queries(config_payload);
3637 },
3638 [this] {
3639 return get_perf_reports();
3640 });
3641 mgrc.init();
3642
3643 // tell monc about log_client so it will know about mon session resets
3644 monc->set_log_client(&log_client);
3645 update_log_config();
3646
3647 // i'm ready!
3648 client_messenger->add_dispatcher_tail(&mgrc);
3649 client_messenger->add_dispatcher_tail(this);
3650 cluster_messenger->add_dispatcher_head(this);
3651
3652 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3653 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3654 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3655 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3656
3657 objecter_messenger->add_dispatcher_head(service.objecter.get());
3658
3659 service.init();
3660 service.publish_map(osdmap);
3661 service.publish_superblock(superblock);
3662 service.max_oldest_map = superblock.oldest_map;
3663
3664 for (auto& shard : shards) {
3665 // put PGs in a temporary set because we may modify pg_slots
3666 // unordered_map below.
3667 set<PGRef> pgs;
3668 for (auto& i : shard->pg_slots) {
3669 PGRef pg = i.second->pg;
3670 if (!pg) {
3671 continue;
3672 }
3673 pgs.insert(pg);
3674 }
3675 for (auto pg : pgs) {
3676 std::scoped_lock l{*pg};
3677 set<pair<spg_t,epoch_t>> new_children;
3678 set<pair<spg_t,epoch_t>> merge_pgs;
3679 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3680 &new_children, &merge_pgs);
3681 if (!new_children.empty()) {
3682 for (auto shard : shards) {
3683 shard->prime_splits(osdmap, &new_children);
3684 }
3685 assert(new_children.empty());
3686 }
3687 if (!merge_pgs.empty()) {
3688 for (auto shard : shards) {
3689 shard->prime_merges(osdmap, &merge_pgs);
3690 }
3691 assert(merge_pgs.empty());
3692 }
3693 }
3694 }
3695
3696 osd_op_tp.start();
3697
3698 // start the heartbeat
3699 heartbeat_thread.create("osd_srv_heartbt");
3700
3701 // tick
3702 tick_timer.add_event_after(get_tick_interval(),
3703 new C_Tick(this));
3704 {
3705 std::lock_guard l(tick_timer_lock);
3706 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3707 new C_Tick_WithoutOSDLock(this));
3708 }
3709
3710 osd_lock.unlock();
3711
3712 r = monc->authenticate();
3713 if (r < 0) {
3714 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3715 << dendl;
3716 exit(1);
3717 }
3718
3719 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3720 derr << "unable to obtain rotating service keys; retrying" << dendl;
3721 ++rotating_auth_attempts;
3722 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3723 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3724 exit(1);
3725 }
3726 }
3727
3728 r = update_crush_device_class();
3729 if (r < 0) {
3730 derr << __func__ << " unable to update_crush_device_class: "
3731 << cpp_strerror(r) << dendl;
3732 exit(1);
3733 }
3734
3735 r = update_crush_location();
3736 if (r < 0) {
3737 derr << __func__ << " unable to update_crush_location: "
3738 << cpp_strerror(r) << dendl;
3739 exit(1);
3740 }
3741
3742 osd_lock.lock();
3743 if (is_stopping())
3744 return 0;
3745
3746 // start objecter *after* we have authenticated, so that we don't ignore
3747 // the OSDMaps it requests.
3748 service.final_init();
3749
3750 check_config();
3751
3752 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3753 consume_map();
3754
3755 dout(0) << "done with init, starting boot process" << dendl;
3756
3757 // subscribe to any pg creations
3758 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3759
3760 // MgrClient needs this (it doesn't have MonClient reference itself)
3761 monc->sub_want("mgrmap", 0, 0);
3762
3763 // we don't need to ask for an osdmap here; objecter will request one
3764 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3765
3766 monc->renew_subs();
3767
3768 start_boot();
3769
3770 return 0;
3771
3772 out:
3773 enable_disable_fuse(true);
3774 store->umount();
3775 delete store;
3776 store = NULL;
3777 return r;
3778 }
3779
3780 void OSD::final_init()
3781 {
3782 AdminSocket *admin_socket = cct->get_admin_socket();
3783 asok_hook = new OSDSocketHook(this);
3784 int r = admin_socket->register_command("status", asok_hook,
3785 "high-level status of OSD");
3786 ceph_assert(r == 0);
3787 r = admin_socket->register_command("flush_journal",
3788 asok_hook,
3789 "flush the journal to permanent store");
3790 ceph_assert(r == 0);
3791 r = admin_socket->register_command("dump_ops_in_flight " \
3792 "name=filterstr,type=CephString,n=N,req=false",
3793 asok_hook,
3794 "show the ops currently in flight");
3795 ceph_assert(r == 0);
3796 r = admin_socket->register_command("ops " \
3797 "name=filterstr,type=CephString,n=N,req=false",
3798 asok_hook,
3799 "show the ops currently in flight");
3800 ceph_assert(r == 0);
3801 r = admin_socket->register_command("dump_blocked_ops " \
3802 "name=filterstr,type=CephString,n=N,req=false",
3803 asok_hook,
3804 "show the blocked ops currently in flight");
3805 ceph_assert(r == 0);
3806 r = admin_socket->register_command("dump_historic_ops " \
3807 "name=filterstr,type=CephString,n=N,req=false",
3808 asok_hook,
3809 "show recent ops");
3810 ceph_assert(r == 0);
3811 r = admin_socket->register_command("dump_historic_slow_ops " \
3812 "name=filterstr,type=CephString,n=N,req=false",
3813 asok_hook,
3814 "show slowest recent ops");
3815 ceph_assert(r == 0);
3816 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3817 "name=filterstr,type=CephString,n=N,req=false",
3818 asok_hook,
3819 "show slowest recent ops, sorted by duration");
3820 ceph_assert(r == 0);
3821 r = admin_socket->register_command("dump_op_pq_state",
3822 asok_hook,
3823 "dump op priority queue state");
3824 ceph_assert(r == 0);
3825 r = admin_socket->register_command("dump_blocklist",
3826 asok_hook,
3827 "dump blocklisted clients and times");
3828 ceph_assert(r == 0);
3829 r = admin_socket->register_command("dump_watchers",
3830 asok_hook,
3831 "show clients which have active watches,"
3832 " and on which objects");
3833 ceph_assert(r == 0);
3834 r = admin_socket->register_command("dump_recovery_reservations",
3835 asok_hook,
3836 "show recovery reservations");
3837 ceph_assert(r == 0);
3838 r = admin_socket->register_command("dump_scrub_reservations",
3839 asok_hook,
3840 "show scrub reservations");
3841 ceph_assert(r == 0);
3842 r = admin_socket->register_command("get_latest_osdmap",
3843 asok_hook,
3844 "force osd to update the latest map from "
3845 "the mon");
3846 ceph_assert(r == 0);
3847
3848 r = admin_socket->register_command("set_heap_property " \
3849 "name=property,type=CephString " \
3850 "name=value,type=CephInt",
3851 asok_hook,
3852 "update malloc extension heap property");
3853 ceph_assert(r == 0);
3854
3855 r = admin_socket->register_command("get_heap_property " \
3856 "name=property,type=CephString",
3857 asok_hook,
3858 "get malloc extension heap property");
3859 ceph_assert(r == 0);
3860
3861 r = admin_socket->register_command("dump_objectstore_kv_stats",
3862 asok_hook,
3863 "print statistics of the kvdb used by bluestore");
3864 ceph_assert(r == 0);
3865
3866 r = admin_socket->register_command("dump_scrubs",
3867 asok_hook,
3868 "print scheduled scrubs");
3869 ceph_assert(r == 0);
3870
3871 r = admin_socket->register_command("calc_objectstore_db_histogram",
3872 asok_hook,
3873 "Generate key value histogram of the kvdb (rocksdb) used by bluestore");
3874 ceph_assert(r == 0);
3875
3876 r = admin_socket->register_command("flush_store_cache",
3877 asok_hook,
3878 "Flush bluestore internal cache");
3879 ceph_assert(r == 0);
3880 r = admin_socket->register_command("dump_pgstate_history",
3881 asok_hook,
3882 "show recent state history");
3883 ceph_assert(r == 0);
3884
3885 r = admin_socket->register_command("compact",
3886 asok_hook,
3887 "Compact object store's omap."
3888 " WARNING: Compaction probably slows your requests");
3889 ceph_assert(r == 0);
3890
3891 r = admin_socket->register_command("get_mapped_pools",
3892 asok_hook,
3893 "dump pools whose PG(s) are mapped to this OSD.");
3894
3895 ceph_assert(r == 0);
3896
3897 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3898 asok_hook,
3899 "probe OSD devices for SMART data.");
3900
3901 ceph_assert(r == 0);
3902
3903 r = admin_socket->register_command("list_devices",
3904 asok_hook,
3905 "list OSD devices.");
ceph_assert(r == 0);
3906 r = admin_socket->register_command("send_beacon",
3907 asok_hook,
3908 "send OSD beacon to mon immediately");
ceph_assert(r == 0);
3909
3910 r = admin_socket->register_command(
3911 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3912 "Dump osd heartbeat network ping times");
3913 ceph_assert(r == 0);
3914
3915 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3916 // Note: pools are CephString instead of CephPoolname because
3917 // these commands traditionally support both pool names and numbers
3918 r = admin_socket->register_command(
3919 "setomapval " \
3920 "name=pool,type=CephString " \
3921 "name=objname,type=CephObjectname " \
3922 "name=key,type=CephString "\
3923 "name=val,type=CephString",
3924 test_ops_hook,
3925 "set omap key");
3926 ceph_assert(r == 0);
3927 r = admin_socket->register_command(
3928 "rmomapkey " \
3929 "name=pool,type=CephString " \
3930 "name=objname,type=CephObjectname " \
3931 "name=key,type=CephString",
3932 test_ops_hook,
3933 "remove omap key");
3934 ceph_assert(r == 0);
3935 r = admin_socket->register_command(
3936 "setomapheader " \
3937 "name=pool,type=CephString " \
3938 "name=objname,type=CephObjectname " \
3939 "name=header,type=CephString",
3940 test_ops_hook,
3941 "set omap header");
3942 ceph_assert(r == 0);
3943
3944 r = admin_socket->register_command(
3945 "getomap " \
3946 "name=pool,type=CephString " \
3947 "name=objname,type=CephObjectname",
3948 test_ops_hook,
3949 "output entire object map");
3950 ceph_assert(r == 0);
3951
3952 r = admin_socket->register_command(
3953 "truncobj " \
3954 "name=pool,type=CephString " \
3955 "name=objname,type=CephObjectname " \
3956 "name=len,type=CephInt",
3957 test_ops_hook,
3958 "truncate object to length");
3959 ceph_assert(r == 0);
3960
3961 r = admin_socket->register_command(
3962 "injectdataerr " \
3963 "name=pool,type=CephString " \
3964 "name=objname,type=CephObjectname " \
3965 "name=shardid,type=CephInt,req=false,range=0|255",
3966 test_ops_hook,
3967 "inject data error to an object");
3968 ceph_assert(r == 0);
3969
3970 r = admin_socket->register_command(
3971 "injectmdataerr " \
3972 "name=pool,type=CephString " \
3973 "name=objname,type=CephObjectname " \
3974 "name=shardid,type=CephInt,req=false,range=0|255",
3975 test_ops_hook,
3976 "inject metadata error to an object");
3977 ceph_assert(r == 0);
3978 r = admin_socket->register_command(
3979 "set_recovery_delay " \
3980 "name=utime,type=CephInt,req=false",
3981 test_ops_hook,
3982 "Delay osd recovery by specified seconds");
3983 ceph_assert(r == 0);
3984 r = admin_socket->register_command(
3985 "injectfull " \
3986 "name=type,type=CephString,req=false " \
3987 "name=count,type=CephInt,req=false ",
3988 test_ops_hook,
3989 "Inject a full disk (optional count times)");
3990 ceph_assert(r == 0);
3991 r = admin_socket->register_command(
3992 "bench " \
3993 "name=count,type=CephInt,req=false " \
3994 "name=size,type=CephInt,req=false " \
3995 "name=object_size,type=CephInt,req=false " \
3996 "name=object_num,type=CephInt,req=false ",
3997 asok_hook,
3998 "OSD benchmark: write <count> <size>-byte objects (with <object_size> <object_num>), " \
3999 "(default count=1G, default size=4MB). Results in log.");
4000 ceph_assert(r == 0);
4001 r = admin_socket->register_command(
4002 "cluster_log " \
4003 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4004 "name=message,type=CephString,n=N",
4005 asok_hook,
4006 "log a message to the cluster log");
4007 ceph_assert(r == 0);
4008 r = admin_socket->register_command(
4009 "flush_pg_stats",
4010 asok_hook,
4011 "flush pg stats");
4012 ceph_assert(r == 0);
4013 r = admin_socket->register_command(
4014 "heap " \
4015 "name=heapcmd,type=CephChoices,strings=" \
4016 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4017 "name=value,type=CephString,req=false",
4018 asok_hook,
4019 "show heap usage info (available only if compiled with tcmalloc)");
4020 ceph_assert(r == 0);
4021 r = admin_socket->register_command(
4022 "debug dump_missing " \
4023 "name=filename,type=CephFilepath",
4024 asok_hook,
4025 "dump missing objects to a named file");
4026 ceph_assert(r == 0);
4027 r = admin_socket->register_command(
4028 "debug kick_recovery_wq " \
4029 "name=delay,type=CephInt,range=0",
4030 asok_hook,
4031 "set osd_recovery_delay_start to <val>");
4032 ceph_assert(r == 0);
4033 r = admin_socket->register_command(
4034 "cpu_profiler " \
4035 "name=arg,type=CephChoices,strings=status|flush",
4036 asok_hook,
4037 "run cpu profiling on daemon");
4038 ceph_assert(r == 0);
4039 r = admin_socket->register_command(
4040 "dump_pg_recovery_stats",
4041 asok_hook,
4042 "dump pg recovery statistics");
4043 ceph_assert(r == 0);
4044 r = admin_socket->register_command(
4045 "reset_pg_recovery_stats",
4046 asok_hook,
4047 "reset pg recovery statistics");
4048 ceph_assert(r == 0);
4049 r = admin_socket->register_command(
4050 "cache drop",
4051 asok_hook,
4052 "Drop all OSD caches");
4053 ceph_assert(r == 0);
4054 r = admin_socket->register_command(
4055 "cache status",
4056 asok_hook,
4057 "Get OSD caches statistics");
4058 ceph_assert(r == 0);
4059 r = admin_socket->register_command(
4060 "scrub_purged_snaps",
4061 asok_hook,
4062 "Scrub purged_snaps vs snapmapper index");
4063 ceph_assert(r == 0);
4064
4065 // -- pg commands --
4066 // old form: ceph pg <pgid> command ...
4067 r = admin_socket->register_command(
4068 "pg " \
4069 "name=pgid,type=CephPgid " \
4070 "name=cmd,type=CephChoices,strings=query",
4071 asok_hook,
4072 "");
4073 ceph_assert(r == 0);
4074 r = admin_socket->register_command(
4075 "pg " \
4076 "name=pgid,type=CephPgid " \
4077 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4078 "name=mulcmd,type=CephChoices,strings=revert|delete",
4079 asok_hook,
4080 "");
4081 ceph_assert(r == 0);
4082 r = admin_socket->register_command(
4083 "pg " \
4084 "name=pgid,type=CephPgid " \
4085 "name=cmd,type=CephChoices,strings=list_unfound " \
4086 "name=offset,type=CephString,req=false",
4087 asok_hook,
4088 "");
4089 ceph_assert(r == 0);
4090 r = admin_socket->register_command(
4091 "pg " \
4092 "name=pgid,type=CephPgid " \
4093 "name=cmd,type=CephChoices,strings=scrub " \
4094 "name=time,type=CephInt,req=false",
4095 asok_hook,
4096 "");
4097 ceph_assert(r == 0);
4098 r = admin_socket->register_command(
4099 "pg " \
4100 "name=pgid,type=CephPgid " \
4101 "name=cmd,type=CephChoices,strings=deep_scrub " \
4102 "name=time,type=CephInt,req=false",
4103 asok_hook,
4104 "");
4105 ceph_assert(r == 0);
4106 // new form: tell <pgid> <cmd> for both cli and rest
4107 r = admin_socket->register_command(
4108 "query",
4109 asok_hook,
4110 "show details of a specific pg");
4111 ceph_assert(r == 0);
4112 r = admin_socket->register_command(
4113 "mark_unfound_lost " \
4114 "name=pgid,type=CephPgid,req=false " \
4115 "name=mulcmd,type=CephChoices,strings=revert|delete",
4116 asok_hook,
4117 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4118 ceph_assert(r == 0);
4119 r = admin_socket->register_command(
4120 "list_unfound " \
4121 "name=pgid,type=CephPgid,req=false " \
4122 "name=offset,type=CephString,req=false",
4123 asok_hook,
4124 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4125 ceph_assert(r == 0);
4126 r = admin_socket->register_command(
4127 "scrub " \
4128 "name=pgid,type=CephPgid,req=false " \
4129 "name=time,type=CephInt,req=false",
4130 asok_hook,
4131 "Trigger a scheduled scrub");
4132 ceph_assert(r == 0);
4133 r = admin_socket->register_command(
4134 "deep_scrub " \
4135 "name=pgid,type=CephPgid,req=false " \
4136 "name=time,type=CephInt,req=false",
4137 asok_hook,
4138 "Trigger a scheduled deep scrub");
4139 ceph_assert(r == 0);
4140 }
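// Editor's note: everything registered above is reachable through the
// daemon's admin socket. Illustrative invocations (assuming the default
// socket path and an OSD id of 0):
//
//   ceph daemon osd.0 status
//   ceph daemon osd.0 dump_ops_in_flight
//   ceph tell <pgid> query        # the "new form" pg commands noted above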
4141
4142 PerfCounters* OSD::create_logger()
4143 {
4144 PerfCounters* logger = build_osd_logger(cct);
4145 cct->get_perfcounters_collection()->add(logger);
4146 return logger;
4147 }
4148
4149 PerfCounters* OSD::create_recoverystate_perf()
4150 {
4151 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4152 cct->get_perfcounters_collection()->add(recoverystate_perf);
4153 return recoverystate_perf;
4154 }
4155
4156 int OSD::shutdown()
4157 {
4158 if (cct->_conf->osd_fast_shutdown) {
4159 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4160 if (cct->_conf->osd_fast_shutdown_notify_mon)
4161 service.prepare_to_stop();
4162 cct->_log->flush();
4163 _exit(0);
4164 }
4165
4166 if (!service.prepare_to_stop())
4167 return 0; // already shutting down
4168 osd_lock.lock();
4169 if (is_stopping()) {
4170 osd_lock.unlock();
4171 return 0;
4172 }
4173 dout(0) << "shutdown" << dendl;
4174
4175 set_state(STATE_STOPPING);
4176
4177 // Debugging
4178 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4179 cct->_conf.set_val("debug_osd", "100");
4180 cct->_conf.set_val("debug_journal", "100");
4181 cct->_conf.set_val("debug_filestore", "100");
4182 cct->_conf.set_val("debug_bluestore", "100");
4183 cct->_conf.set_val("debug_ms", "100");
4184 cct->_conf.apply_changes(nullptr);
4185 }
4186
4187 // stop MgrClient earlier, as it's effectively an internal consumer of the OSD
4188 mgrc.shutdown();
4189
4190 service.start_shutdown();
4191
4192 // stop sending work to pgs. this just prevents any new work in _process
4193 // from racing with on_shutdown and entering the pg after it has shut down.
4194 op_shardedwq.drain();
4195
4196 // Shutdown PGs
4197 {
4198 vector<PGRef> pgs;
4199 _get_pgs(&pgs);
4200 for (auto pg : pgs) {
4201 pg->shutdown();
4202 }
4203 }
4204
4205 // drain op queue again (in case PGs requeued something)
4206 op_shardedwq.drain();
4207 {
4208 finished.clear(); // zap waiters (bleh, this is messy)
4209 waiting_for_osdmap.clear();
4210 }
4211
4212 // unregister commands
4213 cct->get_admin_socket()->unregister_commands(asok_hook);
4214 delete asok_hook;
4215 asok_hook = NULL;
4216
4217 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4218 delete test_ops_hook;
4219 test_ops_hook = NULL;
4220
4221 osd_lock.unlock();
4222
4223 {
4224 std::lock_guard l{heartbeat_lock};
4225 heartbeat_stop = true;
4226 heartbeat_cond.notify_all();
4227 heartbeat_peers.clear();
4228 }
4229 heartbeat_thread.join();
4230
4231 hb_back_server_messenger->mark_down_all();
4232 hb_front_server_messenger->mark_down_all();
4233 hb_front_client_messenger->mark_down_all();
4234 hb_back_client_messenger->mark_down_all();
4235
4236 osd_op_tp.drain();
4237 osd_op_tp.stop();
4238 dout(10) << "op sharded tp stopped" << dendl;
4239
4240 dout(10) << "stopping agent" << dendl;
4241 service.agent_stop();
4242
4243 boot_finisher.wait_for_empty();
4244
4245 osd_lock.lock();
4246
4247 boot_finisher.stop();
4248 reset_heartbeat_peers(true);
4249
4250 tick_timer.shutdown();
4251
4252 {
4253 std::lock_guard l(tick_timer_lock);
4254 tick_timer_without_osd_lock.shutdown();
4255 }
4256
4257 // note unmount epoch
4258 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
4259 superblock.mounted = service.get_boot_epoch();
4260 superblock.clean_thru = get_osdmap_epoch();
4261 ObjectStore::Transaction t;
4262 write_superblock(t);
4263 int r = store->queue_transaction(service.meta_ch, std::move(t));
4264 if (r) {
4265 derr << "OSD::shutdown: error writing superblock: "
4266 << cpp_strerror(r) << dendl;
4267 }
4268
4269
4270 service.shutdown_reserver();
4271
4272 // Remove PGs
4273 #ifdef PG_DEBUG_REFS
4274 service.dump_live_pgids();
4275 #endif
4276 while (true) {
4277 vector<PGRef> pgs;
4278 _get_pgs(&pgs, true);
4279 if (pgs.empty()) {
4280 break;
4281 }
4282 for (auto& pg : pgs) {
4283 if (pg->is_deleted()) {
4284 continue;
4285 }
4286 dout(20) << " kicking pg " << pg << dendl;
4287 pg->lock();
4288 if (pg->get_num_ref() != 1) {
4289 derr << "pgid " << pg->get_pgid() << " has ref count of "
4290 << pg->get_num_ref() << dendl;
4291 #ifdef PG_DEBUG_REFS
4292 pg->dump_live_ids();
4293 #endif
4294 if (cct->_conf->osd_shutdown_pgref_assert) {
4295 ceph_abort();
4296 }
4297 }
4298 pg->ch.reset();
4299 pg->unlock();
4300 }
4301 }
4302 #ifdef PG_DEBUG_REFS
4303 service.dump_live_pgids();
4304 #endif
4305
4306 osd_lock.unlock();
4307 cct->_conf.remove_observer(this);
4308 osd_lock.lock();
4309
4310 service.meta_ch.reset();
4311
4312 dout(10) << "syncing store" << dendl;
4313 enable_disable_fuse(true);
4314
4315 if (cct->_conf->osd_journal_flush_on_shutdown) {
4316 dout(10) << "flushing journal" << dendl;
4317 store->flush_journal();
4318 }
4319
4320 monc->shutdown();
4321 osd_lock.unlock();
4322 {
4323 std::unique_lock l{map_lock};
4324 set_osdmap(OSDMapRef());
4325 }
4326 for (auto s : shards) {
4327 std::lock_guard l(s->osdmap_lock);
4328 s->shard_osdmap = OSDMapRef();
4329 }
4330 service.shutdown();
4331
4332 std::lock_guard lock(osd_lock);
4333 store->umount();
4334 delete store;
4335 store = nullptr;
4336 dout(10) << "Store synced" << dendl;
4337
4338 op_tracker.on_shutdown();
4339
4340 ClassHandler::get_instance().shutdown();
4341 client_messenger->shutdown();
4342 cluster_messenger->shutdown();
4343 hb_front_client_messenger->shutdown();
4344 hb_back_client_messenger->shutdown();
4345 objecter_messenger->shutdown();
4346 hb_front_server_messenger->shutdown();
4347 hb_back_server_messenger->shutdown();
4348
4349 return r;
4350 }
4351
4352 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4353 {
4354 bool created = false;
4355 while (true) {
4356 dout(10) << __func__ << " cmd: " << cmd << dendl;
4357 vector<string> vcmd{cmd};
4358 bufferlist inbl;
4359 C_SaferCond w;
4360 string outs;
4361 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4362 int r = w.wait();
4363 if (r < 0) {
4364 if (r == -ENOENT && !created) {
4365 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4366 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4367 vector<string> vnewcmd{newcmd};
4368 bufferlist inbl;
4369 C_SaferCond w;
4370 string outs;
4371 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4372 int r = w.wait();
4373 if (r < 0) {
4374 derr << __func__ << " fail: osd does not exist and create failed: "
4375 << cpp_strerror(r) << dendl;
4376 return r;
4377 }
4378 created = true;
4379 continue;
4380 }
4381 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4382 return r;
4383 }
4384 break;
4385 }
4386
4387 return 0;
4388 }
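// Editor's sketch (excluded from the build): the synchronous mon-command
// idiom used above, reduced to its essentials; all calls appear verbatim
// in the function body.
#if 0
{
  vector<string> vcmd{cmd};    // JSON command string, as built by callers
  bufferlist inbl;             // no input payload
  C_SaferCond w;               // completion object we can block on
  string outs;
  monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
  int r = w.wait();            // blocks until the mon replies; <0 on error
}
#endif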
4389
4390 int OSD::update_crush_location()
4391 {
4392 if (!cct->_conf->osd_crush_update_on_start) {
4393 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4394 return 0;
4395 }
4396
4397 char weight[32];
4398 if (cct->_conf->osd_crush_initial_weight >= 0) {
4399 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4400 } else {
4401 struct store_statfs_t st;
4402 osd_alert_list_t alerts;
4403 int r = store->statfs(&st, &alerts);
4404 if (r < 0) {
4405 derr << "statfs: " << cpp_strerror(r) << dendl;
4406 return r;
4407 }
4408 snprintf(weight, sizeof(weight), "%.4lf",
4409 std::max(.00001,
4410 double(st.total) /
4411 double(1ull << 40 /* TiB */)));
4412 }
4413
4414 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4415
4416 string cmd =
4417 string("{\"prefix\": \"osd crush create-or-move\", ") +
4418 string("\"id\": ") + stringify(whoami) + ", " +
4419 string("\"weight\":") + weight + ", " +
4420 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4421 return mon_cmd_maybe_osd_create(cmd);
4422 }
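// Editor's note: a worked example of the default-weight computation above.
// With osd_crush_initial_weight unset (<0), a device whose statfs st.total
// is 4398046511104 bytes (4 TiB) yields 4398046511104 / 2^40 = 4.0, so the
// command carries "weight": 4.0000; the std::max(.00001, ...) floor keeps
// very small devices from receiving a zero CRUSH weight.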
4423
4424 int OSD::update_crush_device_class()
4425 {
4426 if (!cct->_conf->osd_class_update_on_start) {
4427 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4428 return 0;
4429 }
4430
4431 string device_class;
4432 int r = store->read_meta("crush_device_class", &device_class);
4433 if (r < 0 || device_class.empty()) {
4434 device_class = store->get_default_device_class();
4435 }
4436
4437 if (device_class.empty()) {
4438 dout(20) << __func__ << " no device class stored locally" << dendl;
4439 return 0;
4440 }
4441
4442 string cmd =
4443 string("{\"prefix\": \"osd crush set-device-class\", ") +
4444 string("\"class\": \"") + device_class + string("\", ") +
4445 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4446
4447 r = mon_cmd_maybe_osd_create(cmd);
4448 if (r == -EBUSY) {
4449 // good, already bound to a device-class
4450 return 0;
4451 } else {
4452 return r;
4453 }
4454 }
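// Editor's note: for an OSD with whoami == 0 and a stored class of "ssd"
// (hypothetical values), the command built above is
//   {"prefix": "osd crush set-device-class", "class": "ssd", "ids": ["0"]}
// and an -EBUSY reply simply means the id is already bound to a class,
// which is why it is treated as success.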
4455
4456 void OSD::write_superblock(ObjectStore::Transaction& t)
4457 {
4458 dout(10) << "write_superblock " << superblock << dendl;
4459
4460 // hack: at minimum it's using the baseline feature set
4461 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4462 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4463
4464 bufferlist bl;
4465 encode(superblock, bl);
4466 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4467 }
4468
4469 int OSD::read_superblock()
4470 {
4471 bufferlist bl;
4472 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4473 if (r < 0)
4474 return r;
4475
4476 auto p = bl.cbegin();
4477 decode(superblock, p);
4478
4479 dout(10) << "read_superblock " << superblock << dendl;
4480
4481 return 0;
4482 }
4483
4484 void OSD::clear_temp_objects()
4485 {
4486 dout(10) << __func__ << dendl;
4487 vector<coll_t> ls;
4488 store->list_collections(ls);
4489 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4490 spg_t pgid;
4491 if (!p->is_pg(&pgid))
4492 continue;
4493
4494 // list temp objects
4495 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4496
4497 vector<ghobject_t> temps;
4498 ghobject_t next;
4499 while (1) {
4500 vector<ghobject_t> objects;
4501 auto ch = store->open_collection(*p);
4502 ceph_assert(ch);
4503 store->collection_list(ch, next, ghobject_t::get_max(),
4504 store->get_ideal_list_max(),
4505 &objects, &next);
4506 if (objects.empty())
4507 break;
4508 vector<ghobject_t>::iterator q;
4509 for (q = objects.begin(); q != objects.end(); ++q) {
4510 // Hammer set pool for temps to -1, so check for clean-up
4511 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4512 temps.push_back(*q);
4513 } else {
4514 break;
4515 }
4516 }
4517 // If we saw a non-temp object and hit the break above we can
4518 // break out of the while loop too.
4519 if (q != objects.end())
4520 break;
4521 }
4522 if (!temps.empty()) {
4523 ObjectStore::Transaction t;
4524 int removed = 0;
4525 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4526 dout(20) << " removing " << *p << " object " << *q << dendl;
4527 t.remove(*p, *q);
4528 if (++removed > cct->_conf->osd_target_transaction_size) {
4529 store->queue_transaction(service.meta_ch, std::move(t));
4530 t = ObjectStore::Transaction();
4531 removed = 0;
4532 }
4533 }
4534 if (removed) {
4535 store->queue_transaction(service.meta_ch, std::move(t));
4536 }
4537 }
4538 }
4539 }
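// Editor's sketch (excluded from the build): the transaction-batching idiom
// used by clear_temp_objects() above, and again by
// recursive_remove_collection() below, isolated for clarity. "doomed" and
// "cid" are hypothetical stand-ins for the work list and collection.
#if 0
ObjectStore::Transaction t;
int batched = 0;
for (auto& obj : doomed) {
  t.remove(cid, obj);
  // flush once the transaction grows past osd_target_transaction_size
  if (++batched > cct->_conf->osd_target_transaction_size) {
    store->queue_transaction(service.meta_ch, std::move(t));
    t = ObjectStore::Transaction();   // start a fresh transaction
    batched = 0;
  }
}
if (batched)                          // flush the final partial batch
  store->queue_transaction(service.meta_ch, std::move(t));
#endif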
4540
4541 void OSD::recursive_remove_collection(CephContext* cct,
4542 ObjectStore *store, spg_t pgid,
4543 coll_t tmp)
4544 {
4545 OSDriver driver(
4546 store,
4547 coll_t(),
4548 make_snapmapper_oid());
4549
4550 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4551 ObjectStore::Transaction t;
4552 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4553
4554 ghobject_t next;
4555 int max = cct->_conf->osd_target_transaction_size;
4556 vector<ghobject_t> objects;
4557 objects.reserve(max);
4558 while (true) {
4559 objects.clear();
4560 store->collection_list(ch, next, ghobject_t::get_max(),
4561 max, &objects, &next);
4562 generic_dout(10) << __func__ << " " << objects << dendl;
4563 if (objects.empty())
4564 break;
4565 for (auto& p: objects) {
4566 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4567 int r = mapper.remove_oid(p.hobj, &_t);
4568 if (r != 0 && r != -ENOENT)
4569 ceph_abort();
4570 t.remove(tmp, p);
4571 }
4572 int r = store->queue_transaction(ch, std::move(t));
4573 ceph_assert(r == 0);
4574 t = ObjectStore::Transaction();
4575 }
4576 t.remove_collection(tmp);
4577 int r = store->queue_transaction(ch, std::move(t));
4578 ceph_assert(r == 0);
4579
4580 C_SaferCond waiter;
4581 if (!ch->flush_commit(&waiter)) {
4582 waiter.wait();
4583 }
4584 }
4585
4586
4587 // ======================================================
4588 // PG's
4589
4590 PG* OSD::_make_pg(
4591 OSDMapRef createmap,
4592 spg_t pgid)
4593 {
4594 dout(10) << __func__ << " " << pgid << dendl;
4595 pg_pool_t pi;
4596 map<string,string> ec_profile;
4597 string name;
4598 if (createmap->have_pg_pool(pgid.pool())) {
4599 pi = *createmap->get_pg_pool(pgid.pool());
4600 name = createmap->get_pool_name(pgid.pool());
4601 if (pi.is_erasure()) {
4602 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4603 }
4604 } else {
4605 // pool was deleted; grab final pg_pool_t off disk.
4606 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4607 bufferlist bl;
4608 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4609 if (r < 0) {
4610 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4611 << dendl;
4612 return nullptr;
4613 }
4614 ceph_assert(r >= 0);
4615 auto p = bl.cbegin();
4616 decode(pi, p);
4617 decode(name, p);
4618 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4619 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4620 << " tombstone" << dendl;
4621 return nullptr;
4622 }
4623 decode(ec_profile, p);
4624 }
4625 PGPool pool(createmap, pgid.pool(), pi, name);
4626 PG *pg;
4627 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4628 pi.type == pg_pool_t::TYPE_ERASURE)
4629 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4630 else
4631 ceph_abort();
4632 return pg;
4633 }
4634
4635 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4636 {
4637 v->clear();
4638 v->reserve(get_num_pgs());
4639 for (auto& s : shards) {
4640 std::lock_guard l(s->shard_lock);
4641 for (auto& j : s->pg_slots) {
4642 if (j.second->pg &&
4643 !j.second->pg->is_deleted()) {
4644 v->push_back(j.second->pg);
4645 if (clear_too) {
4646 s->_detach_pg(j.second.get());
4647 }
4648 }
4649 }
4650 }
4651 }
4652
4653 void OSD::_get_pgids(vector<spg_t> *v)
4654 {
4655 v->clear();
4656 v->reserve(get_num_pgs());
4657 for (auto& s : shards) {
4658 std::lock_guard l(s->shard_lock);
4659 for (auto& j : s->pg_slots) {
4660 if (j.second->pg &&
4661 !j.second->pg->is_deleted()) {
4662 v->push_back(j.first);
4663 }
4664 }
4665 }
4666 }
4667
4668 void OSD::register_pg(PGRef pg)
4669 {
4670 spg_t pgid = pg->get_pgid();
4671 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4672 auto sdata = shards[shard_index];
4673 std::lock_guard l(sdata->shard_lock);
4674 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4675 ceph_assert(r.second);
4676 auto *slot = r.first->second.get();
4677 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4678 sdata->_attach_pg(slot, pg.get());
4679 }
4680
4681 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4682 {
4683 auto sdata = pg->osd_shard;
4684 ceph_assert(sdata);
4685 {
4686 std::lock_guard l(sdata->shard_lock);
4687 auto p = sdata->pg_slots.find(pg->pg_id);
4688 if (p == sdata->pg_slots.end() ||
4689 !p->second->pg) {
4690 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4691 return false;
4692 }
4693 if (p->second->waiting_for_merge_epoch) {
4694 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4695 return false;
4696 }
4697 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4698 sdata->_detach_pg(p->second.get());
4699 }
4700
4701 for (auto shard : shards) {
4702 shard->unprime_split_children(pg->pg_id, old_pg_num);
4703 }
4704
4705 // update pg count now since we might not get an osdmap any time soon.
4706 if (pg->is_primary())
4707 service.logger->dec(l_osd_pg_primary);
4708 else if (pg->is_nonprimary())
4709 service.logger->dec(l_osd_pg_replica); // misnomer
4710 else
4711 service.logger->dec(l_osd_pg_stray);
4712
4713 return true;
4714 }
4715
4716 PGRef OSD::_lookup_pg(spg_t pgid)
4717 {
4718 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4719 auto sdata = shards[shard_index];
4720 std::lock_guard l(sdata->shard_lock);
4721 auto p = sdata->pg_slots.find(pgid);
4722 if (p == sdata->pg_slots.end()) {
4723 return nullptr;
4724 }
4725 return p->second->pg;
4726 }
4727
4728 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4729 {
4730 PGRef pg = _lookup_pg(pgid);
4731 if (!pg) {
4732 return nullptr;
4733 }
4734 pg->lock();
4735 if (!pg->is_deleted()) {
4736 return pg;
4737 }
4738 pg->unlock();
4739 return nullptr;
4740 }
4741
4742 PGRef OSD::lookup_lock_pg(spg_t pgid)
4743 {
4744 return _lookup_lock_pg(pgid);
4745 }
4746
4747 void OSD::load_pgs()
4748 {
4749 ceph_assert(ceph_mutex_is_locked(osd_lock));
4750 dout(0) << "load_pgs" << dendl;
4751
4752 {
4753 auto pghist = make_pg_num_history_oid();
4754 bufferlist bl;
4755 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4756 if (r >= 0 && bl.length() > 0) {
4757 auto p = bl.cbegin();
4758 decode(pg_num_history, p);
4759 }
4760 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4761 }
4762
4763 vector<coll_t> ls;
4764 int r = store->list_collections(ls);
4765 if (r < 0) {
4766 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4767 }
4768
4769 int num = 0;
4770 for (vector<coll_t>::iterator it = ls.begin();
4771 it != ls.end();
4772 ++it) {
4773 spg_t pgid;
4774 if (it->is_temp(&pgid) ||
4775 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
4776 dout(10) << "load_pgs " << *it
4777 << " removing: legacy or flagged-for-removal pg" << dendl;
4778 recursive_remove_collection(cct, store, pgid, *it);
4779 continue;
4780 }
4781
4782 if (!it->is_pg(&pgid)) {
4783 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4784 continue;
4785 }
4786
4787 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4788 epoch_t map_epoch = 0;
4789 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
4790 if (r < 0) {
4791 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4792 << dendl;
4793 continue;
4794 }
4795
4796 PGRef pg;
4797 if (map_epoch > 0) {
4798 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4799 if (!pgosdmap) {
4800 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4801 derr << __func__ << ": could not find map for epoch " << map_epoch
4802 << " on pg " << pgid << ", but the pool is not present in the "
4803 << "current map, so this is probably a result of bug 10617. "
4804 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4805 << "to clean it up later." << dendl;
4806 continue;
4807 } else {
4808 derr << __func__ << ": have pgid " << pgid << " at epoch "
4809 << map_epoch << ", but missing map. Crashing."
4810 << dendl;
4811 ceph_abort_msg("Missing map in load_pgs");
4812 }
4813 }
4814 pg = _make_pg(pgosdmap, pgid);
4815 } else {
4816 pg = _make_pg(get_osdmap(), pgid);
4817 }
4818 if (!pg) {
4819 recursive_remove_collection(cct, store, pgid, *it);
4820 continue;
4821 }
4822
4823 // there can be no waiters here, so we don't call _wake_pg_slot
4824
4825 pg->lock();
4826 pg->ch = store->open_collection(pg->coll);
4827
4828 // read pg state, log
4829 pg->read_state(store);
4830
4831 if (pg->dne()) {
4832 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4833 pg->ch = nullptr;
4834 pg->unlock();
4835 recursive_remove_collection(cct, store, pgid, *it);
4836 continue;
4837 }
4838 {
4839 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4840 assert(NULL != shards[shard_index]);
4841 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4842 }
4843
4844 pg->reg_next_scrub();
4845
4846 dout(10) << __func__ << " loaded " << *pg << dendl;
4847 pg->unlock();
4848
4849 register_pg(pg);
4850 ++num;
4851 }
4852 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
4853 }
4854
4855
4856 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4857 const PGCreateInfo *info)
4858 {
4859 spg_t pgid = info->pgid;
4860
4861 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4862 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4863 return nullptr;
4864 }
4865
4866 PeeringCtx rctx = create_context();
4867
4868 OSDMapRef startmap = get_map(info->epoch);
4869
4870 if (info->by_mon) {
4871 int64_t pool_id = pgid.pgid.pool();
4872 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4873 if (!pool) {
4874 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4875 return nullptr;
4876 }
4877 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
4878 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4879 // this ensures we do not process old creating messages after the
4880 // pool's initial pgs have been created (and pgs are subsequently
4881 // allowed to split or merge).
4882 dout(20) << __func__ << " dropping " << pgid
4883 << " create, pool does not have CREATING flag set" << dendl;
4884 return nullptr;
4885 }
4886 }
4887
4888 int up_primary, acting_primary;
4889 vector<int> up, acting;
4890 startmap->pg_to_up_acting_osds(
4891 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4892
4893 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4894 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4895 store->get_type() != "bluestore") {
4896 clog->warn() << "pg " << pgid
4897 << " is at risk of silent data corruption: "
4898 << "the pool allows ec overwrites but is not stored in "
4899 << "bluestore, so deep scrubbing will not detect bitrot";
4900 }
4901 create_pg_collection(
4902 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4903 init_pg_ondisk(rctx.transaction, pgid, pp);
4904
4905 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
4906
4907 PGRef pg = _make_pg(startmap, pgid);
4908 pg->ch = store->create_new_collection(pg->coll);
4909
4910 {
4911 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4912 assert(NULL != shards[shard_index]);
4913 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4914 }
4915
4916 pg->lock(true);
4917
4918 // we are holding the shard lock
4919 ceph_assert(!pg->is_deleted());
4920
4921 pg->init(
4922 role,
4923 up,
4924 up_primary,
4925 acting,
4926 acting_primary,
4927 info->history,
4928 info->past_intervals,
4929 false,
4930 rctx.transaction);
4931
4932 pg->init_collection_pool_opts();
4933
4934 if (pg->is_primary()) {
4935 std::lock_guard locker{m_perf_queries_lock};
4936 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4937 }
4938
4939 pg->handle_initialize(rctx);
4940 pg->handle_activate_map(rctx);
4941
4942 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4943
4944 dout(10) << __func__ << " new pg " << *pg << dendl;
4945 return pg;
4946 }
4947
4948 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4949 spg_t pgid,
4950 bool is_mon_create)
4951 {
4952 const auto max_pgs_per_osd =
4953 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4954 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4955
4956 if (num_pgs < max_pgs_per_osd) {
4957 return false;
4958 }
4959
4960 std::lock_guard l(pending_creates_lock);
4961 if (is_mon_create) {
4962 pending_creates_from_mon++;
4963 } else {
4964 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4965 pending_creates_from_osd.emplace(pgid, is_primary);
4966 }
4967 dout(1) << __func__ << " withhold creation of pg " << pgid
4968 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4969 return true;
4970 }
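// Editor's note: a worked example of the limit above, assuming the defaults
// shipped with this release are unchanged (mon_max_pg_per_osd = 250,
// osd_max_pg_per_osd_hard_ratio = 3.0): creation is withheld once this OSD
// already hosts 250 * 3.0 = 750 PGs; resume_creating_pg() retries the
// deferred creations once PGs are removed.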
4971
4972 // to re-trigger peering, we have to twiddle the pg mapping a little bit,
4973 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls
4974 // back to the up set if pg_temp is empty, so an empty pg_temp won't work.
4975 static vector<int32_t> twiddle(const vector<int>& acting) {
4976 if (acting.size() > 1) {
4977 return {acting[0]};
4978 } else {
4979 vector<int32_t> twiddled(acting.begin(), acting.end());
4980 twiddled.push_back(-1);
4981 return twiddled;
4982 }
4983 }
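// Editor's note: e.g. twiddle({3,7}) returns {3} (drops a member) and
// twiddle({5}) returns {5,-1} (pads with -1); either way the proposed
// pg_temp differs from the current mapping, which is what forces the
// re-peering described above.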
4984
4985 void OSD::resume_creating_pg()
4986 {
4987 bool do_sub_pg_creates = false;
4988 bool have_pending_creates = false;
4989 {
4990 const auto max_pgs_per_osd =
4991 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4992 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4993 if (max_pgs_per_osd <= num_pgs) {
4994 // this could happen if the admin decreases this setting before a PG is removed
4995 return;
4996 }
4997 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4998 std::lock_guard l(pending_creates_lock);
4999 if (pending_creates_from_mon > 0) {
5000 dout(20) << __func__ << " pending_creates_from_mon "
5001 << pending_creates_from_mon << dendl;
5002 do_sub_pg_creates = true;
5003 if (pending_creates_from_mon >= spare_pgs) {
5004 spare_pgs = pending_creates_from_mon = 0;
5005 } else {
5006 spare_pgs -= pending_creates_from_mon;
5007 pending_creates_from_mon = 0;
5008 }
5009 }
5010 auto pg = pending_creates_from_osd.cbegin();
5011 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
5012 dout(20) << __func__ << " pg " << pg->first << dendl;
5013 vector<int> acting;
5014 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5015 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
5016 pg = pending_creates_from_osd.erase(pg);
5017 do_sub_pg_creates = true;
5018 spare_pgs--;
5019 }
5020 have_pending_creates = (pending_creates_from_mon > 0 ||
5021 !pending_creates_from_osd.empty());
5022 }
5023
5024 bool do_renew_subs = false;
5025 if (do_sub_pg_creates) {
5026 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5027 dout(4) << __func__ << ": resolicit pg creates from mon since "
5028 << last_pg_create_epoch << dendl;
5029 do_renew_subs = true;
5030 }
5031 }
5032 version_t start = get_osdmap_epoch() + 1;
5033 if (have_pending_creates) {
5034 // don't miss any new osdmap that deletes PGs
5035 if (monc->sub_want("osdmap", start, 0)) {
5036 dout(4) << __func__ << ": resolicit osdmap from mon since "
5037 << start << dendl;
5038 do_renew_subs = true;
5039 }
5040 } else if (do_sub_pg_creates) {
5041 // no need to subscribe to the osdmap continuously anymore
5042 // once the pg_temp and/or mon_subscribe(pg_creates) is sent
5043 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
5044 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
5045 << start << dendl;
5046 do_renew_subs = true;
5047 }
5048 }
5049
5050 if (do_renew_subs) {
5051 monc->renew_subs();
5052 }
5053
5054 service.send_pg_temp();
5055 }
5056
5057 void OSD::build_initial_pg_history(
5058 spg_t pgid,
5059 epoch_t created,
5060 utime_t created_stamp,
5061 pg_history_t *h,
5062 PastIntervals *pi)
5063 {
5064 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
5065 *h = pg_history_t(created, created_stamp);
5066
5067 OSDMapRef lastmap = service.get_map(created);
5068 int up_primary, acting_primary;
5069 vector<int> up, acting;
5070 lastmap->pg_to_up_acting_osds(
5071 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5072
5073 ostringstream debug;
5074 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
5075 OSDMapRef osdmap = service.get_map(e);
5076 int new_up_primary, new_acting_primary;
5077 vector<int> new_up, new_acting;
5078 osdmap->pg_to_up_acting_osds(
5079 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5080
5081 // this is a bit imprecise, but sufficient?
5082 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5083 const pg_pool_t *pi;
5084 bool operator()(const set<pg_shard_t> &have) const {
5085 return have.size() >= pi->min_size;
5086 }
5087 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
5088 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5089
5090 bool new_interval = PastIntervals::check_new_interval(
5091 acting_primary,
5092 new_acting_primary,
5093 acting, new_acting,
5094 up_primary,
5095 new_up_primary,
5096 up, new_up,
5097 h->same_interval_since,
5098 h->last_epoch_clean,
5099 osdmap.get(),
5100 lastmap.get(),
5101 pgid.pgid,
5102 min_size_predicate,
5103 pi,
5104 &debug);
5105 if (new_interval) {
5106 h->same_interval_since = e;
5107 if (up != new_up) {
5108 h->same_up_since = e;
5109 }
5110 if (acting_primary != new_acting_primary) {
5111 h->same_primary_since = e;
5112 }
5113 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5114 osdmap->get_pg_num(pgid.pgid.pool()),
5115 nullptr)) {
5116 h->last_epoch_split = e;
5117 }
5118 up = new_up;
5119 acting = new_acting;
5120 up_primary = new_up_primary;
5121 acting_primary = new_acting_primary;
5122 }
5123 lastmap = osdmap;
5124 }
5125 dout(20) << __func__ << " " << debug.str() << dendl;
5126 dout(10) << __func__ << " " << *h << " " << *pi
5127 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5128 pi->get_bounds()) << ")"
5129 << dendl;
5130 }
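// Editor's note: a small example of the walk above. Suppose a pg is created
// at epoch 10 with up = acting = [0,1] and the mapping changes to [0,2] at
// epoch 15: check_new_interval() flags epoch 15, so same_interval_since and
// same_up_since advance to 15, same_primary_since stays at 10 (osd.0 is
// still primary), and the closed-out [10,15) interval lands in *pi.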
5131
5132 void OSD::_add_heartbeat_peer(int p)
5133 {
5134 if (p == whoami)
5135 return;
5136 HeartbeatInfo *hi;
5137
5138 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5139 if (i == heartbeat_peers.end()) {
5140 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5141 if (!cons.first)
5142 return;
5143 assert(cons.second);
5144
5145 hi = &heartbeat_peers[p];
5146 hi->peer = p;
5147
5148 auto stamps = service.get_hb_stamps(p);
5149
5150 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5151 sb->peer = p;
5152 sb->stamps = stamps;
5153 hi->hb_interval_start = ceph_clock_now();
5154 hi->con_back = cons.first.get();
5155 hi->con_back->set_priv(sb);
5156
5157 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5158 sf->peer = p;
5159 sf->stamps = stamps;
5160 hi->con_front = cons.second.get();
5161 hi->con_front->set_priv(sf);
5162
5163 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5164 << " " << hi->con_back->get_peer_addr()
5165 << " " << hi->con_front->get_peer_addr()
5166 << dendl;
5167 } else {
5168 hi = &i->second;
5169 }
5170 hi->epoch = get_osdmap_epoch();
5171 }
5172
5173 void OSD::_remove_heartbeat_peer(int n)
5174 {
5175 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5176 ceph_assert(q != heartbeat_peers.end());
5177 dout(20) << " removing heartbeat peer osd." << n
5178 << " " << q->second.con_back->get_peer_addr()
5179 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5180 << dendl;
5181 q->second.clear_mark_down();
5182 heartbeat_peers.erase(q);
5183 }
5184
5185 void OSD::need_heartbeat_peer_update()
5186 {
5187 if (is_stopping())
5188 return;
5189 dout(20) << "need_heartbeat_peer_update" << dendl;
5190 heartbeat_set_peers_need_update();
5191 }
5192
5193 void OSD::maybe_update_heartbeat_peers()
5194 {
5195 ceph_assert(ceph_mutex_is_locked(osd_lock));
5196
5197 if (is_waiting_for_healthy() || is_active()) {
5198 utime_t now = ceph_clock_now();
5199 if (last_heartbeat_resample == utime_t()) {
5200 last_heartbeat_resample = now;
5201 heartbeat_set_peers_need_update();
5202 } else if (!heartbeat_peers_need_update()) {
5203 utime_t dur = now - last_heartbeat_resample;
5204 if (dur > cct->_conf->osd_heartbeat_grace) {
5205 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5206 heartbeat_set_peers_need_update();
5207 last_heartbeat_resample = now;
5208 // automatically clean up any stale heartbeat peers
5209 // if we are unhealthy, then clean all
5210 reset_heartbeat_peers(is_waiting_for_healthy());
5211 }
5212 }
5213 }
5214
5215 if (!heartbeat_peers_need_update())
5216 return;
5217 heartbeat_clear_peers_need_update();
5218
5219 std::lock_guard l(heartbeat_lock);
5220
5221 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5222
5223
5224 // build heartbeat from set
5225 if (is_active()) {
5226 vector<PGRef> pgs;
5227 _get_pgs(&pgs);
5228 for (auto& pg : pgs) {
5229 pg->with_heartbeat_peers([&](int peer) {
5230 if (get_osdmap()->is_up(peer)) {
5231 _add_heartbeat_peer(peer);
5232 }
5233 });
5234 }
5235 }
5236
5237 // include next and previous up osds to ensure we have a fully-connected set
5238 set<int> want, extras;
5239 const int next = get_osdmap()->get_next_up_osd_after(whoami);
5240 if (next >= 0)
5241 want.insert(next);
5242 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
5243 if (prev >= 0 && prev != next)
5244 want.insert(prev);
5245
5246 // make sure we have at least **min_down** osds coming from different
5247 // subtree-level buckets (e.g., hosts) for fast failure detection.
5248 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5249 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5250 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5251 get_osdmap()->get_random_up_osds_by_subtree(
5252 whoami, subtree, limit, want, &want);
5253
5254 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5255 dout(10) << " adding neighbor peer osd." << *p << dendl;
5256 extras.insert(*p);
5257 _add_heartbeat_peer(*p);
5258 }
5259
5260 // remove down peers; enumerate extras
5261 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5262 while (p != heartbeat_peers.end()) {
5263 if (!get_osdmap()->is_up(p->first)) {
5264 int o = p->first;
5265 ++p;
5266 _remove_heartbeat_peer(o);
5267 continue;
5268 }
5269 if (p->second.epoch < get_osdmap_epoch()) {
5270 extras.insert(p->first);
5271 }
5272 ++p;
5273 }
5274
5275 // too few?
5276 for (int n = next; n >= 0; ) {
5277 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5278 break;
5279 if (!extras.count(n) && !want.count(n) && n != whoami) {
5280 dout(10) << " adding random peer osd." << n << dendl;
5281 extras.insert(n);
5282 _add_heartbeat_peer(n);
5283 }
5284 n = get_osdmap()->get_next_up_osd_after(n);
5285 if (n == next)
5286 break; // came full circle; stop
5287 }
5288
5289 // too many?
5290 for (set<int>::iterator p = extras.begin();
5291 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5292 ++p) {
5293 if (want.count(*p))
5294 continue;
5295 _remove_heartbeat_peer(*p);
5296 }
5297
5298 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5299
5300 // clean up stale failure pending
5301 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5302 if (heartbeat_peers.count(it->first) == 0) {
5303 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5304 failure_pending.erase(it++);
5305 } else {
5306 it++;
5307 }
5308 }
5309 }
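// Editor's note: with up OSDs {1,4,7,9} and whoami == 4 (hypothetical), the
// "fully-connected" seeds above are next = 7 and prev = 1; the subtree
// sampling then tops the set up to at least
// max(mon_osd_min_down_reporters, osd_heartbeat_min_peers) peers drawn
// from distinct buckets at mon_osd_reporter_subtree_level (host by default).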
5310
5311 void OSD::reset_heartbeat_peers(bool all)
5312 {
5313 ceph_assert(ceph_mutex_is_locked(osd_lock));
5314 dout(10) << "reset_heartbeat_peers" << dendl;
5315 utime_t stale = ceph_clock_now();
5316 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5317 std::lock_guard l(heartbeat_lock);
5318 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5319 auto& [peer, hi] = *it;
5320 if (all || hi.is_stale(stale)) {
5321 hi.clear_mark_down();
5322 // stop sending failure_report to mon too
5323 failure_queue.erase(peer);
5324 failure_pending.erase(peer);
5325 it = heartbeat_peers.erase(it);
5326 } else {
5327 ++it;
5328 }
5329 }
5330 }
5331
5332 void OSD::handle_osd_ping(MOSDPing *m)
5333 {
5334 if (superblock.cluster_fsid != m->fsid) {
5335 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5336 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5337 << dendl;
5338 m->put();
5339 return;
5340 }
5341
5342 int from = m->get_source().num();
5343
5344 heartbeat_lock.lock();
5345 if (is_stopping()) {
5346 heartbeat_lock.unlock();
5347 m->put();
5348 return;
5349 }
5350
5351 utime_t now = ceph_clock_now();
5352 auto mnow = service.get_mnow();
5353 ConnectionRef con(m->get_connection());
5354 OSDMapRef curmap = service.get_osdmap();
5355 if (!curmap) {
5356 heartbeat_lock.unlock();
5357 m->put();
5358 return;
5359 }
5360
5361 auto sref = con->get_priv();
5362 Session *s = static_cast<Session*>(sref.get());
5363 if (!s) {
5364 heartbeat_lock.unlock();
5365 m->put();
5366 return;
5367 }
5368 if (!s->stamps) {
5369 s->peer = from;
5370 s->stamps = service.get_hb_stamps(from);
5371 }
5372
5373 switch (m->op) {
5374
5375 case MOSDPing::PING:
5376 {
5377 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5378 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5379 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5380 if (heartbeat_drop->second == 0) {
5381 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5382 } else {
5383 --heartbeat_drop->second;
5384 dout(5) << "Dropping heartbeat from " << from
5385 << ", " << heartbeat_drop->second
5386 << " remaining to drop" << dendl;
5387 break;
5388 }
5389 } else if (cct->_conf->osd_debug_drop_ping_probability >
5390 ((((double)(rand()%100))/100.0))) {
5391 heartbeat_drop =
5392 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5393 cct->_conf->osd_debug_drop_ping_duration)).first;
5394 dout(5) << "Dropping heartbeat from " << from
5395 << ", " << heartbeat_drop->second
5396 << " remaining to drop" << dendl;
5397 break;
5398 }
5399 }
5400
5401 ceph::signedspan sender_delta_ub{};
5402 s->stamps->got_ping(
5403 m->up_from,
5404 mnow,
5405 m->mono_send_stamp,
5406 m->delta_ub,
5407 &sender_delta_ub);
5408 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5409
5410 if (!cct->get_heartbeat_map()->is_healthy()) {
5411 dout(10) << "internal heartbeat not healthy, dropping ping request"
5412 << dendl;
5413 break;
5414 }
5415
5416 Message *r = new MOSDPing(monc->get_fsid(),
5417 curmap->get_epoch(),
5418 MOSDPing::PING_REPLY,
5419 m->ping_stamp,
5420 m->mono_ping_stamp,
5421 mnow,
5422 service.get_up_epoch(),
5423 cct->_conf->osd_heartbeat_min_size,
5424 sender_delta_ub);
5425 con->send_message(r);
5426
5427 if (curmap->is_up(from)) {
5428 if (is_active()) {
5429 ConnectionRef cluster_con = service.get_con_osd_cluster(
5430 from, curmap->get_epoch());
5431 if (cluster_con) {
5432 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5433 }
5434 }
5435 } else if (!curmap->exists(from) ||
5436 curmap->get_down_at(from) > m->map_epoch) {
5437 // tell them they have died
5438 Message *r = new MOSDPing(monc->get_fsid(),
5439 curmap->get_epoch(),
5440 MOSDPing::YOU_DIED,
5441 m->ping_stamp,
5442 m->mono_ping_stamp,
5443 mnow,
5444 service.get_up_epoch(),
5445 cct->_conf->osd_heartbeat_min_size);
5446 con->send_message(r);
5447 }
5448 }
5449 break;
5450
5451 case MOSDPing::PING_REPLY:
5452 {
5453 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5454 if (i != heartbeat_peers.end()) {
5455 auto acked = i->second.ping_history.find(m->ping_stamp);
5456 if (acked != i->second.ping_history.end()) {
5457 int &unacknowledged = acked->second.second;
5458 if (con == i->second.con_back) {
5459 dout(25) << "handle_osd_ping got reply from osd." << from
5460 << " first_tx " << i->second.first_tx
5461 << " last_tx " << i->second.last_tx
5462 << " last_rx_back " << i->second.last_rx_back
5463 << " -> " << now
5464 << " last_rx_front " << i->second.last_rx_front
5465 << dendl;
5466 i->second.last_rx_back = now;
5467 ceph_assert(unacknowledged > 0);
5468 --unacknowledged;
5469 // if there is no front con, set both stamps.
5470 if (i->second.con_front == NULL) {
5471 i->second.last_rx_front = now;
5472 ceph_assert(unacknowledged > 0);
5473 --unacknowledged;
5474 }
5475 } else if (con == i->second.con_front) {
5476 dout(25) << "handle_osd_ping got reply from osd." << from
5477 << " first_tx " << i->second.first_tx
5478 << " last_tx " << i->second.last_tx
5479 << " last_rx_back " << i->second.last_rx_back
5480 << " last_rx_front " << i->second.last_rx_front
5481 << " -> " << now
5482 << dendl;
5483 i->second.last_rx_front = now;
5484 ceph_assert(unacknowledged > 0);
5485 --unacknowledged;
5486 }
5487
5488 if (unacknowledged == 0) {
5489 // succeeded in getting all replies
5490 dout(25) << "handle_osd_ping got all replies from osd." << from
5491 << " , erase pending ping(sent at " << m->ping_stamp << ")"
5492 << " and older pending ping(s)"
5493 << dendl;
5494
5495 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5496 ++i->second.hb_average_count;
5497 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5498 i->second.hb_total_back += back_pingtime;
5499 if (back_pingtime < i->second.hb_min_back)
5500 i->second.hb_min_back = back_pingtime;
5501 if (back_pingtime > i->second.hb_max_back)
5502 i->second.hb_max_back = back_pingtime;
5503 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5504 i->second.hb_total_front += front_pingtime;
5505 if (front_pingtime < i->second.hb_min_front)
5506 i->second.hb_min_front = front_pingtime;
5507 if (front_pingtime > i->second.hb_max_front)
5508 i->second.hb_max_front = front_pingtime;
5509
5510 // hb_interval_start is set when the ping is queued, so it must be
5511 // valid here (the old defensive re-initialization was unreachable).
5512 ceph_assert(i->second.hb_interval_start != utime_t());
5513 int64_t hb_avg_time_period = 60;
5514 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5515 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5516 }
5517 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5518 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5519 uint32_t back_min = i->second.hb_min_back;
5520 uint32_t back_max = i->second.hb_max_back;
5521 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5522 uint32_t front_min = i->second.hb_min_front;
5523 uint32_t front_max = i->second.hb_max_front;
5524
5525 // Reset for new interval
5526 i->second.hb_average_count = 0;
5527 i->second.hb_interval_start = now;
5528 i->second.hb_total_back = i->second.hb_max_back = 0;
5529 i->second.hb_min_back = UINT_MAX;
5530 i->second.hb_total_front = i->second.hb_max_front = 0;
5531 i->second.hb_min_front = UINT_MAX;
5532
5533 // Record per-OSD, per-interface ping times
5534 // Based on osd_heartbeat_interval, ignoring that it is randomly shorter than this interval
5535 if (i->second.hb_back_pingtime.size() == 0) {
5536 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5537 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5538 i->second.hb_back_pingtime.push_back(back_avg);
5539 i->second.hb_back_min.push_back(back_min);
5540 i->second.hb_back_max.push_back(back_max);
5541 i->second.hb_front_pingtime.push_back(front_avg);
5542 i->second.hb_front_min.push_back(front_min);
5543 i->second.hb_front_max.push_back(front_max);
5544 ++i->second.hb_index;
5545 }
5546 } else {
5547 int index = i->second.hb_index & (hb_vector_size - 1);
5548 i->second.hb_back_pingtime[index] = back_avg;
5549 i->second.hb_back_min[index] = back_min;
5550 i->second.hb_back_max[index] = back_max;
5551 i->second.hb_front_pingtime[index] = front_avg;
5552 i->second.hb_front_min[index] = front_min;
5553 i->second.hb_front_max[index] = front_max;
5554 ++i->second.hb_index;
5555 }
5556
5557 {
5558 std::lock_guard l(service.stat_lock);
5559 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5560 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5561
5562 uint32_t total = 0;
5563 uint32_t min = UINT_MAX;
5564 uint32_t max = 0;
5565 uint32_t count = 0;
5566 uint32_t which = 0;
5567 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5568 for (int32_t k = size - 1 ; k >= 0; --k) {
5569 ++count;
5570 int index = (i->second.hb_index + k) % size;
5571 total += i->second.hb_back_pingtime[index];
5572 if (i->second.hb_back_min[index] < min)
5573 min = i->second.hb_back_min[index];
5574 if (i->second.hb_back_max[index] > max)
5575 max = i->second.hb_back_max[index];
5576 if (count == 1 || count == 5 || count == 15) {
5577 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5578 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5579 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5580 which++;
5581 if (count == 15)
5582 break;
5583 }
5584 }
5585
5586 if (i->second.con_front != NULL) {
5587 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5588
5589 total = 0;
5590 min = UINT_MAX;
5591 max = 0;
5592 count = 0;
5593 which = 0;
5594 for (int32_t k = size - 1 ; k >= 0; --k) {
5595 ++count;
5596 int index = (i->second.hb_index + k) % size;
5597 total += i->second.hb_front_pingtime[index];
5598 if (i->second.hb_front_min[index] < min)
5599 min = i->second.hb_front_min[index];
5600 if (i->second.hb_front_max[index] > max)
5601 max = i->second.hb_front_max[index];
5602 if (count == 1 || count == 5 || count == 15) {
5603 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5604 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5605 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5606 which++;
5607 if (count == 15)
5608 break;
5609 }
5610 }
5611 }
5612 }
5613 } else {
5614 std::lock_guard l(service.stat_lock);
5615 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5616 if (i->second.con_front != NULL)
5617 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5618 }
5619 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5620 }
5621
5622 if (i->second.is_healthy(now)) {
5623 // Cancel false reports
5624 auto failure_queue_entry = failure_queue.find(from);
5625 if (failure_queue_entry != failure_queue.end()) {
5626 dout(10) << "handle_osd_ping canceling queued "
5627 << "failure report for osd." << from << dendl;
5628 failure_queue.erase(failure_queue_entry);
5629 }
5630
5631 auto failure_pending_entry = failure_pending.find(from);
5632 if (failure_pending_entry != failure_pending.end()) {
5633 dout(10) << "handle_osd_ping canceling in-flight "
5634 << "failure report for osd." << from << dendl;
5635 send_still_alive(curmap->get_epoch(),
5636 from,
5637 failure_pending_entry->second.second);
5638 failure_pending.erase(failure_pending_entry);
5639 }
5640 }
5641 } else {
5642 // old replies, deprecated by newly sent pings.
5643 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
5644 << ") is found, treat as covered by newly sent pings "
5645 << "and ignore"
5646 << dendl;
5647 }
5648 }
5649
5650 if (m->map_epoch &&
5651 curmap->is_up(from)) {
5652 if (is_active()) {
5653 ConnectionRef cluster_con = service.get_con_osd_cluster(
5654 from, curmap->get_epoch());
5655 if (cluster_con) {
5656 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5657 }
5658 }
5659 }
5660
5661 s->stamps->got_ping_reply(
5662 mnow,
5663 m->mono_send_stamp,
5664 m->delta_ub);
5665 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5666 }
5667 break;
5668
5669 case MOSDPing::YOU_DIED:
5670 dout(10) << "handle_osd_ping " << m->get_source_inst()
5671 << " says i am down in " << m->map_epoch << dendl;
5672 osdmap_subscribe(curmap->get_epoch()+1, false);
5673 break;
5674 }
5675
5676 heartbeat_lock.unlock();
5677 m->put();
5678 }
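
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. The ROUND_S_TO_USEC macro used above
// converts a seconds-as-double delta to the nearest whole microsecond:
static inline uint32_t example_round_s_to_usec(double sec)
{
  // scale seconds to microseconds, rounding half up like the macro
  return static_cast<uint32_t>(sec * 1000 * 1000 + 0.5);
}
// e.g. example_round_s_to_usec(0.0123456) == 12346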
5679
5680 void OSD::heartbeat_entry()
5681 {
5682 std::unique_lock l(heartbeat_lock);
5683 if (is_stopping())
5684 return;
5685 while (!heartbeat_stop) {
5686 heartbeat();
5687
5688 double wait;
5689 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5690 wait = (float)cct->_conf->osd_heartbeat_interval;
5691 } else {
5692 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5693 }
5694 auto w = ceph::make_timespan(wait);
5695 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5696 heartbeat_cond.wait_for(l, w);
5697 if (is_stopping())
5698 return;
5699 dout(30) << "heartbeat_entry woke up" << dendl;
5700 }
5701 }
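
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. The randomized sleep above samples
// wait = 0.5 + (rand() % 10)/10 * osd_heartbeat_interval, i.e. anywhere
// from 0.5s up to 0.5s + 0.9x the configured interval:
static inline double example_heartbeat_wait(double interval_s, bool randomize)
{
  if (!randomize)
    return interval_s;  // debug_disable_randomized_ping behavior
  return 0.5 + (rand() % 10) / 10.0 * interval_s;
}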
5702
5703 void OSD::heartbeat_check()
5704 {
5705 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5706 utime_t now = ceph_clock_now();
5707
5708 // check for incoming heartbeats (move me elsewhere?)
5709 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5710 p != heartbeat_peers.end();
5711 ++p) {
5712
5713 if (p->second.first_tx == utime_t()) {
5714 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5715 << " yet, skipping" << dendl;
5716 continue;
5717 }
5718
5719 dout(25) << "heartbeat_check osd." << p->first
5720 << " first_tx " << p->second.first_tx
5721 << " last_tx " << p->second.last_tx
5722 << " last_rx_back " << p->second.last_rx_back
5723 << " last_rx_front " << p->second.last_rx_front
5724 << dendl;
5725 if (p->second.is_unhealthy(now)) {
5726 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5727 if (p->second.last_rx_back == utime_t() ||
5728 p->second.last_rx_front == utime_t()) {
5729 derr << "heartbeat_check: no reply from "
5730 << p->second.con_front->get_peer_addr().get_sockaddr()
5731 << " osd." << p->first
5732 << " ever on either front or back, first ping sent "
5733 << p->second.first_tx
5734 << " (oldest deadline " << oldest_deadline << ")"
5735 << dendl;
5736 // fail
5737 failure_queue[p->first] = p->second.first_tx;
5738 } else {
5739 derr << "heartbeat_check: no reply from "
5740 << p->second.con_front->get_peer_addr().get_sockaddr()
5741 << " osd." << p->first << " since back " << p->second.last_rx_back
5742 << " front " << p->second.last_rx_front
5743 << " (oldest deadline " << oldest_deadline << ")"
5744 << dendl;
5745 // fail
5746 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5747 }
5748 }
5749 }
5750 }
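
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical, and plain doubles stand in for utime_t.
// When a peer is unhealthy, heartbeat_check records *since when* it has
// been silent: the first ping if it never replied on either interface,
// otherwise the older of the two last-reply stamps:
static inline double example_failed_since(double first_tx,
                                          double last_rx_back,
                                          double last_rx_front)
{
  if (last_rx_back == 0 || last_rx_front == 0)
    return first_tx;  // never heard back on at least one interface
  return std::min(last_rx_back, last_rx_front);  // silent since the older reply
}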
5751
5752 void OSD::heartbeat()
5753 {
5754 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5755 dout(30) << "heartbeat" << dendl;
5756
5757 // get CPU load avg
5758 double loadavgs[1];
5759 int hb_interval = cct->_conf->osd_heartbeat_interval;
5760 int n_samples = 86400;
5761 if (hb_interval > 1) {
5762 n_samples /= hb_interval;
5763 if (n_samples < 1)
5764 n_samples = 1;
5765 }
5766
5767 if (getloadavg(loadavgs, 1) == 1) {
5768 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5769 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5770 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5771 }
5772
5773 dout(30) << "heartbeat checking stats" << dendl;
5774
5775 // refresh peer list and osd stats
5776 vector<int> hb_peers;
5777 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5778 p != heartbeat_peers.end();
5779 ++p)
5780 hb_peers.push_back(p->first);
5781
5782 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5783 dout(5) << __func__ << " " << new_stat << dendl;
5784 ceph_assert(new_stat.statfs.total);
5785
5786 float pratio;
5787 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5788
5789 service.check_full_status(ratio, pratio);
5790
5791 utime_t now = ceph_clock_now();
5792 auto mnow = service.get_mnow();
5793 utime_t deadline = now;
5794 deadline += cct->_conf->osd_heartbeat_grace;
5795
5796 // send heartbeats
5797 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5798 i != heartbeat_peers.end();
5799 ++i) {
5800 int peer = i->first;
5801 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5802 if (!s) {
5803 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5804 continue;
5805 }
5806 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5807
5808 i->second.last_tx = now;
5809 if (i->second.first_tx == utime_t())
5810 i->second.first_tx = now;
5811 i->second.ping_history[now] = make_pair(deadline,
5812 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5813 if (i->second.hb_interval_start == utime_t())
5814 i->second.hb_interval_start = now;
5815
5816 std::optional<ceph::signedspan> delta_ub;
5817 s->stamps->sent_ping(&delta_ub);
5818
5819 i->second.con_back->send_message(
5820 new MOSDPing(monc->get_fsid(),
5821 service.get_osdmap_epoch(),
5822 MOSDPing::PING,
5823 now,
5824 mnow,
5825 mnow,
5826 service.get_up_epoch(),
5827 cct->_conf->osd_heartbeat_min_size,
5828 delta_ub));
5829
5830 if (i->second.con_front)
5831 i->second.con_front->send_message(
5832 new MOSDPing(monc->get_fsid(),
5833 service.get_osdmap_epoch(),
5834 MOSDPing::PING,
5835 now,
5836 mnow,
5837 mnow,
5838 service.get_up_epoch(),
5839 cct->_conf->osd_heartbeat_min_size,
5840 delta_ub));
5841 }
5842
5843 logger->set(l_osd_hb_to, heartbeat_peers.size());
5844
5845 // hmm.. am i all alone?
5846 dout(30) << "heartbeat lonely?" << dendl;
5847 if (heartbeat_peers.empty()) {
5848 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5849 last_mon_heartbeat = now;
5850 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5851 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5852 }
5853 }
5854
5855 dout(30) << "heartbeat done" << dendl;
5856 }
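
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. daily_loadavg above is an exponential
// moving average sized so that roughly one day of heartbeat samples
// (86400 / osd_heartbeat_interval) carries the weight:
static inline double example_daily_loadavg(double prev_avg, double sample,
                                           int n_samples)
{
  // each new sample contributes 1/n_samples to the running average
  return (prev_avg * (n_samples - 1) + sample) / n_samples;
}
// e.g. example_daily_loadavg(1.0, 2.0, 4) == 1.25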
5857
5858 bool OSD::heartbeat_reset(Connection *con)
5859 {
5860 std::lock_guard l(heartbeat_lock);
5861 auto s = con->get_priv();
5862 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
5863 con->set_priv(nullptr);
5864 if (s) {
5865 if (is_stopping()) {
5866 return true;
5867 }
5868 auto session = static_cast<Session*>(s.get());
5869 auto p = heartbeat_peers.find(session->peer);
5870 if (p != heartbeat_peers.end() &&
5871 (p->second.con_back == con ||
5872 p->second.con_front == con)) {
5873 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5874 << ", reopening" << dendl;
5875 p->second.clear_mark_down(con);
5876 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5877 if (newcon.first) {
5878 p->second.con_back = newcon.first.get();
5879 p->second.con_back->set_priv(s);
5880 if (newcon.second) {
5881 p->second.con_front = newcon.second.get();
5882 p->second.con_front->set_priv(s);
5883 }
5884 p->second.ping_history.clear();
5885 } else {
5886 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5887 << ", raced with osdmap update, closing out peer" << dendl;
5888 heartbeat_peers.erase(p);
5889 }
5890 } else {
5891 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5892 }
5893 }
5894 return true;
5895 }
5896
5897
5898
5899 // =========================================
5900
5901 void OSD::tick()
5902 {
5903 ceph_assert(ceph_mutex_is_locked(osd_lock));
5904 dout(10) << "tick" << dendl;
5905
5906 utime_t now = ceph_clock_now();
5907 // throw out any obsolete markdown log
5908 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5909 while (!osd_markdown_log.empty() &&
5910 osd_markdown_log.front() + grace < now)
5911 osd_markdown_log.pop_front();
5912
5913 if (is_active() || is_waiting_for_healthy()) {
5914 maybe_update_heartbeat_peers();
5915 }
5916
5917 if (is_waiting_for_healthy()) {
5918 start_boot();
5919 }
5920
5921 if (is_waiting_for_healthy() || is_booting()) {
5922 std::lock_guard l(heartbeat_lock);
5923 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5924 last_mon_heartbeat = now;
5925 dout(1) << __func__ << " checking mon for new map" << dendl;
5926 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5927 }
5928 }
5929
5930 do_waiters();
5931
5932 // scrub purged_snaps periodically, based on osd_scrub_min_interval
5933 {
5934 const utime_t last = superblock.last_purged_snaps_scrub;
5935 utime_t next = last;
5936 next += cct->_conf->osd_scrub_min_interval;
5937 std::mt19937 rng;
5938 // use a seed that is stable for each scrub interval, but varies
5939 // by OSD to avoid any herds.
5940 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5941 double r = (rng() % 1024) / 1024.0;  // floating-point division, not integer
5942 next +=
5943 cct->_conf->osd_scrub_min_interval *
5944 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5945 if (next < ceph_clock_now()) {
5946 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5947 << " next " << next << " ... now" << dendl;
5948 scrub_purged_snaps();
5949 } else {
5950 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5951 << " next " << next << dendl;
5952 }
5953 }
5954
5955 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5956 }
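
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical, and plain doubles stand in for utime_t.
// The purged_snaps scrub above is due once last + min_interval + jitter
// has passed, where the jitter is a deterministic per-OSD fraction of
// min_interval * randomize_ratio so that OSDs don't scrub in lock-step:
static inline bool example_purged_snaps_scrub_due(double last, double now,
                                                  double min_interval,
                                                  double randomize_ratio,
                                                  int whoami)
{
  std::mt19937 rng;
  rng.seed(whoami + static_cast<unsigned>(last));  // stable per interval & OSD
  double r = (rng() % 1024) / 1024.0;              // uniform-ish in [0, 1)
  return last + min_interval + min_interval * randomize_ratio * r < now;
}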
5957
5958 void OSD::tick_without_osd_lock()
5959 {
5960 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
5961 dout(10) << "tick_without_osd_lock" << dendl;
5962
5963 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
5964 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
5965 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
5966
5967 // refresh osd stats
5968 struct store_statfs_t stbuf;
5969 osd_alert_list_t alerts;
5970 int r = store->statfs(&stbuf, &alerts);
5971 ceph_assert(r == 0);
5972 service.set_statfs(stbuf, alerts);
5973
5974 // osd_lock is not being held, which means the OSD state
5975 // might change when doing the monitor report
5976 if (is_active() || is_waiting_for_healthy()) {
5977 {
5978 std::lock_guard l{heartbeat_lock};
5979 heartbeat_check();
5980 }
5981 map_lock.lock_shared();
5982 std::lock_guard l(mon_report_lock);
5983
5984 // mon report?
5985 utime_t now = ceph_clock_now();
5986 if (service.need_fullness_update() ||
5987 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
5988 last_mon_report = now;
5989 send_full_update();
5990 send_failures();
5991 }
5992 map_lock.unlock_shared();
5993
5994 epoch_t max_waiting_epoch = 0;
5995 for (auto s : shards) {
5996 max_waiting_epoch = std::max(max_waiting_epoch,
5997 s->get_max_waiting_epoch());
5998 }
5999 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6000 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6001 << ", requesting new map" << dendl;
6002 osdmap_subscribe(superblock.newest_map + 1, false);
6003 }
6004 }
6005
6006 if (is_active()) {
6007 if (!scrub_random_backoff()) {
6008 sched_scrub();
6009 }
6010 service.promote_throttle_recalibrate();
6011 resume_creating_pg();
6012 bool need_send_beacon = false;
6013 const auto now = ceph::coarse_mono_clock::now();
6014 {
6015 // borrow the lec lock to protect last_sent_beacon from changing
6016 std::lock_guard l{min_last_epoch_clean_lock};
6017 const auto elapsed = now - last_sent_beacon;
6018 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
6019 cct->_conf->osd_beacon_report_interval) {
6020 need_send_beacon = true;
6021 }
6022 }
6023 if (need_send_beacon) {
6024 send_beacon(now);
6025 }
6026 }
6027
6028 mgrc.update_daemon_health(get_health_metrics());
6029 service.kick_recovery_queue();
6030 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6031 new C_Tick_WithoutOSDLock(this));
6032 }
6033
6034 // Usage:
6035 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6036 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6037 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6038 // getomap <pool> [namespace/]<obj-name>
6039 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6040 // injectmdataerr [namespace/]<obj-name> [shardid]
6041 // injectdataerr [namespace/]<obj-name> [shardid]
6042 //
6043 // set_recovery_delay [utime]
6044 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6045 std::string_view command,
6046 const cmdmap_t& cmdmap, ostream &ss)
6047 {
6048 // Test support: change the omap on a single OSD by using the admin
6049 // socket to directly request that the OSD make the change.
6051 if (command == "setomapval" || command == "rmomapkey" ||
6052 command == "setomapheader" || command == "getomap" ||
6053 command == "truncobj" || command == "injectmdataerr" ||
6054 command == "injectdataerr"
6055 ) {
6056 pg_t rawpg;
6057 int64_t pool;
6058 OSDMapRef curmap = service->get_osdmap();
6059 int r = -1;
6060
6061 string poolstr;
6062
6063 cmd_getval(cmdmap, "pool", poolstr);
6064 pool = curmap->lookup_pg_pool_name(poolstr);
6065 // If we can't find it by name, then maybe an id was specified
6066 if (pool < 0 && isdigit(poolstr[0]))
6067 pool = atoll(poolstr.c_str());
6068 if (pool < 0) {
6069 ss << "Invalid pool '" << poolstr << "''";
6070 return;
6071 }
6072
6073 string objname, nspace;
6074 cmd_getval(cmdmap, "objname", objname);
6075 std::size_t found = objname.find_first_of('/');
6076 if (found != string::npos) {
6077 nspace = objname.substr(0, found);
6078 objname = objname.substr(found+1);
6079 }
6080 object_locator_t oloc(pool, nspace);
6081 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6082
6083 if (r < 0) {
6084 ss << "Invalid namespace/objname";
6085 return;
6086 }
6087
6088 int64_t shardid;
6089 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
6090 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6091 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6092 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6093 if (curmap->pg_is_ec(rawpg)) {
6094 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6095 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6096 return;
6097 }
6098 }
6099
6100 ObjectStore::Transaction t;
6101
6102 if (command == "setomapval") {
6103 map<string, bufferlist> newattrs;
6104 bufferlist val;
6105 string key, valstr;
6106 cmd_getval(cmdmap, "key", key);
6107 cmd_getval(cmdmap, "val", valstr);
6108
6109 val.append(valstr);
6110 newattrs[key] = val;
6111 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6112 r = store->queue_transaction(service->meta_ch, std::move(t));
6113 if (r < 0)
6114 ss << "error=" << r;
6115 else
6116 ss << "ok";
6117 } else if (command == "rmomapkey") {
6118 string key;
6119 cmd_getval(cmdmap, "key", key);
6120
6121 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6122 r = store->queue_transaction(service->meta_ch, std::move(t));
6123 if (r < 0)
6124 ss << "error=" << r;
6125 else
6126 ss << "ok";
6127 } else if (command == "setomapheader") {
6128 bufferlist newheader;
6129 string headerstr;
6130
6131 cmd_getval(cmdmap, "header", headerstr);
6132 newheader.append(headerstr);
6133 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6134 r = store->queue_transaction(service->meta_ch, std::move(t));
6135 if (r < 0)
6136 ss << "error=" << r;
6137 else
6138 ss << "ok";
6139 } else if (command == "getomap") {
6140 // Debug: output the entire omap
6141 bufferlist hdrbl;
6142 map<string, bufferlist> keyvals;
6143 auto ch = store->open_collection(coll_t(pgid));
6144 if (!ch) {
6145 ss << "unable to open collection for " << pgid;
6146 r = -ENOENT;
6147 } else {
6148 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6149 if (r >= 0) {
6150 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6151 for (map<string, bufferlist>::iterator it = keyvals.begin();
6152 it != keyvals.end(); ++it)
6153 ss << " key=" << (*it).first << " val="
6154 << string((*it).second.c_str(), (*it).second.length());
6155 } else {
6156 ss << "error=" << r;
6157 }
6158 }
6159 } else if (command == "truncobj") {
6160 int64_t trunclen;
6161 cmd_getval(cmdmap, "len", trunclen);
6162 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6163 r = store->queue_transaction(service->meta_ch, std::move(t));
6164 if (r < 0)
6165 ss << "error=" << r;
6166 else
6167 ss << "ok";
6168 } else if (command == "injectdataerr") {
6169 store->inject_data_error(gobj);
6170 ss << "ok";
6171 } else if (command == "injectmdataerr") {
6172 store->inject_mdata_error(gobj);
6173 ss << "ok";
6174 }
6175 return;
6176 }
6177 if (command == "set_recovery_delay") {
6178 int64_t delay;
6179 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6180 ostringstream oss;
6181 oss << delay;
6182 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6183 oss.str().c_str());
6184 if (r != 0) {
6185 ss << "set_recovery_delay: error setting "
6186 << "osd_recovery_delay_start to '" << delay << "': error "
6187 << r;
6188 return;
6189 }
6190 service->cct->_conf.apply_changes(nullptr);
6191 ss << "set_recovery_delay: set osd_recovery_delay_start "
6192 << "to " << service->cct->_conf->osd_recovery_delay_start;
6193 return;
6194 }
6195 if (command == "injectfull") {
6196 int64_t count;
6197 string type;
6198 OSDService::s_names state;
6199 cmd_getval(cmdmap, "type", type, string("full"));
6200 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6201 if (type == "none" || count == 0) {
6202 type = "none";
6203 count = 0;
6204 }
6205 state = service->get_full_state(type);
6206 if (state == OSDService::s_names::INVALID) {
6207 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6208 return;
6209 }
6210 service->set_injectfull(state, count);
6211 return;
6212 }
6213 ss << "Internal error - command=" << command;
6214 }
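
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. test_ops accepts objects in the form
// "[namespace/]<obj-name>" and splits on the first '/': everything before
// it is the namespace, the remainder is the object name:
static inline std::pair<std::string, std::string>
example_split_ns_obj(const std::string& arg)
{
  auto slash = arg.find_first_of('/');
  if (slash == std::string::npos)
    return {"", arg};  // no namespace given
  return {arg.substr(0, slash), arg.substr(slash + 1)};
}
// example_split_ns_obj("ns1/obj") == {"ns1", "obj"}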
6215
6216 // =========================================
6217
6218 void OSD::ms_handle_connect(Connection *con)
6219 {
6220 dout(10) << __func__ << " con " << con << dendl;
6221 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6222 std::lock_guard l(osd_lock);
6223 if (is_stopping())
6224 return;
6225 dout(10) << __func__ << " on mon" << dendl;
6226
6227 if (is_preboot()) {
6228 start_boot();
6229 } else if (is_booting()) {
6230 _send_boot(); // resend boot message
6231 } else {
6232 map_lock.lock_shared();
6233 std::lock_guard l2(mon_report_lock);
6234
6235 utime_t now = ceph_clock_now();
6236 last_mon_report = now;
6237
6238 // resend everything, it's a new session
6239 send_full_update();
6240 send_alive();
6241 service.requeue_pg_temp();
6242 service.clear_sent_ready_to_merge();
6243 service.send_pg_temp();
6244 service.send_ready_to_merge();
6245 service.send_pg_created();
6246 requeue_failures();
6247 send_failures();
6248
6249 map_lock.unlock_shared();
6250 if (is_active()) {
6251 send_beacon(ceph::coarse_mono_clock::now());
6252 }
6253 }
6254
6255 // full map requests may happen while active or pre-boot
6256 if (requested_full_first) {
6257 rerequest_full_maps();
6258 }
6259 }
6260 }
6261
6262 void OSD::ms_handle_fast_connect(Connection *con)
6263 {
6264 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6265 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6266 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6267 s = ceph::make_ref<Session>(cct, con);
6268 con->set_priv(s);
6269 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6270 << " addr=" << s->con->get_peer_addr() << dendl;
6271 // we don't connect to clients
6272 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6273 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6274 }
6275 }
6276 }
6277
6278 void OSD::ms_handle_fast_accept(Connection *con)
6279 {
6280 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6281 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6282 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6283 s = ceph::make_ref<Session>(cct, con);
6284 con->set_priv(s);
6285 dout(10) << "new session (incoming)" << s << " con=" << con
6286 << " addr=" << con->get_peer_addr()
6287 << " must have raced with connect" << dendl;
6288 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6289 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6290 }
6291 }
6292 }
6293
6294 bool OSD::ms_handle_reset(Connection *con)
6295 {
6296 auto session = ceph::ref_cast<Session>(con->get_priv());
6297 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
6298 if (!session)
6299 return false;
6300 session->wstate.reset(con);
6301 session->con->set_priv(nullptr);
6302 session->con.reset(); // break con <-> session ref cycle
6303 // note that we break session->con *before* the session_handle_reset
6304 // cleanup below. this avoids a race between us and
6305 // PG::add_backoff, Session::check_backoff, etc.
6306 session_handle_reset(session);
6307 return true;
6308 }
6309
6310 bool OSD::ms_handle_refused(Connection *con)
6311 {
6312 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6313 return false;
6314
6315 auto session = ceph::ref_cast<Session>(con->get_priv());
6316 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6317 if (!session)
6318 return false;
6319 int type = con->get_peer_type();
6320 // handle only OSD failures here
6321 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6322 OSDMapRef osdmap = get_osdmap();
6323 if (osdmap) {
6324 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6325 if (id >= 0 && osdmap->is_up(id)) {
6326 // This cheats the mon's heartbeat grace logic, since we know the peer
6327 // is not going to respawn on its own. +1 so we won't hit any boundary case.
6328 monc->send_mon_message(
6329 new MOSDFailure(
6330 monc->get_fsid(),
6331 id,
6332 osdmap->get_addrs(id),
6333 cct->_conf->osd_heartbeat_grace + 1,
6334 osdmap->get_epoch(),
6335 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6336 ));
6337 }
6338 }
6339 }
6340 return true;
6341 }
6342
6343 struct CB_OSD_GetVersion {
6344 OSD *osd;
6345 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6346 void operator ()(boost::system::error_code ec, version_t newest,
6347 version_t oldest) {
6348 if (!ec)
6349 osd->_got_mon_epochs(oldest, newest);
6350 }
6351 };
6352
6353 void OSD::start_boot()
6354 {
6355 if (!_is_healthy()) {
6356 // if we are not healthy, do not mark ourselves up (yet)
6357 dout(1) << "not healthy; waiting to boot" << dendl;
6358 if (!is_waiting_for_healthy())
6359 start_waiting_for_healthy();
6360 // send pings sooner rather than later
6361 heartbeat_kick();
6362 return;
6363 }
6364 dout(1) << __func__ << dendl;
6365 set_state(STATE_PREBOOT);
6366 dout(10) << "start_boot - have maps " << superblock.oldest_map
6367 << ".." << superblock.newest_map << dendl;
6368 monc->get_version("osdmap", CB_OSD_GetVersion(this));
6369 }
6370
6371 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6372 {
6373 std::lock_guard l(osd_lock);
6374 if (is_preboot()) {
6375 _preboot(oldest, newest);
6376 }
6377 }
6378
6379 void OSD::_preboot(epoch_t oldest, epoch_t newest)
6380 {
6381 ceph_assert(is_preboot());
6382 dout(10) << __func__ << " _preboot mon has osdmaps "
6383 << oldest << ".." << newest << dendl;
6384
6385 // ensure our local fullness awareness is accurate
6386 {
6387 std::lock_guard l(heartbeat_lock);
6388 heartbeat();
6389 }
6390
6391 const auto& monmap = monc->monmap;
6392 const auto osdmap = get_osdmap();
6393 // if our map is within recent history, try to add ourselves to the osdmap.
6394 if (osdmap->get_epoch() == 0) {
6395 derr << "waiting for initial osdmap" << dendl;
6396 } else if (osdmap->is_destroyed(whoami)) {
6397 derr << "osdmap says I am destroyed" << dendl;
6398 // provide a small margin so we don't livelock seeing if we
6399 // un-destroyed ourselves.
6400 if (osdmap->get_epoch() > newest - 1) {
6401 exit(0);
6402 }
6403 } else if (osdmap->is_noup(whoami)) {
6404 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6405 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6406 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6407 << dendl;
6408 } else if (service.need_fullness_update()) {
6409 derr << "osdmap fullness state needs update" << dendl;
6410 send_full_update();
6411 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6412 superblock.purged_snaps_last < superblock.current_epoch) {
6413 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6414 << " < newest_map " << superblock.current_epoch << dendl;
6415 _get_purged_snaps();
6416 } else if (osdmap->get_epoch() >= oldest - 1 &&
6417 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
6418
6419 // wait for pgs to fully catch up in a different thread, since
6420 // this thread might be required for splitting and merging PGs to
6421 // make progress.
6422 boot_finisher.queue(
6423 new LambdaContext(
6424 [this](int r) {
6425 std::unique_lock l(osd_lock);
6426 if (is_preboot()) {
6427 dout(10) << __func__ << " waiting for peering work to drain"
6428 << dendl;
6429 l.unlock();
6430 for (auto shard : shards) {
6431 shard->wait_min_pg_epoch(get_osdmap_epoch());
6432 }
6433 l.lock();
6434 }
6435 if (is_preboot()) {
6436 _send_boot();
6437 }
6438 }));
6439 return;
6440 }
6441
6442 // get all the latest maps
6443 if (osdmap->get_epoch() + 1 >= oldest)
6444 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6445 else
6446 osdmap_subscribe(oldest - 1, true);
6447 }
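
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* names are hypothetical. The catch-up logic above requests
// incrementals from our next epoch while it still falls inside the mon's
// retained range; if we are too far behind it re-subscribes from
// oldest - 1 (force_request = true), which yields a full map because the
// older incrementals have been trimmed:
struct example_map_sub { epoch_t start_epoch; bool force_request; };
static inline example_map_sub
example_choose_map_subscription(epoch_t our_epoch, epoch_t oldest)
{
  if (our_epoch + 1 >= oldest)
    return {our_epoch + 1, false};  // incrementals can bridge the gap
  return {oldest - 1, true};        // too far behind; start from a full map
}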
6448
6449 void OSD::_get_purged_snaps()
6450 {
6451 // NOTE: this is a naive, stateless implementation. It may send multiple
6452 // overlapping requests to the mon, which will be somewhat inefficient, but
6453 // it should be reliable.
6454 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6455 << ", newest_map " << superblock.current_epoch << dendl;
6456 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6457 superblock.purged_snaps_last + 1,
6458 superblock.current_epoch + 1);
6459 monc->send_mon_message(m);
6460 }
6461
6462 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6463 {
6464 dout(10) << __func__ << " " << *m << dendl;
6465 ObjectStore::Transaction t;
6466 if (!is_preboot() ||
6467 m->last < superblock.purged_snaps_last) {
6468 goto out;
6469 }
6470 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6471 make_purged_snaps_oid(), &t,
6472 m->purged_snaps);
6473 superblock.purged_snaps_last = m->last;
6474 write_superblock(t);
6475 store->queue_transaction(
6476 service.meta_ch,
6477 std::move(t));
6478 service.publish_superblock(superblock);
6479 if (m->last < superblock.current_epoch) {
6480 _get_purged_snaps();
6481 } else {
6482 start_boot();
6483 }
6484 out:
6485 m->put();
6486 }
6487
6488 void OSD::send_full_update()
6489 {
6490 if (!service.need_fullness_update())
6491 return;
6492 unsigned state = 0;
6493 if (service.is_full()) {
6494 state = CEPH_OSD_FULL;
6495 } else if (service.is_backfillfull()) {
6496 state = CEPH_OSD_BACKFILLFULL;
6497 } else if (service.is_nearfull()) {
6498 state = CEPH_OSD_NEARFULL;
6499 }
6500 set<string> s;
6501 OSDMap::calc_state_set(state, s);
6502 dout(10) << __func__ << " want state " << s << dendl;
6503 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6504 }
6505
6506 void OSD::start_waiting_for_healthy()
6507 {
6508 dout(1) << "start_waiting_for_healthy" << dendl;
6509 set_state(STATE_WAITING_FOR_HEALTHY);
6510 last_heartbeat_resample = utime_t();
6511
6512 // subscribe to osdmap updates, in case our peers really are known to be dead
6513 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6514 }
6515
6516 bool OSD::_is_healthy()
6517 {
6518 if (!cct->get_heartbeat_map()->is_healthy()) {
6519 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6520 return false;
6521 }
6522
6523 if (is_waiting_for_healthy()) {
6524 utime_t now = ceph_clock_now();
6525 if (osd_markdown_log.empty()) {
6526 dout(5) << __func__ << " force returning true since the last markdown"
6527 << " was more than " << cct->_conf->osd_max_markdown_period
6528 << "s ago" << dendl;
6529 return true;
6530 }
6531 std::lock_guard l(heartbeat_lock);
6532 int num = 0, up = 0;
6533 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6534 p != heartbeat_peers.end();
6535 ++p) {
6536 if (p->second.is_healthy(now))
6537 ++up;
6538 ++num;
6539 }
6540 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6541 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6542 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6543 return false;
6544 }
6545 }
6546
6547 return true;
6548 }
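
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. While waiting_for_healthy, the check
// above refuses to boot unless at least osd_heartbeat_min_healthy_ratio
// of the heartbeat peers look healthy:
static inline bool example_enough_healthy_peers(int up, int num,
                                                float min_healthy_ratio)
{
  // with no peers the test passes vacuously (0 >= 0)
  return (float)up >= (float)num * min_healthy_ratio;
}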
6549
6550 void OSD::_send_boot()
6551 {
6552 dout(10) << "_send_boot" << dendl;
6553 Connection *local_connection =
6554 cluster_messenger->get_loopback_connection().get();
6555 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6556 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6557 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6558 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6559
6560 dout(20) << " initial client_addrs " << client_addrs
6561 << ", cluster_addrs " << cluster_addrs
6562 << ", hb_back_addrs " << hb_back_addrs
6563 << ", hb_front_addrs " << hb_front_addrs
6564 << dendl;
6565 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6566 dout(10) << " assuming cluster_addrs match client_addrs "
6567 << client_addrs << dendl;
6568 cluster_addrs = cluster_messenger->get_myaddrs();
6569 }
6570 if (auto session = local_connection->get_priv(); !session) {
6571 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6572 }
6573
6574 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6575 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6576 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6577 << cluster_addrs << dendl;
6578 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6579 }
6580 if (auto session = local_connection->get_priv(); !session) {
6581 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6582 }
6583
6584 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6585 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6586 dout(10) << " assuming hb_front_addrs match client_addrs "
6587 << client_addrs << dendl;
6588 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6589 }
6590 if (auto session = local_connection->get_priv(); !session) {
6591 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6592 }
6593
6594 // we now know what our front and back addrs will be, and we are
6595 // about to tell the mon our metadata (including numa bindings),
6596 // so now is a good time!
6597 set_numa_affinity();
6598
6599 MOSDBoot *mboot = new MOSDBoot(
6600 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6601 hb_back_addrs, hb_front_addrs, cluster_addrs,
6602 CEPH_FEATURES_ALL);
6603 dout(10) << " final client_addrs " << client_addrs
6604 << ", cluster_addrs " << cluster_addrs
6605 << ", hb_back_addrs " << hb_back_addrs
6606 << ", hb_front_addrs " << hb_front_addrs
6607 << dendl;
6608 _collect_metadata(&mboot->metadata);
6609 monc->send_mon_message(mboot);
6610 set_state(STATE_BOOTING);
6611 }
6612
6613 void OSD::_collect_metadata(map<string,string> *pm)
6614 {
6615 // config info
6616 (*pm)["osd_data"] = dev_path;
6617 if (store->get_type() == "filestore") {
6618 // not applicable for bluestore
6619 (*pm)["osd_journal"] = journal_path;
6620 }
6621 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6622 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6623 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6624 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
6625
6626 // backend
6627 (*pm)["osd_objectstore"] = store->get_type();
6628 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6629 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6630 (*pm)["default_device_class"] = store->get_default_device_class();
6631 string osdspec_affinity;
6632 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6633 if (r < 0 || osdspec_affinity.empty()) {
6634 osdspec_affinity = "";
6635 }
6636 (*pm)["osdspec_affinity"] = osdspec_affinity;
6637 store->collect_metadata(pm);
6638
6639 collect_sys_info(pm, cct);
6640
6641 (*pm)["front_iface"] = pick_iface(
6642 cct,
6643 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6644 (*pm)["back_iface"] = pick_iface(
6645 cct,
6646 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6647
6648 // network numa
6649 {
6650 int node = -1;
6651 set<int> nodes;
6652 set<string> unknown;
6653 for (auto nm : { "front_iface", "back_iface" }) {
6654 if (!(*pm)[nm].size()) {
6655 unknown.insert(nm);
6656 continue;
6657 }
6658 int n = -1;
6659 int r = get_iface_numa_node((*pm)[nm], &n);
6660 if (r < 0) {
6661 unknown.insert((*pm)[nm]);
6662 continue;
6663 }
6664 nodes.insert(n);
6665 if (node < 0) {
6666 node = n;
6667 }
6668 }
6669 if (unknown.size()) {
6670 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6671 }
6672 if (!nodes.empty()) {
6673 (*pm)["network_numa_nodes"] = stringify(nodes);
6674 }
6675 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6676 (*pm)["network_numa_node"] = stringify(node);
6677 }
6678 }
6679
6680 if (numa_node >= 0) {
6681 (*pm)["numa_node"] = stringify(numa_node);
6682 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6683 &numa_cpu_set);
6684 }
6685
6686 set<string> devnames;
6687 store->get_devices(&devnames);
6688 map<string,string> errs;
6689 get_device_metadata(devnames, pm, &errs);
6690 for (auto& i : errs) {
6691 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
6692 }
6693 dout(10) << __func__ << " " << *pm << dendl;
6694 }
6695
6696 void OSD::queue_want_up_thru(epoch_t want)
6697 {
6698 std::shared_lock map_locker{map_lock};
6699 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6700 std::lock_guard report_locker(mon_report_lock);
6701 if (want > up_thru_wanted) {
6702 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6703 << ", currently " << cur
6704 << dendl;
6705 up_thru_wanted = want;
6706 send_alive();
6707 } else {
6708 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6709 << ", currently " << cur
6710 << dendl;
6711 }
6712 }
6713
6714 void OSD::send_alive()
6715 {
6716 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6717 const auto osdmap = get_osdmap();
6718 if (!osdmap->exists(whoami))
6719 return;
6720 epoch_t up_thru = osdmap->get_up_thru(whoami);
6721 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6722 if (up_thru_wanted > up_thru) {
6723 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6724 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6725 }
6726 }
6727
6728 void OSD::request_full_map(epoch_t first, epoch_t last)
6729 {
6730 dout(10) << __func__ << " " << first << ".." << last
6731 << ", previously requested "
6732 << requested_full_first << ".." << requested_full_last << dendl;
6733 ceph_assert(ceph_mutex_is_locked(osd_lock));
6734 ceph_assert(first > 0 && last > 0);
6735 ceph_assert(first <= last);
6736 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6737 if (requested_full_first == 0) {
6738 // first request
6739 requested_full_first = first;
6740 requested_full_last = last;
6741 } else if (last <= requested_full_last) {
6742 // dup
6743 return;
6744 } else {
6745 // additional request
6746 first = requested_full_last + 1;
6747 requested_full_last = last;
6748 }
6749 MMonGetOSDMap *req = new MMonGetOSDMap;
6750 req->request_full(first, last);
6751 monc->send_mon_message(req);
6752 }
6753
6754 void OSD::got_full_map(epoch_t e)
6755 {
6756 ceph_assert(requested_full_first <= requested_full_last);
6757 ceph_assert(ceph_mutex_is_locked(osd_lock));
6758 if (requested_full_first == 0) {
6759 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6760 return;
6761 }
6762 if (e < requested_full_first) {
6763 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6764 << ".." << requested_full_last
6765 << ", ignoring" << dendl;
6766 return;
6767 }
6768 if (e >= requested_full_last) {
6769 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6770 << ".." << requested_full_last << ", resetting" << dendl;
6771 requested_full_first = requested_full_last = 0;
6772 return;
6773 }
6774
6775 requested_full_first = e + 1;
6776
6777 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6778 << ".." << requested_full_last
6779 << ", still need more" << dendl;
6780 }
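
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. request_full_map/got_full_map maintain a
// single in-flight window [requested_full_first, requested_full_last]; each
// arriving full map advances the left edge, and the window resets to (0, 0)
// once the right edge is reached:
static inline void example_advance_full_window(epoch_t got,
                                               epoch_t& first, epoch_t& last)
{
  if (first == 0 || got < first)
    return;            // nothing requested, or a stale epoch: ignore
  if (got >= last) {
    first = last = 0;  // window satisfied; reset
  } else {
    first = got + 1;   // still waiting for (got, last]
  }
}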
6781
6782 void OSD::requeue_failures()
6783 {
6784 std::lock_guard l(heartbeat_lock);
6785 unsigned old_queue = failure_queue.size();
6786 unsigned old_pending = failure_pending.size();
6787 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6788 failure_queue[p->first] = p->second.first;
6789 failure_pending.erase(p++);
6790 }
6791 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6792 << failure_queue.size() << dendl;
6793 }
6794
6795 void OSD::send_failures()
6796 {
6797 ceph_assert(ceph_mutex_is_locked(map_lock));
6798 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6799 std::lock_guard l(heartbeat_lock);
6800 utime_t now = ceph_clock_now();
6801 const auto osdmap = get_osdmap();
6802 while (!failure_queue.empty()) {
6803 int osd = failure_queue.begin()->first;
6804 if (!failure_pending.count(osd)) {
6805 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6806 monc->send_mon_message(
6807 new MOSDFailure(
6808 monc->get_fsid(),
6809 osd,
6810 osdmap->get_addrs(osd),
6811 failed_for,
6812 osdmap->get_epoch()));
6813 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6814 osdmap->get_addrs(osd));
6815 }
6816 failure_queue.erase(osd);
6817 }
6818 }
6819
6820 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6821 {
6822 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6823 MOSDFailure::FLAG_ALIVE);
6824 monc->send_mon_message(m);
6825 }
6826
6827 void OSD::cancel_pending_failures()
6828 {
6829 std::lock_guard l(heartbeat_lock);
6830 auto it = failure_pending.begin();
6831 while (it != failure_pending.end()) {
6832 dout(10) << __func__ << " canceling in-flight failure report for osd."
6833 << it->first << dendl;
6834 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6835 failure_pending.erase(it++);
6836 }
6837 }
6838
6839 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6840 {
6841 const auto& monmap = monc->monmap;
6842 // we may be asked to send a beacon just after connecting, before the
6843 // monmap has been initialized; only send once we know the mons support it.
6844 if (monmap.epoch > 0 &&
6845 monmap.get_required_features().contains_all(
6846 ceph::features::mon::FEATURE_LUMINOUS)) {
6847 dout(20) << __func__ << " sending" << dendl;
6848 MOSDBeacon* beacon = nullptr;
6849 {
6850 std::lock_guard l{min_last_epoch_clean_lock};
6851 beacon = new MOSDBeacon(get_osdmap_epoch(),
6852 min_last_epoch_clean,
6853 superblock.last_purged_snaps_scrub,
6854 cct->_conf->osd_beacon_report_interval);
6855 beacon->pgs = min_last_epoch_clean_pgs;
6856 last_sent_beacon = now;
6857 }
6858 monc->send_mon_message(beacon);
6859 } else {
6860 dout(20) << __func__ << " not sending" << dendl;
6861 }
6862 }
6863
6864 void OSD::handle_command(MCommand *m)
6865 {
6866 ConnectionRef con = m->get_connection();
6867 auto session = ceph::ref_cast<Session>(con->get_priv());
6868 if (!session) {
6869 con->send_message(new MCommandReply(m, -EACCES));
6870 m->put();
6871 return;
6872 }
6873 if (!session->caps.allow_all()) {
6874 con->send_message(new MCommandReply(m, -EACCES));
6875 m->put();
6876 return;
6877 }
6878 cct->get_admin_socket()->queue_tell_command(m);
6879 m->put();
6880 }
6881
6882 namespace {
6883 class unlock_guard {
6884 ceph::mutex& m;
6885 public:
6886 explicit unlock_guard(ceph::mutex& mutex)
6887 : m(mutex)
6888 {
6889 m.unlock();
6890 }
6891 unlock_guard(unlock_guard&) = delete;
6892 ~unlock_guard() {
6893 m.lock();
6894 }
6895 };
6896 }
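
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. unlock_guard is the inverse of
// std::lock_guard: it *releases* an already-held mutex for the enclosing
// scope and re-acquires it on destruction, much as scrub_purged_snaps
// below does by hand with osd_lock:
static inline void example_drop_lock_for_scope(ceph::mutex& m)
{
  std::lock_guard l{m};
  // ... work that requires m ...
  {
    unlock_guard u{m};  // m released for this scope
    // ... call out to code that must not hold m ...
  }                     // m re-acquired here
}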
6897
6898 void OSD::scrub_purged_snaps()
6899 {
6900 dout(10) << __func__ << dendl;
6901 ceph_assert(ceph_mutex_is_locked(osd_lock));
6902 SnapMapper::Scrubber s(cct, store, service.meta_ch,
6903 make_snapmapper_oid(),
6904 make_purged_snaps_oid());
6905 clog->debug() << "purged_snaps scrub starts";
6906 osd_lock.unlock();
6907 s.run();
6908 if (s.stray.size()) {
6909 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
6910 } else {
6911 clog->debug() << "purged_snaps scrub ok";
6912 }
6913 set<pair<spg_t,snapid_t>> queued;
6914 for (auto& [pool, snap, hash, shard] : s.stray) {
6915 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
6916 if (!pi) {
6917 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
6918 continue;
6919 }
6920 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
6921 spg_t spgid(pgid, shard);
6922 pair<spg_t,snapid_t> p(spgid, snap);
6923 if (queued.count(p)) {
6924 dout(20) << __func__ << " pg " << spgid << " snap " << snap
6925 << " already queued" << dendl;
6926 continue;
6927 }
6928 PGRef pg = lookup_lock_pg(spgid);
6929 if (!pg) {
6930 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
6931 continue;
6932 }
6933 queued.insert(p);
6934 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
6935 << snap << dendl;
6936 pg->queue_snap_retrim(snap);
6937 pg->unlock();
6938 }
6939 osd_lock.lock();
6940 if (is_stopping()) {
6941 return;
6942 }
6943 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
6944 ObjectStore::Transaction t;
6945 superblock.last_purged_snaps_scrub = ceph_clock_now();
6946 write_superblock(t);
6947 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
6948 ceph_assert(tr == 0);
6949 if (is_active()) {
6950 send_beacon(ceph::coarse_mono_clock::now());
6951 }
6952 dout(10) << __func__ << " done" << dendl;
6953 }
6954
6955 void OSD::probe_smart(const string& only_devid, ostream& ss)
6956 {
6957 set<string> devnames;
6958 store->get_devices(&devnames);
6959 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6960 "osd_smart_report_timeout");
6961
6962 // == typedef std::map<std::string, mValue> mObject;
6963 json_spirit::mObject json_map;
6964
6965 for (auto dev : devnames) {
6966 // smartctl works only on physical devices; filter out any logical device
6967 if (dev.find("dm-") == 0) {
6968 continue;
6969 }
6970
6971 string err;
6972 string devid = get_device_id(dev, &err);
6973 if (devid.size() == 0) {
6974 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6975 << err << "), skipping" << dendl;
6976 continue;
6977 }
6978 if (only_devid.size() && devid != only_devid) {
6979 continue;
6980 }
6981
6982 json_spirit::mValue smart_json;
6983 if (block_device_get_metrics(dev, smart_timeout,
6984 &smart_json)) {
6985 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6986 continue;
6987 }
6988 json_map[devid] = smart_json;
6989 }
6990 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6991 }
6992
6993 bool OSD::heartbeat_dispatch(Message *m)
6994 {
6995 dout(30) << "heartbeat_dispatch " << m << dendl;
6996 switch (m->get_type()) {
6997
6998 case CEPH_MSG_PING:
6999 dout(10) << "ping from " << m->get_source_inst() << dendl;
7000 m->put();
7001 break;
7002
7003 case MSG_OSD_PING:
7004 handle_osd_ping(static_cast<MOSDPing*>(m));
7005 break;
7006
7007 default:
7008 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7009 m->put();
7010 }
7011
7012 return true;
7013 }
7014
7015 bool OSD::ms_dispatch(Message *m)
7016 {
7017 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7018 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7019 service.got_stop_ack();
7020 m->put();
7021 return true;
7022 }
7023
7024 // lock!
7025
7026 osd_lock.lock();
7027 if (is_stopping()) {
7028 osd_lock.unlock();
7029 m->put();
7030 return true;
7031 }
7032
7033 do_waiters();
7034 _dispatch(m);
7035
7036 osd_lock.unlock();
7037
7038 return true;
7039 }
7040
7041 void OSDService::maybe_share_map(
7042 Connection *con,
7043 const OSDMapRef& osdmap,
7044 epoch_t peer_epoch_lb)
7045 {
7046 // NOTE: we assume the caller holds something that keeps the Connection itself
7047 // pinned (e.g., an OpRequest's MessageRef).
7048 auto session = ceph::ref_cast<Session>(con->get_priv());
7049 if (!session) {
7050 return;
7051 }
7052
7053 // assume the peer has the newer of the op's sent_epoch and what
7054 // we think we sent them.
7055 session->sent_epoch_lock.lock();
7056 if (peer_epoch_lb > session->last_sent_epoch) {
7057 dout(10) << __func__ << " con " << con
7058 << " " << con->get_peer_addr()
7059 << " map epoch " << session->last_sent_epoch
7060 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7061 session->last_sent_epoch = peer_epoch_lb;
7062 }
7063 epoch_t last_sent_epoch = session->last_sent_epoch;
7064 session->sent_epoch_lock.unlock();
7065
7066 if (osdmap->get_epoch() <= last_sent_epoch) {
7067 return;
7068 }
7069
7070 send_incremental_map(last_sent_epoch, con, osdmap);
7071 last_sent_epoch = osdmap->get_epoch();
7072
7073 session->sent_epoch_lock.lock();
7074 if (session->last_sent_epoch < last_sent_epoch) {
7075 dout(10) << __func__ << " con " << con
7076 << " " << con->get_peer_addr()
7077 << " map epoch " << session->last_sent_epoch
7078 << " -> " << last_sent_epoch << " (shared)" << dendl;
7079 session->last_sent_epoch = last_sent_epoch;
7080 }
7081 session->sent_epoch_lock.unlock();
7082 }
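
// Editor's note: illustrative sketch, not part of the OSD build; the
// example_* name is hypothetical. maybe_share_map keeps a per-session
// high-water mark (last_sent_epoch) under a small lock, drops the lock for
// the expensive send, then raises the mark only if no racer already did:
static inline void example_raise_watermark(ceph::mutex& lock, epoch_t& mark,
                                           epoch_t candidate)
{
  std::lock_guard l{lock};
  if (mark < candidate)
    mark = candidate;  // only ever move forward
}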
7083
7084 void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7085 {
7086 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7087
7088 auto i = session->waiting_on_map.begin();
7089 while (i != session->waiting_on_map.end()) {
7090 OpRequestRef op = &(*i);
7091 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7092 auto m = op->get_req<MOSDFastDispatchOp>();
7093 if (m->get_min_epoch() > osdmap->get_epoch()) {
7094 break;
7095 }
7096 session->waiting_on_map.erase(i++);
7097 op->put();
7098
7099 spg_t pgid;
7100 if (m->get_type() == CEPH_MSG_OSD_OP) {
7101 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7102 static_cast<const MOSDOp*>(m)->get_pg());
7103 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7104 continue;
7105 }
7106 } else {
7107 pgid = m->get_spg();
7108 }
7109 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7110 }
7111
7112 if (session->waiting_on_map.empty()) {
7113 clear_session_waiting_on_map(session);
7114 } else {
7115 register_session_waiting_on_map(session);
7116 }
7117 }
7118
7119 void OSD::ms_fast_dispatch(Message *m)
7120 {
7121
7122 #ifdef HAVE_JAEGER
7123 jaeger_tracing::init_tracer("osd-services-reinit");
7124 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl;
7125 auto dispatch_span = jaeger_tracing::new_span(__func__);
7126 #endif
7127 FUNCTRACE(cct);
7128 if (service.is_stopping()) {
7129 m->put();
7130 return;
7131 }
7132
7133 // peering event?
7134 switch (m->get_type()) {
7135 case CEPH_MSG_PING:
7136 dout(10) << "ping from " << m->get_source() << dendl;
7137 m->put();
7138 return;
7139 case MSG_OSD_FORCE_RECOVERY:
7140 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7141 return;
7142 case MSG_OSD_SCRUB2:
7143 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7144 return;
7145
7146 case MSG_OSD_PG_CREATE2:
7147 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7148 case MSG_OSD_PG_QUERY:
7149 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7150 case MSG_OSD_PG_NOTIFY:
7151 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7152 case MSG_OSD_PG_INFO:
7153 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7154 case MSG_OSD_PG_REMOVE:
7155 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7156
7157 // these are single-pg messages that handle themselves
7158 case MSG_OSD_PG_LOG:
7159 case MSG_OSD_PG_TRIM:
7160 case MSG_OSD_PG_NOTIFY2:
7161 case MSG_OSD_PG_QUERY2:
7162 case MSG_OSD_PG_INFO2:
7163 case MSG_OSD_BACKFILL_RESERVE:
7164 case MSG_OSD_RECOVERY_RESERVE:
7165 case MSG_OSD_PG_LEASE:
7166 case MSG_OSD_PG_LEASE_ACK:
7167 {
7168 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7169 if (require_osd_peer(pm)) {
7170 enqueue_peering_evt(
7171 pm->get_spg(),
7172 PGPeeringEventRef(pm->get_event()));
7173 }
7174 pm->put();
7175 return;
7176 }
7177 }
7178
7179 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7180 {
7181 #ifdef WITH_LTTNG
7182 osd_reqid_t reqid = op->get_reqid();
7183 #endif
7184 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7185 reqid.name._num, reqid.tid, reqid.inc);
7186 }
7187 #ifdef HAVE_JAEGER
7188 op->set_osd_parent_span(dispatch_span);
7189 if (op->osd_parent_span) {
7190 auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span);
7191 op->set_osd_parent_span(op_req_span);
7192 }
7193 #endif
7194 if (m->trace)
7195 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7196
7197 // note sender epoch, min req's epoch
7198 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7199 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7200 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7201
7202 service.maybe_inject_dispatch_delay();
7203
7204 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7205 m->get_type() != CEPH_MSG_OSD_OP) {
7206 // queue it directly
7207 enqueue_op(
7208 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7209 std::move(op),
7210 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7211 } else {
7212 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7213 // message that didn't have an explicit spg_t); we need to map
7214 // it to an spg_t while preserving delivery order.
7215 auto priv = m->get_connection()->get_priv();
7216 if (auto session = static_cast<Session*>(priv.get()); session) {
7217 std::lock_guard l{session->session_dispatch_lock};
7218 op->get();
7219 session->waiting_on_map.push_back(*op);
7220 OSDMapRef nextmap = service.get_nextmap_reserved();
7221 dispatch_session_waiting(session, nextmap);
7222 service.release_map(nextmap);
7223 }
7224 }
7225 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7226 }
7227
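// Called once a connection completes authentication: attach (or reuse)
// a Session and populate its caps from the peer's AuthCapsInfo.
// Returns 1 when a caps string decodes and parses, -EACCES on a
// decode or parse failure, and 0 otherwise (e.g. allow_all).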
7228 int OSD::ms_handle_authentication(Connection *con)
7229 {
7230 int ret = 0;
7231 auto s = ceph::ref_cast<Session>(con->get_priv());
7232 if (!s) {
7233 s = ceph::make_ref<Session>(cct, con);
7234 con->set_priv(s);
7235 s->entity_name = con->get_peer_entity_name();
7236 dout(10) << __func__ << " new session " << s << " con " << s->con
7237 << " entity " << s->entity_name
7238 << " addr " << con->get_peer_addrs() << dendl;
7239 } else {
7240 dout(10) << __func__ << " existing session " << s << " con " << s->con
7241 << " entity " << s->entity_name
7242 << " addr " << con->get_peer_addrs() << dendl;
7243 }
7244
7245 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7246 if (caps_info.allow_all) {
7247 s->caps.set_allow_all();
7248 } else if (caps_info.caps.length() > 0) {
7249 bufferlist::const_iterator p = caps_info.caps.cbegin();
7250 string str;
7251 try {
7252 decode(str, p);
7253 }
7254 catch (ceph::buffer::error& e) {
7255 dout(10) << __func__ << " session " << s << " " << s->entity_name
7256 << " failed to decode caps string" << dendl;
7257 ret = -EACCES;
7258 }
7259 if (!ret) {
7260 bool success = s->caps.parse(str);
7261 if (success) {
7262 dout(10) << __func__ << " session " << s
7263 << " " << s->entity_name
7264 << " has caps " << s->caps << " '" << str << "'" << dendl;
7265 ret = 1;
7266 } else {
7267 dout(10) << __func__ << " session " << s << " " << s->entity_name
7268 << " failed to parse caps '" << str << "'" << dendl;
7269 ret = -EACCES;
7270 }
7271 }
7272 }
7273 return ret;
7274 }
7275
7276 void OSD::do_waiters()
7277 {
7278 ceph_assert(ceph_mutex_is_locked(osd_lock));
7279
7280 dout(10) << "do_waiters -- start" << dendl;
7281 while (!finished.empty()) {
7282 OpRequestRef next = finished.front();
7283 finished.pop_front();
7284 dispatch_op(next);
7285 }
7286 dout(10) << "do_waiters -- finish" << dendl;
7287 }
7288
7289 void OSD::dispatch_op(OpRequestRef op)
7290 {
7291 switch (op->get_req()->get_type()) {
7292
7293 case MSG_OSD_PG_CREATE:
7294 handle_pg_create(op);
7295 break;
7296 }
7297 }
7298
7299 void OSD::_dispatch(Message *m)
7300 {
7301 ceph_assert(ceph_mutex_is_locked(osd_lock));
7302 dout(20) << "_dispatch " << m << " " << *m << dendl;
7303
7304 switch (m->get_type()) {
7305 // -- don't need OSDMap --
7306
7307 // map and replication
7308 case CEPH_MSG_OSD_MAP:
7309 handle_osd_map(static_cast<MOSDMap*>(m));
7310 break;
7311 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7312 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7313 break;
7314
7315 // osd
7316 case MSG_OSD_SCRUB:
7317 handle_scrub(static_cast<MOSDScrub*>(m));
7318 break;
7319
7320 case MSG_COMMAND:
7321 handle_command(static_cast<MCommand*>(m));
7322 return;
7323
7324 // -- need OSDMap --
7325
7326 case MSG_OSD_PG_CREATE:
7327 {
7328 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7329 if (m->trace)
7330 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7331 // no map? starting up?
7332 if (!get_osdmap()) {
7333 dout(7) << "no OSDMap, not booted" << dendl;
7334 logger->inc(l_osd_waiting_for_map);
7335 waiting_for_osdmap.push_back(op);
7336 op->mark_delayed("no osdmap");
7337 break;
7338 }
7339
7340 // need OSDMap
7341 dispatch_op(op);
7342 }
7343 }
7344 }
7345
7346 // remove me post-nautilus
7347 void OSD::handle_scrub(MOSDScrub *m)
7348 {
7349 dout(10) << "handle_scrub " << *m << dendl;
7350 if (!require_mon_or_mgr_peer(m)) {
7351 m->put();
7352 return;
7353 }
7354 if (m->fsid != monc->get_fsid()) {
7355 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7356 << dendl;
7357 m->put();
7358 return;
7359 }
7360
7361 vector<spg_t> spgs;
7362 _get_pgids(&spgs);
7363
7364 if (!m->scrub_pgs.empty()) {
7365 vector<spg_t> v;
7366 for (auto pgid : m->scrub_pgs) {
7367 spg_t pcand;
7368 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7369 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7370 v.push_back(pcand);
7371 }
7372 }
7373 spgs.swap(v);
7374 }
7375
7376 for (auto pgid : spgs) {
7377 enqueue_peering_evt(
7378 pgid,
7379 PGPeeringEventRef(
7380 std::make_shared<PGPeeringEvent>(
7381 get_osdmap_epoch(),
7382 get_osdmap_epoch(),
7383 PeeringState::RequestScrub(m->deep, m->repair))));
7384 }
7385
7386 m->put();
7387 }
7388
7389 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7390 {
7391 dout(10) << __func__ << " " << *m << dendl;
7392 if (!require_mon_or_mgr_peer(m)) {
7393 m->put();
7394 return;
7395 }
7396 if (m->fsid != monc->get_fsid()) {
7397 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7398 << dendl;
7399 m->put();
7400 return;
7401 }
7402 for (auto pgid : m->scrub_pgs) {
7403 enqueue_peering_evt(
7404 pgid,
7405 PGPeeringEventRef(
7406 std::make_shared<PGPeeringEvent>(
7407 m->epoch,
7408 m->epoch,
7409 PeeringState::RequestScrub(m->deep, m->repair))));
7410 }
7411 m->put();
7412 }
7413
7414 bool OSD::scrub_random_backoff()
7415 {
7416 bool coin_flip = (rand() / (double)RAND_MAX >=
7417 cct->_conf->osd_scrub_backoff_ratio);
7418 if (!coin_flip) {
7419 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7420 return true;
7421 }
7422 return false;
7423 }
7424
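// For a regular (not operator-requested) scrub, the next attempt is
// pushed out by a randomized interval, roughly:
//   sched_time = now + min_interval * (1 + r * randomize_ratio),  r in [0,1]
// and the deadline is now + max_interval (left unset when
// max_interval is 0).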
7425 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7426 const spg_t& pg, const utime_t& timestamp,
7427 double pool_scrub_min_interval,
7428 double pool_scrub_max_interval, bool must)
7429 : cct(cct),
7430 pgid(pg),
7431 sched_time(timestamp),
7432 deadline(timestamp)
7433 {
7434 // if not explicitly requested, postpone the scrub with a random delay
7435 if (!must) {
7436 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7437 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7438 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7439 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7440
7441 sched_time += scrub_min_interval;
7442 double r = rand() / (double)RAND_MAX;
7443 sched_time +=
7444 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7445 if (scrub_max_interval == 0) {
7446 deadline = utime_t();
7447 } else {
7448 deadline += scrub_max_interval;
7449 }
7450
7451 }
7452 }
7453
7454 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7455 if (sched_time < rhs.sched_time)
7456 return true;
7457 if (sched_time > rhs.sched_time)
7458 return false;
7459 return pgid < rhs.pgid;
7460 }
7461
7462 void OSDService::dumps_scrub(ceph::Formatter *f)
7463 {
7464 ceph_assert(f != nullptr);
7465 std::lock_guard l(sched_scrub_lock);
7466
7467 f->open_array_section("scrubs");
7468 for (const auto &i: sched_scrub_pg) {
7469 f->open_object_section("scrub");
7470 f->dump_stream("pgid") << i.pgid;
7471 f->dump_stream("sched_time") << i.sched_time;
7472 f->dump_stream("deadline") << i.deadline;
7473 f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
7474 f->close_section();
7475 }
7476 f->close_section();
7477 }
7478
7479 double OSD::scrub_sleep_time(bool must_scrub)
7480 {
7481 if (must_scrub) {
7482 return cct->_conf->osd_scrub_sleep;
7483 }
7484 utime_t now = ceph_clock_now();
7485 if (scrub_time_permit(now)) {
7486 return cct->_conf->osd_scrub_sleep;
7487 }
7488 double normal_sleep = cct->_conf->osd_scrub_sleep;
7489 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7490 return std::max(extended_sleep, normal_sleep);
7491 }
7492
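// Check whether the configured scrub window permits scrubbing now.
// Both the week-day and hour ranges are half-open and may wrap
// around; e.g. begin_hour=23 with end_hour=6 allows scrubbing from
// 23:00 through 05:59.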
7493 bool OSD::scrub_time_permit(utime_t now)
7494 {
7495 struct tm bdt;
7496 time_t tt = now.sec();
7497 localtime_r(&tt, &bdt);
7498
7499 bool day_permit = false;
7500 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7501 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7502 day_permit = true;
7503 }
7504 } else {
7505 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7506 day_permit = true;
7507 }
7508 }
7509
7510 if (!day_permit) {
7511 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7512 << " - " << cct->_conf->osd_scrub_end_week_day
7513 << " now " << bdt.tm_wday << " = no" << dendl;
7514 return false;
7515 }
7516
7517 bool time_permit = false;
7518 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7519 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7520 time_permit = true;
7521 }
7522 } else {
7523 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7524 time_permit = true;
7525 }
7526 }
7527 if (time_permit) {
7528 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7529 << " - " << cct->_conf->osd_scrub_end_hour
7530 << " now " << bdt.tm_hour << " = yes" << dendl;
7531 } else {
7532 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7533 << " - " << cct->_conf->osd_scrub_end_hour
7534 << " now " << bdt.tm_hour << " = no" << dendl;
7535 }
7536 return time_permit;
7537 }
7538
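// Permit scrubbing based on system load: either the 1-minute loadavg
// per CPU is below osd_scrub_load_threshold, or the current load is
// below both the daily average and the 15-minute average (i.e. load
// is relatively low and trending down).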
7539 bool OSD::scrub_load_below_threshold()
7540 {
7541 double loadavgs[3];
7542 if (getloadavg(loadavgs, 3) != 3) {
7543 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7544 return false;
7545 }
7546
7547 // allow scrub if below configured threshold
7548 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7549 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7550 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7551 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7552 << " < max " << cct->_conf->osd_scrub_load_threshold
7553 << " = yes" << dendl;
7554 return true;
7555 }
7556
7557 // allow scrub if below daily avg and currently decreasing
7558 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7559 dout(20) << __func__ << " loadavg " << loadavgs[0]
7560 << " < daily_loadavg " << daily_loadavg
7561 << " and < 15m avg " << loadavgs[2]
7562 << " = yes" << dendl;
7563 return true;
7564 }
7565
7566 dout(20) << __func__ << " loadavg " << loadavgs[0]
7567 << " >= max " << cct->_conf->osd_scrub_load_threshold
7568 << " and ( >= daily_loadavg " << daily_loadavg
7569 << " or >= 15m avg " << loadavgs[2]
7570 << ") = no" << dendl;
7571 return false;
7572 }
7573
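// Walk the scrub queue in sched_time order and try to start one
// scrub. We bail out early if we cannot take another scrub slot, stop
// at the first job scheduled in the future, and skip jobs blocked by
// the time window or load unless their deadline has already passed.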
7574 void OSD::sched_scrub()
7575 {
7576 dout(20) << __func__ << " sched_scrub starts" << dendl;
7577
7578 // if not permitted, fail fast
7579 if (!service.can_inc_scrubs()) {
7580 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7581 return;
7582 }
7583 bool allow_requested_repair_only = false;
7584 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7585 if (!cct->_conf->osd_repair_during_recovery) {
7586 dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
7587 return;
7588 }
7589 dout(10) << __func__
7590 << " will only schedule explicitly requested repair due to active recovery"
7591 << dendl;
7592 allow_requested_repair_only = true;
7593 }
7594
7595 utime_t now = ceph_clock_now();
7596 bool time_permit = scrub_time_permit(now);
7597 bool load_is_low = scrub_load_below_threshold();
7598 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7599
7600 OSDService::ScrubJob scrub_job;
7601 if (service.first_scrub_stamp(&scrub_job)) {
7602 do {
7603 dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl;
7604
7605 if (scrub_job.sched_time > now) {
7606 // save ourselves some effort
7607 dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
7608 << " > " << now << dendl;
7609 break;
7610 }
7611
7612 if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
7613 dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
7614 << (!time_permit ? "time not permitted" : "high load") << dendl;
7615 continue;
7616 }
7617
7618 PGRef pg = _lookup_lock_pg(scrub_job.pgid);
7619 if (!pg) {
7620 dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
7621 continue;
7622 }
7623
7624 // This has already started, so go on to the next scrub job
7625 if (pg->is_scrub_active()) {
7626 pg->unlock();
7627 dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
7628 continue;
7629 }
7630 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7631 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7632 pg->unlock();
7633 dout(10) << __func__ << " skip " << scrub_job.pgid
7634 << " because repairing is not explicitly requested on it"
7635 << dendl;
7636 continue;
7637 }
7638
7639 // If it is reserving, let it resolve before going to the next scrub job
7640 if (pg->m_scrubber->is_reserving()) {
7641 pg->unlock();
7642 dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
7643 break;
7644 }
7645 dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
7646 << (pg->get_must_scrub() ? ", explicitly requested" :
7647 (load_is_low ? ", load_is_low" : " deadline < now"))
7648 << dendl;
7649 if (pg->sched_scrub()) {
7650 pg->unlock();
7651 dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
7652 break;
7653 }
7654 pg->unlock();
7655 } while (service.next_scrub_stamp(scrub_job, &scrub_job));
7656 }
7657 dout(20) << "sched_scrub done" << dendl;
7658 }
7659
7660 void OSD::resched_all_scrubs()
7661 {
7662 dout(10) << __func__ << ": start" << dendl;
7663 const vector<spg_t> pgs = [this] {
7664 vector<spg_t> pgs;
7665 OSDService::ScrubJob job;
7666 if (service.first_scrub_stamp(&job)) {
7667 do {
7668 pgs.push_back(job.pgid);
7669 } while (service.next_scrub_stamp(job, &job));
7670 }
7671 return pgs;
7672 }();
7673 for (auto& pgid : pgs) {
7674 dout(20) << __func__ << ": examine " << pgid << dendl;
7675 PGRef pg = _lookup_lock_pg(pgid);
7676 if (!pg)
7677 continue;
7678 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7679 dout(15) << __func__ << ": reschedule " << pgid << dendl;
7680 pg->on_info_history_change();
7681 }
7682 pg->unlock();
7683 }
7684 dout(10) << __func__ << ": done" << dendl;
7685 }
7686
7687 MPGStats* OSD::collect_pg_stats()
7688 {
7689 // This implementation unconditionally sends every is_primary PG's
7690 // stats every time we're called. This has equivalent cost to the
7691 // previous implementation's worst case where all PGs are busy and
7692 // their stats are always enqueued for sending.
7693 std::shared_lock l{map_lock};
7694
7695 osd_stat_t cur_stat = service.get_osd_stat();
7696 cur_stat.os_perf_stat = store->get_cur_stats();
7697
7698 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7699 m->osd_stat = cur_stat;
7700
7701 std::lock_guard lec{min_last_epoch_clean_lock};
7702 min_last_epoch_clean = get_osdmap_epoch();
7703 min_last_epoch_clean_pgs.clear();
7704
7705 std::set<int64_t> pool_set;
7706 vector<PGRef> pgs;
7707 _get_pgs(&pgs);
7708 for (auto& pg : pgs) {
7709 auto pool = pg->pg_id.pgid.pool();
7710 pool_set.emplace((int64_t)pool);
7711 if (!pg->is_primary()) {
7712 continue;
7713 }
7714 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7715 m->pg_stat[pg->pg_id.pgid] = s;
7716 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
7717 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7718 });
7719 }
7720 store_statfs_t st;
7721 bool per_pool_stats = false;
7722 bool per_pool_omap_stats = false;
7723 for (auto p : pool_set) {
7724 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7725 if (r == -ENOTSUP) {
7726 break;
7727 } else {
7728 assert(r >= 0);
7729 m->pool_stat[p] = st;
7730 per_pool_stats = true;
7731 }
7732 }
7733
7734 // indicate whether we are reporting per-pool stats
7735 m->osd_stat.num_osds = 1;
7736 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7737 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7738
7739 return m;
7740 }
7741
7742 vector<DaemonHealthMetric> OSD::get_health_metrics()
7743 {
7744 vector<DaemonHealthMetric> metrics;
7745 {
7746 utime_t oldest_secs;
7747 const utime_t now = ceph_clock_now();
7748 auto too_old = now;
7749 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7750 int slow = 0;
7751 TrackedOpRef oldest_op;
7752 auto count_slow_ops = [&](TrackedOp& op) {
7753 if (op.get_initiated() < too_old) {
7754 stringstream ss;
7755 ss << "slow request " << op.get_desc()
7756 << " initiated "
7757 << op.get_initiated()
7758 << " currently "
7759 << op.state_string();
7760 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7761 clog->warn() << ss.str();
7762 slow++;
7763 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7764 oldest_op = &op;
7765 }
7766 return true;
7767 } else {
7768 return false;
7769 }
7770 };
7771 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7772 if (slow) {
7773 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7774 << oldest_op->get_desc() << dendl;
7775 }
7776 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7777 } else {
7778 // no news is not good news.
7779 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7780 }
7781 }
7782 {
7783 std::lock_guard l(pending_creates_lock);
7784 auto n_primaries = pending_creates_from_mon;
7785 for (const auto& create : pending_creates_from_osd) {
7786 if (create.second) {
7787 n_primaries++;
7788 }
7789 }
7790 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7791 }
7792 return metrics;
7793 }
7794
7795 // =====================================================
7796 // MAP
7797
7798 void OSD::wait_for_new_map(OpRequestRef op)
7799 {
7800 // ask?
7801 if (waiting_for_osdmap.empty()) {
7802 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7803 }
7804
7805 logger->inc(l_osd_waiting_for_map);
7806 waiting_for_osdmap.push_back(op);
7807 op->mark_delayed("wait for new map");
7808 }
7809
7810
7811 /** update_map
7812 * assimilate new OSDMap(s). scan pgs, etc.
7813 */
7814
7815 void OSD::note_down_osd(int peer)
7816 {
7817 ceph_assert(ceph_mutex_is_locked(osd_lock));
7818 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7819
7820 std::lock_guard l{heartbeat_lock};
7821 failure_queue.erase(peer);
7822 failure_pending.erase(peer);
7823 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7824 if (p != heartbeat_peers.end()) {
7825 p->second.clear_mark_down();
7826 heartbeat_peers.erase(p);
7827 }
7828 }
7829
7830 void OSD::note_up_osd(int peer)
7831 {
7832 heartbeat_set_peers_need_update();
7833 }
7834
7835 struct C_OnMapCommit : public Context {
7836 OSD *osd;
7837 epoch_t first, last;
7838 MOSDMap *msg;
7839 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7840 : osd(o), first(f), last(l), msg(m) {}
7841 void finish(int r) override {
7842 osd->_committed_osd_maps(first, last, msg);
7843 msg->put();
7844 }
7845 };
7846
7847 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7848 {
7849 std::lock_guard l(osdmap_subscribe_lock);
7850 if (latest_subscribed_epoch >= epoch && !force_request)
7851 return;
7852
7853 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7854
7855 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7856 force_request) {
7857 monc->renew_subs();
7858 }
7859 }
7860
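// Trim stored osdmaps older than both 'oldest' and the lowest epoch
// still pinned in the map cache, batching the removals into
// transactions of roughly osd_target_transaction_size operations.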
7861 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7862 {
7863 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7864 if (min <= superblock.oldest_map)
7865 return;
7866
7867 int num = 0;
7868 ObjectStore::Transaction t;
7869 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7870 dout(20) << " removing old osdmap epoch " << e << dendl;
7871 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7872 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7873 superblock.oldest_map = e + 1;
7874 num++;
7875 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7876 service.publish_superblock(superblock);
7877 write_superblock(t);
7878 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7879 ceph_assert(tr == 0);
7880 num = 0;
7881 if (!skip_maps) {
7882 // skip_maps leaves us with a range of old maps if we fail to remove all
7883 // of them before moving superblock.oldest_map forward to the first map
7884 // in the incoming MOSDMap msg. so we should continue removing them in
7885 // this case, even though that could mean a huge series of delete
7886 // transactions all at once.
7887 break;
7888 }
7889 }
7890 }
7891 if (num > 0) {
7892 service.publish_superblock(superblock);
7893 write_superblock(t);
7894 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7895 ceph_assert(tr == 0);
7896 }
7897 // we should not remove the cached maps
7898 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7899 }
7900
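// Ingest a batch of maps from an MOSDMap message: throttle if PGs are
// too far behind, validate the sender, persist each new full or
// incremental map (verifying the full-map CRC), record pg_num and
// pool-deletion history, then commit and finish asynchronously in
// _committed_osd_maps() via C_OnMapCommit.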
7901 void OSD::handle_osd_map(MOSDMap *m)
7902 {
7903 // wait for pgs to catch up
7904 {
7905 // we extend the map cache pins to accommodate pgs slow to consume maps
7906 // for some period, until we hit the max_lag_factor bound, at which point
7907 // we block here to stop ingesting more maps than they are able to keep
7908 // up with.
7909 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7910 m_osd_pg_epoch_max_lag_factor;
7911 ceph_assert(max_lag > 0);
7912 epoch_t osd_min = 0;
7913 for (auto shard : shards) {
7914 epoch_t min = shard->get_min_pg_epoch();
7915 if (osd_min == 0 || min < osd_min) {
7916 osd_min = min;
7917 }
7918 }
7919 epoch_t osdmap_epoch = get_osdmap_epoch();
7920 if (osd_min > 0 &&
7921 osdmap_epoch > max_lag &&
7922 osdmap_epoch - max_lag > osd_min) {
7923 epoch_t need = osdmap_epoch - max_lag;
7924 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7925 << " max_lag " << max_lag << ")" << dendl;
7926 for (auto shard : shards) {
7927 epoch_t min = shard->get_min_pg_epoch();
7928 if (need > min) {
7929 dout(10) << __func__ << " waiting for pgs to consume " << need
7930 << " (shard " << shard->shard_id << " min " << min
7931 << ", map cache is " << cct->_conf->osd_map_cache_size
7932 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7933 << ")" << dendl;
7934 unlock_guard unlock{osd_lock};
7935 shard->wait_min_pg_epoch(need);
7936 }
7937 }
7938 }
7939 }
7940
7941 ceph_assert(ceph_mutex_is_locked(osd_lock));
7942 map<epoch_t,OSDMapRef> added_maps;
7943 map<epoch_t,bufferlist> added_maps_bl;
7944 if (m->fsid != monc->get_fsid()) {
7945 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7946 << monc->get_fsid() << dendl;
7947 m->put();
7948 return;
7949 }
7950 if (is_initializing()) {
7951 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7952 m->put();
7953 return;
7954 }
7955
7956 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7957 if (session && !(session->entity_name.is_mon() ||
7958 session->entity_name.is_osd())) {
7959 //not enough perms!
7960 dout(10) << "got osd map from Session " << session
7961 << " which we can't take maps from (not a mon or osd)" << dendl;
7962 m->put();
7963 return;
7964 }
7965
7966 // share with the objecter
7967 if (!is_preboot())
7968 service.objecter->handle_osd_map(m);
7969
7970 epoch_t first = m->get_first();
7971 epoch_t last = m->get_last();
7972 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7973 << superblock.newest_map
7974 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7975 << dendl;
7976
7977 logger->inc(l_osd_map);
7978 logger->inc(l_osd_mape, last - first + 1);
7979 if (first <= superblock.newest_map)
7980 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7981 if (service.max_oldest_map < m->oldest_map) {
7982 service.max_oldest_map = m->oldest_map;
7983 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7984 }
7985
7986 // make sure there is something new, here, before we bother flushing
7987 // the queues and such
7988 if (last <= superblock.newest_map) {
7989 dout(10) << " no new maps here, dropping" << dendl;
7990 m->put();
7991 return;
7992 }
7993
7994 // missing some?
7995 bool skip_maps = false;
7996 if (first > superblock.newest_map + 1) {
7997 dout(10) << "handle_osd_map message skips epochs "
7998 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7999 if (m->oldest_map <= superblock.newest_map + 1) {
8000 osdmap_subscribe(superblock.newest_map + 1, false);
8001 m->put();
8002 return;
8003 }
8004 // always try to get the full range of maps--as many as we can. this
8005 // 1- is good to have
8006 // 2- is at present the only way to ensure that we get a *full* map as
8007 // the first map!
8008 if (m->oldest_map < first) {
8009 osdmap_subscribe(m->oldest_map - 1, true);
8010 m->put();
8011 return;
8012 }
8013 skip_maps = true;
8014 }
8015
8016 ObjectStore::Transaction t;
8017 uint64_t txn_size = 0;
8018
8019 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8020
8021 // store new maps: queue for disk and put in the osdmap cache
8022 epoch_t start = std::max(superblock.newest_map + 1, first);
8023 for (epoch_t e = start; e <= last; e++) {
8024 if (txn_size >= t.get_num_bytes()) {
8025 derr << __func__ << " transaction size overflowed" << dendl;
8026 ceph_assert(txn_size < t.get_num_bytes());
8027 }
8028 txn_size = t.get_num_bytes();
8029 map<epoch_t,bufferlist>::iterator p;
8030 p = m->maps.find(e);
8031 if (p != m->maps.end()) {
8032 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8033 OSDMap *o = new OSDMap;
8034 bufferlist& bl = p->second;
8035
8036 o->decode(bl);
8037
8038 purged_snaps[e] = o->get_new_purged_snaps();
8039
8040 ghobject_t fulloid = get_osdmap_pobject_name(e);
8041 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8042 added_maps[e] = add_map(o);
8043 added_maps_bl[e] = bl;
8044 got_full_map(e);
8045 continue;
8046 }
8047
8048 p = m->incremental_maps.find(e);
8049 if (p != m->incremental_maps.end()) {
8050 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8051 bufferlist& bl = p->second;
8052 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8053 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8054
8055 OSDMap *o = new OSDMap;
8056 if (e > 1) {
8057 bufferlist obl;
8058 bool got = get_map_bl(e - 1, obl);
8059 if (!got) {
8060 auto p = added_maps_bl.find(e - 1);
8061 ceph_assert(p != added_maps_bl.end());
8062 obl = p->second;
8063 }
8064 o->decode(obl);
8065 }
8066
8067 OSDMap::Incremental inc;
8068 auto p = bl.cbegin();
8069 inc.decode(p);
8070
8071 if (o->apply_incremental(inc) < 0) {
8072 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8073 ceph_abort_msg("bad fsid");
8074 }
8075
8076 bufferlist fbl;
8077 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8078
8079 bool injected_failure = false;
8080 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8081 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8082 derr << __func__ << " injecting map crc failure" << dendl;
8083 injected_failure = true;
8084 }
8085
8086 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8087 dout(2) << "got incremental " << e
8088 << " but failed to encode full with correct crc; requesting"
8089 << dendl;
8090 clog->warn() << "failed to encode map e" << e << " with expected crc";
8091 dout(20) << "my encoded map was:\n";
8092 fbl.hexdump(*_dout);
8093 *_dout << dendl;
8094 delete o;
8095 request_full_map(e, last);
8096 last = e - 1;
8097
8098 // don't continue committing if we failed to encode the first inc map
8099 if (last < start) {
8100 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8101 m->put();
8102 return;
8103 }
8104 break;
8105 }
8106 got_full_map(e);
8107 purged_snaps[e] = o->get_new_purged_snaps();
8108
8109 ghobject_t fulloid = get_osdmap_pobject_name(e);
8110 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8111 added_maps[e] = add_map(o);
8112 added_maps_bl[e] = fbl;
8113 continue;
8114 }
8115
8116 ceph_abort_msg("MOSDMap lied about what maps it had?");
8117 }
8118
8119 // even if this map isn't from a mon, we may have satisfied our subscription
8120 monc->sub_got("osdmap", last);
8121
8122 if (!m->maps.empty() && requested_full_first) {
8123 dout(10) << __func__ << " still missing full maps " << requested_full_first
8124 << ".." << requested_full_last << dendl;
8125 rerequest_full_maps();
8126 }
8127
8128 if (superblock.oldest_map) {
8129 // make sure we at least keep pace with incoming maps
8130 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8131 pg_num_history.prune(superblock.oldest_map);
8132 }
8133
8134 if (!superblock.oldest_map || skip_maps)
8135 superblock.oldest_map = first;
8136 superblock.newest_map = last;
8137 superblock.current_epoch = last;
8138
8139 // note in the superblock that we were clean thru the prior epoch
8140 epoch_t boot_epoch = service.get_boot_epoch();
8141 if (boot_epoch && boot_epoch >= superblock.mounted) {
8142 superblock.mounted = boot_epoch;
8143 superblock.clean_thru = last;
8144 }
8145
8146 // check for pg_num changes and deleted pools
8147 OSDMapRef lastmap;
8148 for (auto& i : added_maps) {
8149 if (!lastmap) {
8150 if (!(lastmap = service.try_get_map(i.first - 1))) {
8151 dout(10) << __func__ << " can't get previous map " << i.first - 1
8152 << " probably first start of this osd" << dendl;
8153 continue;
8154 }
8155 }
8156 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8157 for (auto& j : lastmap->get_pools()) {
8158 if (!i.second->have_pg_pool(j.first)) {
8159 pg_num_history.log_pool_delete(i.first, j.first);
8160 dout(10) << __func__ << " recording final pg_pool_t for pool "
8161 << j.first << dendl;
8162 // this information is needed by _make_pg() if we have to restart before
8163 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8164 ghobject_t obj = make_final_pool_info_oid(j.first);
8165 bufferlist bl;
8166 encode(j.second, bl, CEPH_FEATURES_ALL);
8167 string name = lastmap->get_pool_name(j.first);
8168 encode(name, bl);
8169 map<string,string> profile;
8170 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8171 profile = lastmap->get_erasure_code_profile(
8172 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8173 }
8174 encode(profile, bl);
8175 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8176 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8177 new_pg_num != j.second.get_pg_num()) {
8178 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8179 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8180 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8181 }
8182 }
8183 for (auto& j : i.second->get_pools()) {
8184 if (!lastmap->have_pg_pool(j.first)) {
8185 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8186 << j.second.get_pg_num() << dendl;
8187 pg_num_history.log_pg_num_change(i.first, j.first,
8188 j.second.get_pg_num());
8189 }
8190 }
8191 lastmap = i.second;
8192 }
8193 pg_num_history.epoch = last;
8194 {
8195 bufferlist bl;
8196 ::encode(pg_num_history, bl);
8197 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8198 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8199 }
8200
8201 // record new purged_snaps
8202 if (superblock.purged_snaps_last == start - 1) {
8203 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8204 make_purged_snaps_oid(), &t,
8205 purged_snaps);
8206 superblock.purged_snaps_last = last;
8207 } else {
8208 dout(10) << __func__ << " superblock purged_snaps_last is "
8209 << superblock.purged_snaps_last
8210 << ", not recording new purged_snaps" << dendl;
8211 }
8212
8213 // superblock and commit
8214 write_superblock(t);
8215 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8216 store->queue_transaction(
8217 service.meta_ch,
8218 std::move(t));
8219 service.publish_superblock(superblock);
8220 }
8221
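// Called once the new maps are durable. Advance the published osdmap
// epoch by epoch, noting peers that went up or down, then react to
// our own state: transition booting -> active, restart (rebind) if we
// were wrongly marked down, or shut down if we no longer exist in the
// map.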
8222 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8223 {
8224 dout(10) << __func__ << " " << first << ".." << last << dendl;
8225 if (is_stopping()) {
8226 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8227 return;
8228 }
8229 std::lock_guard l(osd_lock);
8230 if (is_stopping()) {
8231 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8232 return;
8233 }
8234 map_lock.lock();
8235
8236 ceph_assert(first <= last);
8237
8238 bool do_shutdown = false;
8239 bool do_restart = false;
8240 bool network_error = false;
8241 OSDMapRef osdmap = get_osdmap();
8242
8243 // advance through the new maps
8244 for (epoch_t cur = first; cur <= last; cur++) {
8245 dout(10) << " advance to epoch " << cur
8246 << " (<= last " << last
8247 << " <= newest_map " << superblock.newest_map
8248 << ")" << dendl;
8249
8250 OSDMapRef newmap = get_map(cur);
8251 ceph_assert(newmap); // we just cached it above!
8252
8253 // start blocklisting messages sent to peers that go down.
8254 service.pre_publish_map(newmap);
8255
8256 // kill connections to newly down osds
8257 bool waited_for_reservations = false;
8258 set<int> old;
8259 osdmap = get_osdmap();
8260 osdmap->get_all_osds(old);
8261 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8262 if (*p != whoami &&
8263 osdmap->is_up(*p) && // in old map
8264 newmap->is_down(*p)) { // but not the new one
8265 if (!waited_for_reservations) {
8266 service.await_reserved_maps();
8267 waited_for_reservations = true;
8268 }
8269 note_down_osd(*p);
8270 } else if (*p != whoami &&
8271 osdmap->is_down(*p) &&
8272 newmap->is_up(*p)) {
8273 note_up_osd(*p);
8274 }
8275 }
8276
8277 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
8278 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8279 << dendl;
8280 if (is_booting()) {
8281 // this captures the case where we sent the boot message while
8282 // NOUP was being set on the mon and our boot request was
8283 // dropped, and then later it is cleared. it imperfectly
8284 // handles the case where our original boot message was not
8285 // dropped and we restart even though we might have booted, but
8286 // that is harmless (boot will just take slightly longer).
8287 do_restart = true;
8288 }
8289 }
8290
8291 osdmap = std::move(newmap);
8292 set_osdmap(osdmap);
8293 epoch_t up_epoch;
8294 epoch_t boot_epoch;
8295 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8296 if (!up_epoch &&
8297 osdmap->is_up(whoami) &&
8298 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
8299 up_epoch = osdmap->get_epoch();
8300 dout(10) << "up_epoch is " << up_epoch << dendl;
8301 if (!boot_epoch) {
8302 boot_epoch = osdmap->get_epoch();
8303 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8304 }
8305 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8306 }
8307 }
8308
8309 epoch_t _bind_epoch = service.get_bind_epoch();
8310 if (osdmap->is_up(whoami) &&
8311 osdmap->get_addrs(whoami).legacy_equals(
8312 client_messenger->get_myaddrs()) &&
8313 _bind_epoch < osdmap->get_up_from(whoami)) {
8314
8315 if (is_booting()) {
8316 dout(1) << "state: booting -> active" << dendl;
8317 set_state(STATE_ACTIVE);
8318 do_restart = false;
8319
8320 // set incarnation so that osd_reqid_t's we generate for our
8321 // objecter requests are unique across restarts.
8322 service.objecter->set_client_incarnation(osdmap->get_epoch());
8323 cancel_pending_failures();
8324 }
8325 }
8326
8327 if (osdmap->get_epoch() > 0 &&
8328 is_active()) {
8329 if (!osdmap->exists(whoami)) {
8330 derr << "map says i do not exist. shutting down." << dendl;
8331 do_shutdown = true; // don't call shutdown() while we have
8332 // everything paused
8333 } else if (osdmap->is_stop(whoami)) {
8334 derr << "map says i am stopped by admin. shutting down." << dendl;
8335 do_shutdown = true;
8336 } else if (!osdmap->is_up(whoami) ||
8337 !osdmap->get_addrs(whoami).legacy_equals(
8338 client_messenger->get_myaddrs()) ||
8339 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8340 cluster_messenger->get_myaddrs()) ||
8341 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8342 hb_back_server_messenger->get_myaddrs()) ||
8343 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8344 hb_front_server_messenger->get_myaddrs())) {
8345 if (!osdmap->is_up(whoami)) {
8346 if (service.is_preparing_to_stop() || service.is_stopping()) {
8347 service.got_stop_ack();
8348 } else {
8349 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8350 "but it is still running";
8351 clog->debug() << "map e" << osdmap->get_epoch()
8352 << " wrongly marked me down at e"
8353 << osdmap->get_down_at(whoami);
8354 }
8355 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8356 // note that this is best-effort...
8357 monc->send_mon_message(
8358 new MOSDMarkMeDead(
8359 monc->get_fsid(),
8360 whoami,
8361 osdmap->get_epoch()));
8362 }
8363 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8364 client_messenger->get_myaddrs())) {
8365 clog->error() << "map e" << osdmap->get_epoch()
8366 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8367 << " != my " << client_messenger->get_myaddrs() << ")";
8368 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8369 cluster_messenger->get_myaddrs())) {
8370 clog->error() << "map e" << osdmap->get_epoch()
8371 << " had wrong cluster addr ("
8372 << osdmap->get_cluster_addrs(whoami)
8373 << " != my " << cluster_messenger->get_myaddrs() << ")";
8374 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8375 hb_back_server_messenger->get_myaddrs())) {
8376 clog->error() << "map e" << osdmap->get_epoch()
8377 << " had wrong heartbeat back addr ("
8378 << osdmap->get_hb_back_addrs(whoami)
8379 << " != my " << hb_back_server_messenger->get_myaddrs()
8380 << ")";
8381 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8382 hb_front_server_messenger->get_myaddrs())) {
8383 clog->error() << "map e" << osdmap->get_epoch()
8384 << " had wrong heartbeat front addr ("
8385 << osdmap->get_hb_front_addrs(whoami)
8386 << " != my " << hb_front_server_messenger->get_myaddrs()
8387 << ")";
8388 }
8389
8390 if (!service.is_stopping()) {
8391 epoch_t up_epoch = 0;
8392 epoch_t bind_epoch = osdmap->get_epoch();
8393 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8394 do_restart = true;
8395
8396 //add markdown log
8397 utime_t now = ceph_clock_now();
8398 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8399 osd_markdown_log.push_back(now);
8400 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8401 derr << __func__ << " marked down "
8402 << osd_markdown_log.size()
8403 << " > osd_max_markdown_count "
8404 << cct->_conf->osd_max_markdown_count
8405 << " in last " << grace << " seconds, shutting down"
8406 << dendl;
8407 do_restart = false;
8408 do_shutdown = true;
8409 }
8410
8411 start_waiting_for_healthy();
8412
8413 set<int> avoid_ports;
8414 #if defined(__FreeBSD__)
8415 // prevent FreeBSD from grabbing the client_messenger port during
8416 // rebinding; otherwise the cluster_messenger may also connect
8417 // to the same port
8418 client_messenger->get_myaddrs().get_ports(&avoid_ports);
8419 #endif
8420 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
8421
8422 int r = cluster_messenger->rebind(avoid_ports);
8423 if (r != 0) {
8424 do_shutdown = true; // FIXME: do_restart?
8425 network_error = true;
8426 derr << __func__ << " marked down:"
8427 << " rebind cluster_messenger failed" << dendl;
8428 }
8429
8430 hb_back_server_messenger->mark_down_all();
8431 hb_front_server_messenger->mark_down_all();
8432 hb_front_client_messenger->mark_down_all();
8433 hb_back_client_messenger->mark_down_all();
8434
8435 reset_heartbeat_peers(true);
8436 }
8437 }
8438 }
8439
8440 map_lock.unlock();
8441
8442 check_osdmap_features();
8443
8444 // yay!
8445 consume_map();
8446
8447 if (is_active() || is_waiting_for_healthy())
8448 maybe_update_heartbeat_peers();
8449
8450 if (is_active()) {
8451 activate_map();
8452 }
8453
8454 if (do_shutdown) {
8455 if (network_error) {
8456 cancel_pending_failures();
8457 }
8458 // trigger shutdown in a different thread
8459 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8460 queue_async_signal(SIGINT);
8461 }
8462 else if (m->newest_map && m->newest_map > last) {
8463 dout(10) << " msg say newest map is " << m->newest_map
8464 << ", requesting more" << dendl;
8465 osdmap_subscribe(osdmap->get_epoch()+1, false);
8466 }
8467 else if (is_preboot()) {
8468 if (m->get_source().is_mon())
8469 _preboot(m->oldest_map, m->newest_map);
8470 else
8471 start_boot();
8472 }
8473 else if (do_restart)
8474 start_boot();
8475
8476 }
8477
8478 void OSD::check_osdmap_features()
8479 {
8480 // adjust required feature bits?
8481
8482 // we have to be a bit careful here, because we are accessing the
8483 // Policy structures without taking any lock. in particular, only
8484 // modify integer values that can safely be read by a racing CPU.
8485 // since we are only accessing existing Policy structures at their
8486 // current memory location, and setting or clearing bits in integer
8487 // fields, and we are the only writer, this is not a problem.
8488
8489 const auto osdmap = get_osdmap();
8490 {
8491 Messenger::Policy p = client_messenger->get_default_policy();
8492 uint64_t mask;
8493 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8494 if ((p.features_required & mask) != features) {
8495 dout(0) << "crush map has features " << features
8496 << ", adjusting msgr requires for clients" << dendl;
8497 p.features_required = (p.features_required & ~mask) | features;
8498 client_messenger->set_default_policy(p);
8499 }
8500 }
8501 {
8502 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8503 uint64_t mask;
8504 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8505 if ((p.features_required & mask) != features) {
8506 dout(0) << "crush map has features " << features
8507 << " was " << p.features_required
8508 << ", adjusting msgr requires for mons" << dendl;
8509 p.features_required = (p.features_required & ~mask) | features;
8510 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8511 }
8512 }
8513 {
8514 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8515 uint64_t mask;
8516 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8517
8518 if ((p.features_required & mask) != features) {
8519 dout(0) << "crush map has features " << features
8520 << ", adjusting msgr requires for osds" << dendl;
8521 p.features_required = (p.features_required & ~mask) | features;
8522 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8523 }
8524
8525 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8526 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8527 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8528 ObjectStore::Transaction t;
8529 write_superblock(t);
8530 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8531 ceph_assert(err == 0);
8532 }
8533 }
8534
8535 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8536 hb_front_server_messenger->set_require_authorizer(false);
8537 hb_back_server_messenger->set_require_authorizer(false);
8538 } else {
8539 hb_front_server_messenger->set_require_authorizer(true);
8540 hb_back_server_messenger->set_require_authorizer(true);
8541 }
8542
8543 if (osdmap->require_osd_release != last_require_osd_release) {
8544 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8545 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8546 store->write_meta("require_osd_release",
8547 stringify((int)osdmap->require_osd_release));
8548 last_require_osd_release = osdmap->require_osd_release;
8549 }
8550 }
8551
8552 struct C_FinishSplits : public Context {
8553 OSD *osd;
8554 set<PGRef> pgs;
8555 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8556 : osd(osd), pgs(in) {}
8557 void finish(int r) override {
8558 osd->_finish_splits(pgs);
8559 }
8560 };
8561
8562 void OSD::_finish_splits(set<PGRef>& pgs)
8563 {
8564 dout(10) << __func__ << " " << pgs << dendl;
8565 if (is_stopping())
8566 return;
8567 for (set<PGRef>::iterator i = pgs.begin();
8568 i != pgs.end();
8569 ++i) {
8570 PG *pg = i->get();
8571
8572 PeeringCtx rctx = create_context();
8573 pg->lock();
8574 dout(10) << __func__ << " " << *pg << dendl;
8575 epoch_t e = pg->get_osdmap_epoch();
8576 pg->handle_initialize(rctx);
8577 pg->queue_null(e, e);
8578 dispatch_context(rctx, pg, service.get_osdmap());
8579 pg->unlock();
8580
8581 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8582 shards[shard_index]->register_and_wake_split_child(pg);
8583 }
8584 };
8585
8586 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8587 unsigned need)
8588 {
8589 std::lock_guard l(merge_lock);
8590 auto& p = merge_waiters[nextmap->get_epoch()][target];
8591 p[src->pg_id] = src;
8592 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8593 << " for " << target << ", have " << p.size() << "/" << need
8594 << dendl;
8595 return p.size() == need;
8596 }
8597
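// Advance a PG through each map epoch up to osd_epoch. Along the way,
// handle pool pg_num changes: a merge source is torn down and parked
// as a merge waiter, a merge target waits for (or consumes) all of
// its sources, and a split queues up the new child PGs. Returns false
// when the PG cannot advance past a pending merge.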
8598 bool OSD::advance_pg(
8599 epoch_t osd_epoch,
8600 PG *pg,
8601 ThreadPool::TPHandle &handle,
8602 PeeringCtx &rctx)
8603 {
8604 if (osd_epoch <= pg->get_osdmap_epoch()) {
8605 return true;
8606 }
8607 ceph_assert(pg->is_locked());
8608 OSDMapRef lastmap = pg->get_osdmap();
8609 set<PGRef> new_pgs; // any split children
8610 bool ret = true;
8611
8612 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8613 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8614 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8615 next_epoch <= osd_epoch;
8616 ++next_epoch) {
8617 OSDMapRef nextmap = service.try_get_map(next_epoch);
8618 if (!nextmap) {
8619 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8620 continue;
8621 }
8622
8623 unsigned new_pg_num =
8624 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8625 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8626 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8627 // check for merge
8628 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8629 spg_t parent;
8630 if (pg->pg_id.is_merge_source(
8631 old_pg_num,
8632 new_pg_num,
8633 &parent)) {
8634 // we are merge source
8635 PGRef spg = pg; // carry a ref
8636 dout(1) << __func__ << " " << pg->pg_id
8637 << " is merge source, target is " << parent
8638 << dendl;
8639 pg->write_if_dirty(rctx);
8640 if (!new_pgs.empty()) {
8641 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8642 new_pgs));
8643 new_pgs.clear();
8644 }
8645 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8646 pg->ch->flush();
8647 // release backoffs explicitly, since the on_shutdown path
8648 // aggressively tears down backoff state.
8649 if (pg->is_primary()) {
8650 pg->release_pg_backoffs();
8651 }
8652 pg->on_shutdown();
8653 OSDShard *sdata = pg->osd_shard;
8654 {
8655 std::lock_guard l(sdata->shard_lock);
8656 if (pg->pg_slot) {
8657 sdata->_detach_pg(pg->pg_slot);
8658 // update pg count now since we might not get an osdmap
8659 // any time soon.
8660 if (pg->is_primary())
8661 logger->dec(l_osd_pg_primary);
8662 else if (pg->is_nonprimary())
8663 logger->dec(l_osd_pg_replica); // misnomer
8664 else
8665 logger->dec(l_osd_pg_stray);
8666 }
8667 }
8668 pg->unlock();
8669
8670 set<spg_t> children;
8671 parent.is_split(new_pg_num, old_pg_num, &children);
8672 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8673 enqueue_peering_evt(
8674 parent,
8675 PGPeeringEventRef(
8676 std::make_shared<PGPeeringEvent>(
8677 nextmap->get_epoch(),
8678 nextmap->get_epoch(),
8679 NullEvt())));
8680 }
8681 ret = false;
8682 goto out;
8683 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8684 // we are merge target
8685 set<spg_t> children;
8686 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8687 dout(20) << __func__ << " " << pg->pg_id
8688 << " is merge target, sources are " << children
8689 << dendl;
8690 map<spg_t,PGRef> sources;
8691 {
8692 std::lock_guard l(merge_lock);
8693 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8694 unsigned need = children.size();
8695 dout(20) << __func__ << " have " << s.size() << "/"
8696 << need << dendl;
8697 if (s.size() == need) {
8698 sources.swap(s);
8699 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8700 if (merge_waiters[nextmap->get_epoch()].empty()) {
8701 merge_waiters.erase(nextmap->get_epoch());
8702 }
8703 }
8704 }
8705 if (!sources.empty()) {
8706 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8707 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8708 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8709 pg->merge_from(
8710 sources, rctx, split_bits,
8711 nextmap->get_pg_pool(
8712 pg->pg_id.pool())->last_pg_merge_meta);
8713 pg->pg_slot->waiting_for_merge_epoch = 0;
8714 } else {
8715 dout(20) << __func__ << " not ready to merge yet" << dendl;
8716 pg->write_if_dirty(rctx);
8717 if (!new_pgs.empty()) {
8718 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8719 new_pgs));
8720 new_pgs.clear();
8721 }
8722 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8723 pg->unlock();
8724 // kick source(s) to get them ready
8725 for (auto& i : children) {
8726 dout(20) << __func__ << " kicking source " << i << dendl;
8727 enqueue_peering_evt(
8728 i,
8729 PGPeeringEventRef(
8730 std::make_shared<PGPeeringEvent>(
8731 nextmap->get_epoch(),
8732 nextmap->get_epoch(),
8733 NullEvt())));
8734 }
8735 ret = false;
8736 goto out;
8737 }
8738 }
8739 }
8740 }
8741
8742 vector<int> newup, newacting;
8743 int up_primary, acting_primary;
8744 nextmap->pg_to_up_acting_osds(
8745 pg->pg_id.pgid,
8746 &newup, &up_primary,
8747 &newacting, &acting_primary);
8748 pg->handle_advance_map(
8749 nextmap, lastmap, newup, up_primary,
8750 newacting, acting_primary, rctx);
8751
8752 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8753 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8754 if (oldpool != lastmap->get_pools().end()
8755 && newpool != nextmap->get_pools().end()) {
8756 dout(20) << __func__
8757 << " new pool opts " << newpool->second.opts
8758 << " old pool opts " << oldpool->second.opts
8759 << dendl;
8760
8761 double old_min_interval = 0, new_min_interval = 0;
8762 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8763 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8764
8765 double old_max_interval = 0, new_max_interval = 0;
8766 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8767 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8768
8769 // Assume that if an interval changes from set to unset, or vice versa, the
8770 // actual config is different. Keep it simple even if it is possible to call
8771 // resched_all_scrubs() unnecessarily.
8772 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8773 pg->on_info_history_change();
8774 }
8775 }
8776
8777 if (new_pg_num && old_pg_num != new_pg_num) {
8778 // check for split
8779 set<spg_t> children;
8780 if (pg->pg_id.is_split(
8781 old_pg_num,
8782 new_pg_num,
8783 &children)) {
8784 split_pgs(
8785 pg, children, &new_pgs, lastmap, nextmap,
8786 rctx);
8787 }
8788 }
8789
8790 lastmap = nextmap;
8791 old_pg_num = new_pg_num;
8792 handle.reset_tp_timeout();
8793 }
8794 pg->handle_activate_map(rctx);
8795
8796 ret = true;
8797 out:
8798 if (!new_pgs.empty()) {
8799 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
8800 }
8801 return ret;
8802 }
8803
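// Publish the new map to the rest of the OSD: prime pending splits
// and merges on each shard, drop pending pg creates that no longer
// map to us, wake sessions waiting on the map, and queue a null
// peering event to every PG so it advances to the new epoch.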
8804 void OSD::consume_map()
8805 {
8806 ceph_assert(ceph_mutex_is_locked(osd_lock));
8807 auto osdmap = get_osdmap();
8808 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8809
8810 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8811 * speak the older sorting version any more. Be careful not to force
8812 * a shutdown if we are merely processing old maps, though.
8813 */
8814 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8815 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8816 ceph_abort();
8817 }
8818
8819 service.pre_publish_map(osdmap);
8820 service.await_reserved_maps();
8821 service.publish_map(osdmap);
8822
8823 // prime splits and merges
8824 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8825 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8826 for (auto& shard : shards) {
8827 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8828 }
8829 if (!newly_split.empty()) {
8830 for (auto& shard : shards) {
8831 shard->prime_splits(osdmap, &newly_split);
8832 }
8833 ceph_assert(newly_split.empty());
8834 }
8835
8836 // prune sent_ready_to_merge
8837 service.prune_sent_ready_to_merge(osdmap);
8838
8839 // FIXME, maybe: We could race against an incoming peering message
8840 // that instantiates a merge PG after identify_merges() below and
8841 // never set up its peer to complete the merge. An OSD restart
8842 // would clear it up. This is a hard race to resolve,
8843 // extraordinarily rare (we only merge PGs that are stable and
8844 // clean, so it'd have to be an imported PG to an OSD with a
8845 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8846 // replace all of this with seastar-based code soon anyway.
8847 if (!merge_pgs.empty()) {
8848 // mark the pgs we already have, or create new and empty merge
8849 // participants for those we are missing. do this all under the
8850 // shard lock so we don't have to worry about racing pg creates
8851 // via _process.
8852 for (auto& shard : shards) {
8853 shard->prime_merges(osdmap, &merge_pgs);
8854 }
8855 ceph_assert(merge_pgs.empty());
8856 }
8857
8858 service.prune_pg_created();
8859
8860 unsigned pushes_to_free = 0;
8861 for (auto& shard : shards) {
8862 shard->consume_map(osdmap, &pushes_to_free);
8863 }
8864
8865 vector<spg_t> pgids;
8866 _get_pgids(&pgids);
8867
8868 // count (FIXME, probably during seastar rewrite)
8869 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8870 vector<PGRef> pgs;
8871 _get_pgs(&pgs);
8872 for (auto& pg : pgs) {
8873 // FIXME (probably during seastar rewrite): this is lockless and
8874 // racy, but we don't want to take pg lock here.
8875 if (pg->is_primary())
8876 num_pg_primary++;
8877 else if (pg->is_nonprimary())
8878 num_pg_replica++; // misnomer
8879 else
8880 num_pg_stray++;
8881 }
8882
8883 {
8884 // FIXME (as part of seastar rewrite): move to OSDShard
8885 std::lock_guard l(pending_creates_lock);
8886 for (auto pg = pending_creates_from_osd.begin();
8887 pg != pending_creates_from_osd.end();) {
8888 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
8889 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8890 << "discarding pending_create_from_osd" << dendl;
8891 pg = pending_creates_from_osd.erase(pg);
8892 } else {
8893 ++pg;
8894 }
8895 }
8896 }
8897
8898 service.maybe_inject_dispatch_delay();
8899
8900 dispatch_sessions_waiting_on_map();
8901
8902 service.maybe_inject_dispatch_delay();
8903
8904 service.release_reserved_pushes(pushes_to_free);
8905
8906 // queue null events to push maps down to individual PGs
8907 for (auto pgid : pgids) {
8908 enqueue_peering_evt(
8909 pgid,
8910 PGPeeringEventRef(
8911 std::make_shared<PGPeeringEvent>(
8912 osdmap->get_epoch(),
8913 osdmap->get_epoch(),
8914 NullEvt())));
8915 }
8916 logger->set(l_osd_pg, pgids.size());
8917 logger->set(l_osd_pg_primary, num_pg_primary);
8918 logger->set(l_osd_pg_replica, num_pg_replica);
8919 logger->set(l_osd_pg_stray, num_pg_stray);
8920 }
8921
8922 void OSD::activate_map()
8923 {
8924 ceph_assert(ceph_mutex_is_locked(osd_lock));
8925 auto osdmap = get_osdmap();
8926
8927 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8928
8929 // norecover?
8930 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8931 if (!service.recovery_is_paused()) {
8932 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8933 service.pause_recovery();
8934 }
8935 } else {
8936 if (service.recovery_is_paused()) {
8937 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8938 service.unpause_recovery();
8939 }
8940 }
8941
8942 service.activate_map();
8943
8944 // process waiters
8945 take_waiters(waiting_for_osdmap);
8946 }
8947
8948 bool OSD::require_mon_peer(const Message *m)
8949 {
8950 if (!m->get_connection()->peer_is_mon()) {
8951 dout(0) << "require_mon_peer received from non-mon "
8952 << m->get_connection()->get_peer_addr()
8953 << " " << *m << dendl;
8954 return false;
8955 }
8956 return true;
8957 }
8958
8959 bool OSD::require_mon_or_mgr_peer(const Message *m)
8960 {
8961 if (!m->get_connection()->peer_is_mon() &&
8962 !m->get_connection()->peer_is_mgr()) {
8963 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8964 << m->get_connection()->get_peer_addr()
8965 << " " << *m << dendl;
8966 return false;
8967 }
8968 return true;
8969 }
8970
8971 bool OSD::require_osd_peer(const Message *m)
8972 {
8973 if (!m->get_connection()->peer_is_osd()) {
8974 dout(0) << "require_osd_peer received from non-osd "
8975 << m->get_connection()->get_peer_addr()
8976 << " " << *m << dendl;
8977 return false;
8978 }
8979 return true;
8980 }
8981
8982 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8983 {
8984 epoch_t up_epoch = service.get_up_epoch();
8985 if (epoch < up_epoch) {
8986 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8987 return false;
8988 }
8989
8990 if (!is_active()) {
8991 dout(7) << "still in boot state, dropping message " << *m << dendl;
8992 return false;
8993 }
8994
8995 return true;
8996 }
8997
8998 bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
8999 bool is_fast_dispatch)
9000 {
9001 int from = m->get_source().num();
9002
9003 if (map->is_down(from) ||
9004 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9005 dout(5) << "from dead osd." << from << ", marking down, "
9006 << " msg was " << m->get_source_inst().addr
9007 << " expected "
9008 << (map->is_up(from) ?
9009 map->get_cluster_addrs(from) : entity_addrvec_t())
9010 << dendl;
9011 ConnectionRef con = m->get_connection();
9012 con->mark_down();
9013 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
9014 if (!is_fast_dispatch)
9015 s->session_dispatch_lock.lock();
9016 clear_session_waiting_on_map(s);
9017 con->set_priv(nullptr); // break ref <-> session cycle, if any
9018 s->con.reset();
9019 if (!is_fast_dispatch)
9020 s->session_dispatch_lock.unlock();
9021 }
9022 return false;
9023 }
9024 return true;
9025 }
9026
9027
9028 /*
9029 * require that we have same (or newer) map, and that
9030 * the source is the pg primary.
9031 */
9032 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9033 bool is_fast_dispatch)
9034 {
9035 const Message *m = op->get_req();
9036 const auto osdmap = get_osdmap();
9037 dout(15) << "require_same_or_newer_map " << epoch
9038 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9039
9040 ceph_assert(ceph_mutex_is_locked(osd_lock));
9041
9042 // do they have a newer map?
9043 if (epoch > osdmap->get_epoch()) {
9044 dout(7) << "waiting for newer map epoch " << epoch
9045 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9046 wait_for_new_map(op);
9047 return false;
9048 }
9049
9050 if (!require_self_aliveness(op->get_req(), epoch)) {
9051 return false;
9052 }
9053
9054 // ok, our map is same or newer... does the peer still exist?
9055 if (m->get_connection()->get_messenger() == cluster_messenger &&
9056 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9057 return false;
9058 }
9059
9060 return true;
9061 }
9062
9063
9064
9065
9066
9067 // ----------------------------------------
9068 // pg creation
9069
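// Illustration (hypothetical pool, not from the original source): growing
// a pool from pg_num 8 to 16 makes pg 1.3 split into child 1.b (seed
// 3 + 8); split_pgs() below creates the child PG and divides the
// parent's objects between the two by the new hash bit.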
9070 void OSD::split_pgs(
9071 PG *parent,
9072 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9073 OSDMapRef curmap,
9074 OSDMapRef nextmap,
9075 PeeringCtx &rctx)
9076 {
9077 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9078 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9079
9080 vector<object_stat_sum_t> updated_stats;
9081 parent->start_split_stats(childpgids, &updated_stats);
9082
9083 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9084 for (set<spg_t>::const_iterator i = childpgids.begin();
9085 i != childpgids.end();
9086 ++i, ++stat_iter) {
9087 ceph_assert(stat_iter != updated_stats.end());
9088 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9089 PG* child = _make_pg(nextmap, *i);
9090 child->lock(true);
9091 out_pgs->insert(child);
9092 child->ch = store->create_new_collection(child->coll);
9093
9094 {
9095 uint32_t shard_index = i->hash_to_shard(shards.size());
9096 ceph_assert(shards[shard_index]);
9097 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9098 }
9099
9100 unsigned split_bits = i->get_split_bits(pg_num);
9101 dout(10) << " pg_num is " << pg_num
9102 << ", m_seed " << i->ps()
9103 << ", split_bits is " << split_bits << dendl;
9104 parent->split_colls(
9105 *i,
9106 split_bits,
9107 i->ps(),
9108 &child->get_pool().info,
9109 rctx.transaction);
9110 parent->split_into(
9111 i->pgid,
9112 child,
9113 split_bits);
9114
9115 child->init_collection_pool_opts();
9116
9117 child->finish_split_stats(*stat_iter, rctx.transaction);
9118 child->unlock();
9119 }
9120 ceph_assert(stat_iter != updated_stats.end());
9121 parent->finish_split_stats(*stat_iter, rctx.transaction);
9122 }
9123
9124 /*
9125 * holding osd_lock
9126 */
9127 void OSD::handle_pg_create(OpRequestRef op)
9128 {
9129 // NOTE: this can be removed in P release (mimic is the last version to
9130 // send MOSDPGCreate messages).
9131
9132 auto m = op->get_req<MOSDPGCreate>();
9133 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9134
9135 dout(10) << "handle_pg_create " << *m << dendl;
9136
9137 if (!require_mon_peer(op->get_req())) {
9138 return;
9139 }
9140
9141 if (!require_same_or_newer_map(op, m->epoch, false))
9142 return;
9143
9144 op->mark_started();
9145
9146 const auto osdmap = get_osdmap();
9147 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9148 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9149 p != m->mkpg.end();
9150 ++p, ++ci) {
9151 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9152 epoch_t created = p->second.created;
9153 if (p->second.split_bits) // Skip split pgs
9154 continue;
9155 pg_t on = p->first;
9156
9157 if (!osdmap->have_pg_pool(on.pool())) {
9158 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9159 continue;
9160 }
9161
9162 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9163
9164 spg_t pgid;
9165 bool mapped = osdmap->get_primary_shard(on, &pgid);
9166 ceph_assert(mapped);
9167
9168 // is it still ours?
9169 vector<int> up, acting;
9170 int up_primary = -1;
9171 int acting_primary = -1;
9172 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9173 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9174
9175 if (acting_primary != whoami) {
9176 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9177 << "), my role=" << role << ", skipping" << dendl;
9178 continue;
9179 }
9180
9181
9182 PastIntervals pi;
9183 pg_history_t history;
9184 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9185
9186 // The mon won't resend unless the primary changed, so we ignore
9187 // same_interval_since. We'll pass this history with the current
9188 // epoch as the event.
9189 if (history.same_primary_since > m->epoch) {
9190 dout(10) << __func__ << ": got obsolete pg create on pgid "
9191 << pgid << " from epoch " << m->epoch
9192 << ", primary changed in " << history.same_primary_since
9193 << dendl;
9194 continue;
9195 }
9196 enqueue_peering_evt(
9197 pgid,
9198 PGPeeringEventRef(
9199 std::make_shared<PGPeeringEvent>(
9200 osdmap->get_epoch(),
9201 osdmap->get_epoch(),
9202 NullEvt(),
9203 true,
9204 new PGCreateInfo(
9205 pgid,
9206 osdmap->get_epoch(),
9207 history,
9208 pi,
9209 true)
9210 )));
9211 }
9212
9213 {
9214 std::lock_guard l(pending_creates_lock);
9215 if (pending_creates_from_mon == 0) {
9216 last_pg_create_epoch = m->epoch;
9217 }
9218 }
9219
9220 maybe_update_heartbeat_peers();
9221 }
9222
9223
9224 // ----------------------------------------
9225 // peering and recovery
9226
9227 PeeringCtx OSD::create_context()
9228 {
9229 return PeeringCtx(get_osdmap()->require_osd_release);
9230 }
9231
9232 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9233 ThreadPool::TPHandle *handle)
9234 {
9235 if (!service.get_osdmap()->is_up(whoami)) {
9236 dout(20) << __func__ << " not up in osdmap" << dendl;
9237 } else if (!is_active()) {
9238 dout(20) << __func__ << " not active" << dendl;
9239 } else {
9240 for (auto& [osd, ls] : ctx.message_map) {
9241 if (!curmap->is_up(osd)) {
9242 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9243 continue;
9244 }
9245 ConnectionRef con = service.get_con_osd_cluster(
9246 osd, curmap->get_epoch());
9247 if (!con) {
9248 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9249 << dendl;
9250 continue;
9251 }
9252 service.maybe_share_map(con.get(), curmap);
9253 for (auto m : ls) {
9254 con->send_message2(m);
9255 }
9256 ls.clear();
9257 }
9258 }
9259 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9260 int tr = store->queue_transaction(
9261 pg->ch,
9262 std::move(ctx.transaction), TrackedOpRef(),
9263 handle);
9264 ceph_assert(tr == 0);
9265 }
9266 }
9267
9268 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9269 {
9270 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9271 if (!require_mon_peer(m)) {
9272 m->put();
9273 return;
9274 }
9275 for (auto& p : m->pgs) {
9276 spg_t pgid = p.first;
9277 epoch_t created = p.second.first;
9278 utime_t created_stamp = p.second.second;
9279 auto q = m->pg_extra.find(pgid);
9280 if (q == m->pg_extra.end()) {
9281 dout(20) << __func__ << " " << pgid << " e" << created
9282 << "@" << created_stamp
9283 << " (no history or past_intervals)" << dendl;
9284 // pre-octopus ... no pg history. this can be removed in Q release.
9285 enqueue_peering_evt(
9286 pgid,
9287 PGPeeringEventRef(
9288 std::make_shared<PGPeeringEvent>(
9289 m->epoch,
9290 m->epoch,
9291 NullEvt(),
9292 true,
9293 new PGCreateInfo(
9294 pgid,
9295 created,
9296 pg_history_t(created, created_stamp),
9297 PastIntervals(),
9298 true)
9299 )));
9300 } else {
9301 dout(20) << __func__ << " " << pgid << " e" << created
9302 << "@" << created_stamp
9303 << " history " << q->second.first
9304 << " pi " << q->second.second << dendl;
9305 if (!q->second.second.empty() &&
9306 m->epoch < q->second.second.get_bounds().second) {
9307 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9308 << " and unmatched past_intervals " << q->second.second
9309 << " (history " << q->second.first << ")";
9310 } else {
9311 enqueue_peering_evt(
9312 pgid,
9313 PGPeeringEventRef(
9314 std::make_shared<PGPeeringEvent>(
9315 m->epoch,
9316 m->epoch,
9317 NullEvt(),
9318 true,
9319 new PGCreateInfo(
9320 pgid,
9321 m->epoch,
9322 q->second.first,
9323 q->second.second,
9324 true)
9325 )));
9326 }
9327 }
9328 }
9329
9330 {
9331 std::lock_guard l(pending_creates_lock);
9332 if (pending_creates_from_mon == 0) {
9333 last_pg_create_epoch = m->epoch;
9334 }
9335 }
9336
9337 m->put();
9338 }
9339
9340 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9341 {
9342 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9343 if (!require_osd_peer(m)) {
9344 m->put();
9345 return;
9346 }
9347 int from = m->get_source().num();
9348 for (auto& p : m->pg_list) {
9349 enqueue_peering_evt(
9350 p.first,
9351 PGPeeringEventRef(
9352 std::make_shared<PGPeeringEvent>(
9353 p.second.epoch_sent, p.second.epoch_sent,
9354 MQuery(
9355 p.first,
9356 pg_shard_t(from, p.second.from),
9357 p.second,
9358 p.second.epoch_sent),
9359 false))
9360 );
9361 }
9362 m->put();
9363 }
9364
9365 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9366 {
9367 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9368 if (!require_osd_peer(m)) {
9369 m->put();
9370 return;
9371 }
9372 int from = m->get_source().num();
9373 for (auto& p : m->get_pg_list()) {
9374 spg_t pgid(p.info.pgid.pgid, p.to);
9375 enqueue_peering_evt(
9376 pgid,
9377 PGPeeringEventRef(
9378 std::make_shared<PGPeeringEvent>(
9379 p.epoch_sent,
9380 p.query_epoch,
9381 MNotifyRec(
9382 pgid, pg_shard_t(from, p.from),
9383 p,
9384 m->get_connection()->get_features()),
9385 true,
9386 new PGCreateInfo(
9387 pgid,
9388 p.query_epoch,
9389 p.info.history,
9390 p.past_intervals,
9391 false)
9392 )));
9393 }
9394 m->put();
9395 }
9396
9397 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9398 {
9399 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9400 if (!require_osd_peer(m)) {
9401 m->put();
9402 return;
9403 }
9404 int from = m->get_source().num();
9405 for (auto& p : m->pg_list) {
9406 enqueue_peering_evt(
9407 spg_t(p.info.pgid.pgid, p.to),
9408 PGPeeringEventRef(
9409 std::make_shared<PGPeeringEvent>(
9410 p.epoch_sent, p.query_epoch,
9411 MInfoRec(
9412 pg_shard_t(from, p.from),
9413 p.info,
9414 p.epoch_sent)))
9415 );
9416 }
9417 m->put();
9418 }
9419
9420 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9421 {
9422 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9423 if (!require_osd_peer(m)) {
9424 m->put();
9425 return;
9426 }
9427 for (auto& pgid : m->pg_list) {
9428 enqueue_peering_evt(
9429 pgid,
9430 PGPeeringEventRef(
9431 std::make_shared<PGPeeringEvent>(
9432 m->get_epoch(), m->get_epoch(),
9433 PeeringState::DeleteStart())));
9434 }
9435 m->put();
9436 }
9437
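// Illustration (example commands, not from the original source): forced
// recovery/backfill typically originates from
//   ceph osd pool force-recovery <pool>
//   ceph osd pool cancel-force-backfill <pool>
// which reach the OSD as MOSDForceRecovery with the OFR_* flags below.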
9438 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9439 {
9440 dout(10) << __func__ << " " << *m << dendl;
9441 if (!require_mon_or_mgr_peer(m)) {
9442 m->put();
9443 return;
9444 }
9445 epoch_t epoch = get_osdmap_epoch();
9446 for (auto pgid : m->forced_pgs) {
9447 if (m->options & OFR_BACKFILL) {
9448 if (m->options & OFR_CANCEL) {
9449 enqueue_peering_evt(
9450 pgid,
9451 PGPeeringEventRef(
9452 std::make_shared<PGPeeringEvent>(
9453 epoch, epoch,
9454 PeeringState::UnsetForceBackfill())));
9455 } else {
9456 enqueue_peering_evt(
9457 pgid,
9458 PGPeeringEventRef(
9459 std::make_shared<PGPeeringEvent>(
9460 epoch, epoch,
9461 PeeringState::SetForceBackfill())));
9462 }
9463 } else if (m->options & OFR_RECOVERY) {
9464 if (m->options & OFR_CANCEL) {
9465 enqueue_peering_evt(
9466 pgid,
9467 PGPeeringEventRef(
9468 std::make_shared<PGPeeringEvent>(
9469 epoch, epoch,
9470 PeeringState::UnsetForceRecovery())));
9471 } else {
9472 enqueue_peering_evt(
9473 pgid,
9474 PGPeeringEventRef(
9475 std::make_shared<PGPeeringEvent>(
9476 epoch, epoch,
9477 PeeringState::SetForceRecovery())));
9478 }
9479 }
9480 }
9481 m->put();
9482 }
9483
9484 void OSD::handle_pg_query_nopg(const MQuery& q)
9485 {
9486 spg_t pgid = q.pgid;
9487 dout(10) << __func__ << " " << pgid << dendl;
9488
9489 OSDMapRef osdmap = get_osdmap();
9490 if (!osdmap->have_pg_pool(pgid.pool()))
9491 return;
9492
9493 dout(10) << " pg " << pgid << " dne" << dendl;
9494 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9495 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9496 if (con) {
9497 Message *m;
9498 if (q.query.type == pg_query_t::LOG ||
9499 q.query.type == pg_query_t::FULLLOG) {
9500 m = new MOSDPGLog(
9501 q.query.from, q.query.to,
9502 osdmap->get_epoch(), empty,
9503 q.query.epoch_sent);
9504 } else {
9505 vector<pg_notify_t> ls;
9506 ls.push_back(
9507 pg_notify_t(
9508 q.query.from, q.query.to,
9509 q.query.epoch_sent,
9510 osdmap->get_epoch(),
9511 empty,
9512 PastIntervals()));
9513 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
9514 }
9515 service.maybe_share_map(con.get(), osdmap);
9516 con->send_message(m);
9517 }
9518 }
9519
9520 void OSDService::queue_check_readable(spg_t spgid,
9521 epoch_t lpr,
9522 ceph::signedspan delay)
9523 {
9524 if (delay == ceph::signedspan::zero()) {
9525 osd->enqueue_peering_evt(
9526 spgid,
9527 PGPeeringEventRef(
9528 std::make_shared<PGPeeringEvent>(
9529 lpr, lpr,
9530 PeeringState::CheckReadable())));
9531 } else {
9532 mono_timer.add_event(
9533 delay,
9534 [this, spgid, lpr]() {
9535 queue_check_readable(spgid, lpr);
9536 });
9537 }
9538 }
9539
9540
9541 // =========================================================
9542 // RECOVERY
9543
9544 void OSDService::_maybe_queue_recovery() {
9545 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9546 uint64_t available_pushes;
9547 while (!awaiting_throttle.empty() &&
9548 _recover_now(&available_pushes)) {
9549 uint64_t to_start = std::min(
9550 available_pushes,
9551 cct->_conf->osd_recovery_max_single_start);
9552 _queue_for_recovery(awaiting_throttle.front(), to_start);
9553 awaiting_throttle.pop_front();
9554 dout(10) << __func__ << " starting " << to_start
9555 << ", recovery_ops_reserved " << recovery_ops_reserved
9556 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9557 recovery_ops_reserved += to_start;
9558 }
9559 }
9560
9561 bool OSDService::_recover_now(uint64_t *available_pushes)
9562 {
9563 if (available_pushes)
9564 *available_pushes = 0;
9565
9566 if (ceph_clock_now() < defer_recovery_until) {
9567 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9568 return false;
9569 }
9570
9571 if (recovery_paused) {
9572 dout(15) << __func__ << " paused" << dendl;
9573 return false;
9574 }
9575
9576 uint64_t max = osd->get_recovery_max_active();
9577 if (max <= recovery_ops_active + recovery_ops_reserved) {
9578 dout(15) << __func__ << " active " << recovery_ops_active
9579 << " + reserved " << recovery_ops_reserved
9580 << " >= max " << max << dendl;
9581 return false;
9582 }
9583
9584 if (available_pushes)
9585 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
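// e.g. (illustrative numbers): max = 3 with one active and one reserved
// op leaves *available_pushes = 1, so one more recovery op may start.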
9586
9587 return true;
9588 }
9589
9590 unsigned OSDService::get_target_pg_log_entries() const
9591 {
9592 auto num_pgs = osd->get_num_pgs();
9593 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9594 if (num_pgs > 0 && target > 0) {
9595 // target an even spread of our budgeted log entries across all
9596 // PGs. note that while we only get to control the entry count
9597 // for primary PGs, we'll normally be responsible for a mix of
9598 // primary and replica PGs (for the same pool(s) even), so this
9599 // will work out.
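// Worked example (hypothetical values): with
// osd_target_pg_log_entries_per_osd = 300000 and 1000 PGs on this OSD,
// the per-PG budget is 300000 / 1000 = 300 entries, which is then
// clamped into [osd_min_pg_log_entries, osd_max_pg_log_entries].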
9600 return std::max<unsigned>(
9601 std::min<unsigned>(target / num_pgs,
9602 cct->_conf->osd_max_pg_log_entries),
9603 cct->_conf->osd_min_pg_log_entries);
9604 } else {
9605 // fall back to a per-pg value.
9606 return cct->_conf->osd_min_pg_log_entries;
9607 }
9608 }
9609
9610 void OSD::do_recovery(
9611 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9612 ThreadPool::TPHandle &handle)
9613 {
9614 uint64_t started = 0;
9615
9616 /*
9617 * When the value of osd_recovery_sleep is set greater than zero, recovery
9618 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9619 * recovery event's schedule time. This is done by adding a
9620 * recovery_requeue_callback event, which re-queues the recovery op using
9621 * queue_recovery_after_sleep.
9622 */
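// For example (hypothetical timing): with osd_recovery_sleep = 0.1, each
// recovery op is re-queued ~100ms after the previous op's schedule time,
// spacing recovery events at least 100ms apart.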
9623 float recovery_sleep = get_osd_recovery_sleep();
9624 {
9625 std::lock_guard l(service.sleep_lock);
9626 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9627 PGRef pgref(pg);
9628 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
9629 dout(20) << "do_recovery wake up at "
9630 << ceph_clock_now()
9631 << ", re-queuing recovery" << dendl;
9632 std::lock_guard l(service.sleep_lock);
9633 service.recovery_needs_sleep = false;
9634 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9635 });
9636
9637 // This is true for the first recovery op, and when the previous recovery
9638 // op was scheduled in the past. In that case the next recovery op is
9639 // scheduled one sleep interval from now.
9640
9641 if (auto now = ceph::real_clock::now();
9642 service.recovery_schedule_time < now) {
9643 service.recovery_schedule_time = now;
9644 }
9645 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
9646 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9647 recovery_requeue_callback);
9648 dout(20) << "Recovery event scheduled at "
9649 << service.recovery_schedule_time << dendl;
9650 return;
9651 }
9652 }
9653
9654 {
9655 {
9656 std::lock_guard l(service.sleep_lock);
9657 service.recovery_needs_sleep = true;
9658 }
9659
9660 if (pg->pg_has_reset_since(queued)) {
9661 goto out;
9662 }
9663
9664 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9665 #ifdef DEBUG_RECOVERY_OIDS
9666 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
9667 #endif
9668
9669 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
9670 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9671 << " on " << *pg << dendl;
9672
9673 if (do_unfound) {
9674 PeeringCtx rctx = create_context();
9675 rctx.handle = &handle;
9676 pg->find_unfound(queued, rctx);
9677 dispatch_context(rctx, pg, pg->get_osdmap());
9678 }
9679 }
9680
9681 out:
9682 ceph_assert(started <= reserved_pushes);
9683 service.release_reserved_pushes(reserved_pushes);
9684 }
9685
9686 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9687 {
9688 std::lock_guard l(recovery_lock);
9689 dout(10) << "start_recovery_op " << *pg << " " << soid
9690 << " (" << recovery_ops_active << "/"
9691 << osd->get_recovery_max_active() << " rops)"
9692 << dendl;
9693 recovery_ops_active++;
9694
9695 #ifdef DEBUG_RECOVERY_OIDS
9696 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9697 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9698 recovery_oids[pg->pg_id].insert(soid);
9699 #endif
9700 }
9701
9702 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9703 {
9704 std::lock_guard l(recovery_lock);
9705 dout(10) << "finish_recovery_op " << *pg << " " << soid
9706 << " dequeue=" << dequeue
9707 << " (" << recovery_ops_active << "/"
9708 << osd->get_recovery_max_active() << " rops)"
9709 << dendl;
9710
9711 // adjust count
9712 ceph_assert(recovery_ops_active > 0);
9713 recovery_ops_active--;
9714
9715 #ifdef DEBUG_RECOVERY_OIDS
9716 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9717 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9718 recovery_oids[pg->pg_id].erase(soid);
9719 #endif
9720
9721 _maybe_queue_recovery();
9722 }
9723
9724 bool OSDService::is_recovery_active()
9725 {
9726 if (cct->_conf->osd_debug_pretend_recovery_active) {
9727 return true;
9728 }
9729 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9730 }
9731
9732 void OSDService::release_reserved_pushes(uint64_t pushes)
9733 {
9734 std::lock_guard l(recovery_lock);
9735 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9736 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9737 << dendl;
9738 ceph_assert(recovery_ops_reserved >= pushes);
9739 recovery_ops_reserved -= pushes;
9740 _maybe_queue_recovery();
9741 }
9742
9743 // =========================================================
9744 // OPS
9745
9746 bool OSD::op_is_discardable(const MOSDOp *op)
9747 {
9748 // drop the client request if the client is no longer connected and
9749 // thus can't receive the reply anyway.
9750 if (!op->get_connection()->is_connected()) {
9751 return true;
9752 }
9753 return false;
9754 }
9755
9756 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9757 {
9758 const utime_t stamp = op->get_req()->get_recv_stamp();
9759 const utime_t latency = ceph_clock_now() - stamp;
9760 const unsigned priority = op->get_req()->get_priority();
9761 const int cost = op->get_req()->get_cost();
9762 const uint64_t owner = op->get_req()->get_source().num();
9763 const int type = op->get_req()->get_type();
9764
9765 dout(15) << "enqueue_op " << op << " prio " << priority
9766 << " type " << type
9767 << " cost " << cost
9768 << " latency " << latency
9769 << " epoch " << epoch
9770 << " " << *(op->get_req()) << dendl;
9771 op->osd_trace.event("enqueue op");
9772 op->osd_trace.keyval("priority", priority);
9773 op->osd_trace.keyval("cost", cost);
9774 #ifdef HAVE_JAEGER
9775 if (op->osd_parent_span) {
9776 auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
9777 enqueue_span->Log({
9778 {"priority", priority},
9779 {"cost", cost},
9780 {"epoch", epoch},
9781 {"owner", owner},
9782 {"type", type}
9783 });
9784 }
9785 #endif
9786 op->mark_queued_for_pg();
9787 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9788 if (type == MSG_OSD_PG_PUSH ||
9789 type == MSG_OSD_PG_PUSH_REPLY) {
9790 op_shardedwq.queue(
9791 OpSchedulerItem(
9792 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9793 cost, priority, stamp, owner, epoch));
9794 } else {
9795 op_shardedwq.queue(
9796 OpSchedulerItem(
9797 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9798 cost, priority, stamp, owner, epoch));
9799 }
9800 }
9801
9802 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9803 {
9804 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9805 op_shardedwq.queue(
9806 OpSchedulerItem(
9807 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9808 10,
9809 cct->_conf->osd_peering_op_priority,
9810 utime_t(),
9811 0,
9812 evt->get_epoch_sent()));
9813 }
9814
9815 /*
9816 * NOTE: dequeue called in worker thread, with pg lock
9817 */
9818 void OSD::dequeue_op(
9819 PGRef pg, OpRequestRef op,
9820 ThreadPool::TPHandle &handle)
9821 {
9822 const Message *m = op->get_req();
9823
9824 FUNCTRACE(cct);
9825 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9826
9827 utime_t now = ceph_clock_now();
9828 op->set_dequeued_time(now);
9829
9830 utime_t latency = now - m->get_recv_stamp();
9831 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9832 << " cost " << m->get_cost()
9833 << " latency " << latency
9834 << " " << *m
9835 << " pg " << *pg << dendl;
9836
9837 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9838
9839 service.maybe_share_map(m->get_connection().get(),
9840 pg->get_osdmap(),
9841 op->sent_epoch);
9842
9843 if (pg->is_deleting())
9844 return;
9845
9846 op->mark_reached_pg();
9847 op->osd_trace.event("dequeue_op");
9848
9849 pg->do_request(op, handle);
9850
9851 // finish
9852 dout(10) << "dequeue_op " << op << " finish" << dendl;
9853 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9854 }
9855
9856
9857 void OSD::dequeue_peering_evt(
9858 OSDShard *sdata,
9859 PG *pg,
9860 PGPeeringEventRef evt,
9861 ThreadPool::TPHandle& handle)
9862 {
9863 PeeringCtx rctx = create_context();
9864 auto curmap = sdata->get_osdmap();
9865 bool need_up_thru = false;
9866 epoch_t same_interval_since = 0;
9867 if (!pg) {
9868 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9869 handle_pg_query_nopg(*q);
9870 } else {
9871 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9872 ceph_abort();
9873 }
9874 } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9875 pg->do_peering_event(evt, rctx);
9876 if (pg->is_deleted()) {
9877 pg->unlock();
9878 return;
9879 }
9880 dispatch_context(rctx, pg, curmap, &handle);
9881 need_up_thru = pg->get_need_up_thru();
9882 same_interval_since = pg->get_same_interval_since();
9883 pg->unlock();
9884 }
9885
9886 if (need_up_thru) {
9887 queue_want_up_thru(same_interval_since);
9888 }
9889
9890 service.send_pg_temp();
9891 }
9892
9893 void OSD::dequeue_delete(
9894 OSDShard *sdata,
9895 PG *pg,
9896 epoch_t e,
9897 ThreadPool::TPHandle& handle)
9898 {
9899 dequeue_peering_evt(
9900 sdata,
9901 pg,
9902 PGPeeringEventRef(
9903 std::make_shared<PGPeeringEvent>(
9904 e, e,
9905 PeeringState::DeleteSome())),
9906 handle);
9907 }
9908
9909
9910
9911 // --------------------------------
9912
9913 const char** OSD::get_tracked_conf_keys() const
9914 {
9915 static const char* KEYS[] = {
9916 "osd_max_backfills",
9917 "osd_min_recovery_priority",
9918 "osd_max_trimming_pgs",
9919 "osd_op_complaint_time",
9920 "osd_op_log_threshold",
9921 "osd_op_history_size",
9922 "osd_op_history_duration",
9923 "osd_op_history_slow_op_size",
9924 "osd_op_history_slow_op_threshold",
9925 "osd_enable_op_tracker",
9926 "osd_map_cache_size",
9927 "osd_pg_epoch_max_lag_factor",
9928 "osd_pg_epoch_persisted_max_stale",
9929 "osd_recovery_sleep",
9930 "osd_recovery_sleep_hdd",
9931 "osd_recovery_sleep_ssd",
9932 "osd_recovery_sleep_hybrid",
9933 "osd_delete_sleep",
9934 "osd_delete_sleep_hdd",
9935 "osd_delete_sleep_ssd",
9936 "osd_delete_sleep_hybrid",
9937 "osd_snap_trim_sleep",
9938 "osd_snap_trim_sleep_hdd",
9939 "osd_snap_trim_sleep_ssd",
9940 "osd_snap_trim_sleep_hybrid"
9941 "osd_scrub_sleep",
9942 "osd_recovery_max_active",
9943 "osd_recovery_max_active_hdd",
9944 "osd_recovery_max_active_ssd",
9945 // clog & admin clog
9946 "clog_to_monitors",
9947 "clog_to_syslog",
9948 "clog_to_syslog_facility",
9949 "clog_to_syslog_level",
9950 "osd_objectstore_fuse",
9951 "clog_to_graylog",
9952 "clog_to_graylog_host",
9953 "clog_to_graylog_port",
9954 "host",
9955 "fsid",
9956 "osd_recovery_delay_start",
9957 "osd_client_message_size_cap",
9958 "osd_client_message_cap",
9959 "osd_heartbeat_min_size",
9960 "osd_heartbeat_interval",
9961 "osd_object_clean_region_max_num_intervals",
9962 "osd_scrub_min_interval",
9963 "osd_scrub_max_interval",
9964 NULL
9965 };
9966 return KEYS;
9967 }
9968
9969 void OSD::handle_conf_change(const ConfigProxy& conf,
9970 const std::set <std::string> &changed)
9971 {
9972 std::lock_guard l{osd_lock};
9973
9974 if (changed.count("osd_max_backfills") ||
9975 changed.count("osd_delete_sleep") ||
9976 changed.count("osd_delete_sleep_hdd") ||
9977 changed.count("osd_delete_sleep_ssd") ||
9978 changed.count("osd_delete_sleep_hybrid") ||
9979 changed.count("osd_snap_trim_sleep") ||
9980 changed.count("osd_snap_trim_sleep_hdd") ||
9981 changed.count("osd_snap_trim_sleep_ssd") ||
9982 changed.count("osd_snap_trim_sleep_hybrid") ||
9983 changed.count("osd_scrub_sleep") ||
9984 changed.count("osd_recovery_sleep") ||
9985 changed.count("osd_recovery_sleep_hdd") ||
9986 changed.count("osd_recovery_sleep_ssd") ||
9987 changed.count("osd_recovery_sleep_hybrid") ||
9988 changed.count("osd_recovery_max_active") ||
9989 changed.count("osd_recovery_max_active_hdd") ||
9990 changed.count("osd_recovery_max_active_ssd")) {
9991 if (!maybe_override_options_for_qos() &&
9992 changed.count("osd_max_backfills")) {
9993 // Scheduler is not "mclock". Fallback to earlier behavior
9994 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9995 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9996 }
9997 }
9998 if (changed.count("osd_min_recovery_priority")) {
9999 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10000 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10001 }
10002 if (changed.count("osd_max_trimming_pgs")) {
10003 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
10004 }
10005 if (changed.count("osd_op_complaint_time") ||
10006 changed.count("osd_op_log_threshold")) {
10007 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
10008 cct->_conf->osd_op_log_threshold);
10009 }
10010 if (changed.count("osd_op_history_size") ||
10011 changed.count("osd_op_history_duration")) {
10012 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
10013 cct->_conf->osd_op_history_duration);
10014 }
10015 if (changed.count("osd_op_history_slow_op_size") ||
10016 changed.count("osd_op_history_slow_op_threshold")) {
10017 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
10018 cct->_conf->osd_op_history_slow_op_threshold);
10019 }
10020 if (changed.count("osd_enable_op_tracker")) {
10021 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10022 }
10023 if (changed.count("osd_map_cache_size")) {
10024 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10025 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10026 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10027 }
10028 if (changed.count("clog_to_monitors") ||
10029 changed.count("clog_to_syslog") ||
10030 changed.count("clog_to_syslog_level") ||
10031 changed.count("clog_to_syslog_facility") ||
10032 changed.count("clog_to_graylog") ||
10033 changed.count("clog_to_graylog_host") ||
10034 changed.count("clog_to_graylog_port") ||
10035 changed.count("host") ||
10036 changed.count("fsid")) {
10037 update_log_config();
10038 }
10039 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10040 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10041 "osd_pg_epoch_max_lag_factor");
10042 }
10043
10044 #ifdef HAVE_LIBFUSE
10045 if (changed.count("osd_objectstore_fuse")) {
10046 if (store) {
10047 enable_disable_fuse(false);
10048 }
10049 }
10050 #endif
10051
10052 if (changed.count("osd_recovery_delay_start")) {
10053 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10054 service.kick_recovery_queue();
10055 }
10056
10057 if (changed.count("osd_client_message_cap")) {
10058 uint64_t newval = cct->_conf->osd_client_message_cap;
10059 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10060 if (pol.throttler_messages && newval > 0) {
10061 pol.throttler_messages->reset_max(newval);
10062 }
10063 }
10064 if (changed.count("osd_client_message_size_cap")) {
10065 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10066 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10067 if (pol.throttler_bytes && newval > 0) {
10068 pol.throttler_bytes->reset_max(newval);
10069 }
10070 }
10071 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10072 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10073 }
10074
10075 if (changed.count("osd_scrub_min_interval") ||
10076 changed.count("osd_scrub_max_interval")) {
10077 resched_all_scrubs();
10078 dout(0) << __func__ << ": scrub interval change" << dendl;
10079 }
10080 check_config();
10081 if (changed.count("osd_asio_thread_count")) {
10082 service.poolctx.stop();
10083 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10084 }
10085 }
10086
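// Illustration (example command, not from the original source): the
// mclock scheduler is selected via the osd_op_queue option, e.g.
//   ceph config set osd osd_op_queue mclock_scheduler
// after which the overrides below hand recovery/backfill throttling
// over to mclock's QoS scheduling.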
10087 bool OSD::maybe_override_options_for_qos()
10088 {
10089 // If the scheduler enabled is mclock, override the recovery, backfill
10090 // and sleep options so that mclock can meet the QoS goals.
10091 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
10092 dout(1) << __func__
10093 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10094
10095 // Set high value for recovery max active
10096 uint32_t rec_max_active = 1000;
10097 cct->_conf.set_val(
10098 "osd_recovery_max_active", std::to_string(rec_max_active));
10099 cct->_conf.set_val(
10100 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10101 cct->_conf.set_val(
10102 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10103
10104 // Set high value for osd_max_backfill
10105 uint32_t max_backfills = 1000;
10106 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10107 service.local_reserver.set_max(max_backfills);
10108 service.remote_reserver.set_max(max_backfills);
10109
10110 // Disable recovery sleep
10111 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10112 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10113 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10114 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10115
10116 // Disable delete sleep
10117 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10118 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10119 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10120 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10121
10122 // Disable snap trim sleep
10123 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10124 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10125 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10126 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10127
10128 // Disable scrub sleep
10129 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10130 return true;
10131 }
10132 return false;
10133 }
10134
10135 void OSD::update_log_config()
10136 {
10137 map<string,string> log_to_monitors;
10138 map<string,string> log_to_syslog;
10139 map<string,string> log_channel;
10140 map<string,string> log_prio;
10141 map<string,string> log_to_graylog;
10142 map<string,string> log_to_graylog_host;
10143 map<string,string> log_to_graylog_port;
10144 uuid_d fsid;
10145 string host;
10146
10147 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10148 log_channel, log_prio, log_to_graylog,
10149 log_to_graylog_host, log_to_graylog_port,
10150 fsid, host) == 0)
10151 clog->update_config(log_to_monitors, log_to_syslog,
10152 log_channel, log_prio, log_to_graylog,
10153 log_to_graylog_host, log_to_graylog_port,
10154 fsid, host);
10155 derr << "log_to_monitors " << log_to_monitors << dendl;
10156 }
10157
10158 void OSD::check_config()
10159 {
10160 // some sanity checks
10161 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10162 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10163 << " is not > osd_pg_epoch_persisted_max_stale ("
10164 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10165 }
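// e.g. (hypothetical values): osd_map_cache_size = 50 with
// osd_pg_epoch_persisted_max_stale = 40 passes the check above, since
// 50 > 40 + 2.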
10166 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10167 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10168 << cct->_conf->osd_object_clean_region_max_num_intervals
10169 << ") is < 0";
10170 }
10171 }
10172
10173 // --------------------------------
10174
10175 void OSD::get_latest_osdmap()
10176 {
10177 dout(10) << __func__ << " -- start" << dendl;
10178
10179 boost::system::error_code ec;
10180 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
10181
10182 dout(10) << __func__ << " -- finish" << dendl;
10183 }
10184
10185 // --------------------------------
10186
10187 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10188 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10189 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10190 dout(10) << "setting " << queries.size() << " queries" << dendl;
10191
10192 std::list<OSDPerfMetricQuery> supported_queries;
10193 for (auto &it : queries) {
10194 auto &query = it.first;
10195 if (!query.key_descriptor.empty()) {
10196 supported_queries.push_back(query);
10197 }
10198 }
10199 if (supported_queries.size() < queries.size()) {
10200 dout(1) << queries.size() - supported_queries.size()
10201 << " unsupported queries" << dendl;
10202 }
10203 {
10204 std::lock_guard locker{m_perf_queries_lock};
10205 m_perf_queries = supported_queries;
10206 m_perf_limits = queries;
10207 }
10208 std::vector<PGRef> pgs;
10209 _get_pgs(&pgs);
10210 for (auto& pg : pgs) {
10211 std::scoped_lock l{*pg};
10212 pg->set_dynamic_perf_stats_queries(supported_queries);
10213 }
10214 }
10215
10216 MetricPayload OSD::get_perf_reports() {
10217 OSDMetricPayload payload;
10218 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10219
10220 std::vector<PGRef> pgs;
10221 _get_pgs(&pgs);
10222 DynamicPerfStats dps;
10223 for (auto& pg : pgs) {
10224 // m_perf_queries can be modified only in set_perf_queries by mgr client
10225 // request, and it is protected by the mgr client's lock, which is held
10226 // when set_perf_queries/get_perf_reports are called, so we need not hold
10227 // m_perf_queries_lock here.
10228 DynamicPerfStats pg_dps(m_perf_queries);
10229 pg->lock();
10230 pg->get_dynamic_perf_stats(&pg_dps);
10231 pg->unlock();
10232 dps.merge(pg_dps);
10233 }
10234 dps.add_to_reports(m_perf_limits, &reports);
10235 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10236
10237 return payload;
10238 }
10239
10240 // =============================================================
10241
10242 #undef dout_context
10243 #define dout_context cct
10244 #undef dout_prefix
10245 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10246
10247 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
10248 {
10249 dout(10) << pg->pg_id << " " << pg << dendl;
10250 slot->pg = pg;
10251 pg->osd_shard = this;
10252 pg->pg_slot = slot;
10253 osd->inc_num_pgs();
10254
10255 slot->epoch = pg->get_osdmap_epoch();
10256 pg_slots_by_epoch.insert(*slot);
10257 }
10258
10259 void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10260 {
10261 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10262 slot->pg->osd_shard = nullptr;
10263 slot->pg->pg_slot = nullptr;
10264 slot->pg = nullptr;
10265 osd->dec_num_pgs();
10266
10267 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10268 slot->epoch = 0;
10269 if (waiting_for_min_pg_epoch) {
10270 min_pg_epoch_cond.notify_all();
10271 }
10272 }
10273
10274 void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10275 {
10276 std::lock_guard l(shard_lock);
10277 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10278 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10279 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10280 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10281 slot->epoch = e;
10282 pg_slots_by_epoch.insert(*slot);
10283 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10284 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10285 if (waiting_for_min_pg_epoch) {
10286 min_pg_epoch_cond.notify_all();
10287 }
10288 }
10289
10290 epoch_t OSDShard::get_min_pg_epoch()
10291 {
10292 std::lock_guard l(shard_lock);
10293 auto p = pg_slots_by_epoch.begin();
10294 if (p == pg_slots_by_epoch.end()) {
10295 return 0;
10296 }
10297 return p->epoch;
10298 }
10299
10300 void OSDShard::wait_min_pg_epoch(epoch_t need)
10301 {
10302 std::unique_lock l{shard_lock};
10303 ++waiting_for_min_pg_epoch;
10304 min_pg_epoch_cond.wait(l, [need, this] {
10305 if (pg_slots_by_epoch.empty()) {
10306 return true;
10307 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10308 return true;
10309 } else {
10310 dout(10) << need << " waiting on "
10311 << pg_slots_by_epoch.begin()->epoch << dendl;
10312 return false;
10313 }
10314 });
10315 --waiting_for_min_pg_epoch;
10316 }
10317
10318 epoch_t OSDShard::get_max_waiting_epoch()
10319 {
10320 std::lock_guard l(shard_lock);
10321 epoch_t r = 0;
10322 for (auto& i : pg_slots) {
10323 if (!i.second->waiting_peering.empty()) {
10324 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10325 }
10326 }
10327 return r;
10328 }
10329
10330 void OSDShard::consume_map(
10331 const OSDMapRef& new_osdmap,
10332 unsigned *pushes_to_free)
10333 {
10334 std::lock_guard l(shard_lock);
10335 OSDMapRef old_osdmap;
10336 {
10337 std::lock_guard l(osdmap_lock);
10338 old_osdmap = std::move(shard_osdmap);
10339 shard_osdmap = new_osdmap;
10340 }
10341 dout(10) << new_osdmap->get_epoch()
10342 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10343 << dendl;
10344 bool queued = false;
10345
10346 // check slots
10347 auto p = pg_slots.begin();
10348 while (p != pg_slots.end()) {
10349 OSDShardPGSlot *slot = p->second.get();
10350 const spg_t& pgid = p->first;
10351 dout(20) << __func__ << " " << pgid << dendl;
10352 if (!slot->waiting_for_split.empty()) {
10353 dout(20) << __func__ << " " << pgid
10354 << " waiting for split " << slot->waiting_for_split << dendl;
10355 ++p;
10356 continue;
10357 }
10358 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10359 dout(20) << __func__ << " " << pgid
10360 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10361 << dendl;
10362 ++p;
10363 continue;
10364 }
10365 if (!slot->waiting_peering.empty()) {
10366 epoch_t first = slot->waiting_peering.begin()->first;
10367 if (first <= new_osdmap->get_epoch()) {
10368 dout(20) << __func__ << " " << pgid
10369 << " pending_peering first epoch " << first
10370 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10371 _wake_pg_slot(pgid, slot);
10372 queued = true;
10373 }
10374 ++p;
10375 continue;
10376 }
10377 if (!slot->waiting.empty()) {
10378 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10379 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10380 << dendl;
10381 ++p;
10382 continue;
10383 }
10384 while (!slot->waiting.empty() &&
10385 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10386 auto& qi = slot->waiting.front();
10387 dout(20) << __func__ << " " << pgid
10388 << " waiting item " << qi
10389 << " epoch " << qi.get_map_epoch()
10390 << " <= " << new_osdmap->get_epoch()
10391 << ", "
10392 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10393 "misdirected")
10394 << ", dropping" << dendl;
10395 *pushes_to_free += qi.get_reserved_pushes();
10396 slot->waiting.pop_front();
10397 }
10398 }
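// (illustrative) an item stamped e900 against a new e905 map is "stale";
// one stamped exactly e905 whose pg no longer maps here is "misdirected".
// Either way it is dropped and its reserved pushes are freed above.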
10399 if (slot->waiting.empty() &&
10400 slot->num_running == 0 &&
10401 slot->waiting_for_split.empty() &&
10402 !slot->pg) {
10403 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10404 p = pg_slots.erase(p);
10405 continue;
10406 }
10407
10408 ++p;
10409 }
10410 if (queued) {
10411 std::lock_guard l{sdata_wait_lock};
10412 sdata_cond.notify_one();
10413 }
10414 }
10415
10416 void OSDShard::_wake_pg_slot(
10417 spg_t pgid,
10418 OSDShardPGSlot *slot)
10419 {
10420 dout(20) << __func__ << " " << pgid
10421 << " to_process " << slot->to_process
10422 << " waiting " << slot->waiting
10423 << " waiting_peering " << slot->waiting_peering << dendl;
10424 for (auto i = slot->to_process.rbegin();
10425 i != slot->to_process.rend();
10426 ++i) {
10427 scheduler->enqueue_front(std::move(*i));
10428 }
10429 slot->to_process.clear();
10430 for (auto i = slot->waiting.rbegin();
10431 i != slot->waiting.rend();
10432 ++i) {
10433 scheduler->enqueue_front(std::move(*i));
10434 }
10435 slot->waiting.clear();
10436 for (auto i = slot->waiting_peering.rbegin();
10437 i != slot->waiting_peering.rend();
10438 ++i) {
10439 // this is overkill; we requeue everything, even if some of these
10440 // items are waiting for maps we don't have yet. FIXME, maybe,
10441 // someday, if we decide this inefficiency matters
10442 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10443 scheduler->enqueue_front(std::move(*j));
10444 }
10445 }
10446 slot->waiting_peering.clear();
10447 ++slot->requeue_seq;
10448 }
10449
10450 void OSDShard::identify_splits_and_merges(
10451 const OSDMapRef& as_of_osdmap,
10452 set<pair<spg_t,epoch_t>> *split_pgs,
10453 set<pair<spg_t,epoch_t>> *merge_pgs)
10454 {
10455 std::lock_guard l(shard_lock);
10456 if (shard_osdmap) {
10457 for (auto& i : pg_slots) {
10458 const spg_t& pgid = i.first;
10459 auto *slot = i.second.get();
10460 if (slot->pg) {
10461 osd->service.identify_splits_and_merges(
10462 shard_osdmap, as_of_osdmap, pgid,
10463 split_pgs, merge_pgs);
10464 } else if (!slot->waiting_for_split.empty()) {
10465 osd->service.identify_splits_and_merges(
10466 shard_osdmap, as_of_osdmap, pgid,
10467 split_pgs, nullptr);
10468 } else {
10469 dout(20) << __func__ << " slot " << pgid
10470 << " has no pg and waiting_for_split " << dendl;
10471 }
10472 }
10473 }
10474 }
10475
10476 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10477 set<pair<spg_t,epoch_t>> *pgids)
10478 {
10479 std::lock_guard l(shard_lock);
10480 _prime_splits(pgids);
10481 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10482 set<pair<spg_t,epoch_t>> newer_children;
10483 for (auto i : *pgids) {
10484 osd->service.identify_splits_and_merges(
10485 as_of_osdmap, shard_osdmap, i.first,
10486 &newer_children, nullptr);
10487 }
10488 newer_children.insert(pgids->begin(), pgids->end());
10489 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10490 << shard_osdmap->get_epoch() << ", new children " << newer_children
10491 << dendl;
10492 _prime_splits(&newer_children);
10493 // note: we don't care what is left over here for other shards.
10494 // if this shard's map is ahead of the caller's, e.g., one thread is
10495 // calling into prime_splits via _process (due to a newly created
10496 // pg) while this shard has a newer map due to a racing consume_map,
10497 // then any grandchildren left here will be identified (or were
10498 // identified) when the slower shard's osdmap is advanced.
10499 // _prime_splits() will tolerate the case where the pgid is
10500 // already primed.
10501 }
10502 }
10503
10504 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10505 {
10506 dout(10) << *pgids << dendl;
10507 auto p = pgids->begin();
10508 while (p != pgids->end()) {
10509 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10510 if (shard_index == shard_id) {
10511 auto r = pg_slots.emplace(p->first, nullptr);
10512 if (r.second) {
10513 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10514 r.first->second = make_unique<OSDShardPGSlot>();
10515 r.first->second->waiting_for_split.insert(p->second);
10516 } else {
10517 auto q = r.first;
10518 ceph_assert(q != pg_slots.end());
10519 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10520 << dendl;
10521 q->second->waiting_for_split.insert(p->second);
10522 }
10523 p = pgids->erase(p);
10524 } else {
10525 ++p;
10526 }
10527 }
10528 }
10529
10530 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10531 set<pair<spg_t,epoch_t>> *merge_pgs)
10532 {
10533 std::lock_guard l(shard_lock);
10534 dout(20) << __func__ << " checking shard " << shard_id
10535 << " for remaining merge pgs " << merge_pgs << dendl;
10536 auto p = merge_pgs->begin();
10537 while (p != merge_pgs->end()) {
10538 spg_t pgid = p->first;
10539 epoch_t epoch = p->second;
10540 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10541 if (shard_index != shard_id) {
10542 ++p;
10543 continue;
10544 }
10545 OSDShardPGSlot *slot;
10546 auto r = pg_slots.emplace(pgid, nullptr);
10547 if (r.second) {
10548 r.first->second = make_unique<OSDShardPGSlot>();
10549 }
10550 slot = r.first->second.get();
10551 if (slot->pg) {
10552 // already have pg
10553 dout(20) << __func__ << " have merge participant pg " << pgid
10554 << " " << slot->pg << dendl;
10555 } else if (!slot->waiting_for_split.empty() &&
10556 *slot->waiting_for_split.begin() < epoch) {
10557 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10558 << " " << slot->waiting_for_split << dendl;
10559 } else {
10560 dout(20) << __func__ << " creating empty merge participant " << pgid
10561 << " for merge in " << epoch << dendl;
10562 // leave history zeroed; PG::merge_from() will fill it in.
10563 pg_history_t history;
10564 PGCreateInfo cinfo(pgid, epoch - 1,
10565 history, PastIntervals(), false);
10566 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10567 _attach_pg(r.first->second.get(), pg.get());
10568 _wake_pg_slot(pgid, slot);
10569 pg->unlock();
10570 }
10571 // mark slot for merge
10572 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10573 slot->waiting_for_merge_epoch = epoch;
10574 p = merge_pgs->erase(p);
10575 }
10576 }
10577
10578 void OSDShard::register_and_wake_split_child(PG *pg)
10579 {
10580 epoch_t epoch;
10581 {
10582 std::lock_guard l(shard_lock);
10583 dout(10) << pg->pg_id << " " << pg << dendl;
10584 auto p = pg_slots.find(pg->pg_id);
10585 ceph_assert(p != pg_slots.end());
10586 auto *slot = p->second.get();
10587 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10588 << dendl;
10589 ceph_assert(!slot->pg);
10590 ceph_assert(!slot->waiting_for_split.empty());
10591 _attach_pg(slot, pg);
10592
10593 epoch = pg->get_osdmap_epoch();
10594 ceph_assert(slot->waiting_for_split.count(epoch));
10595 slot->waiting_for_split.erase(epoch);
10596 if (slot->waiting_for_split.empty()) {
10597 _wake_pg_slot(pg->pg_id, slot);
10598 } else {
10599 dout(10) << __func__ << " still waiting for split on "
10600 << slot->waiting_for_split << dendl;
10601 }
10602 }
10603
10604 // kick child to ensure it pulls up to the latest osdmap
10605 osd->enqueue_peering_evt(
10606 pg->pg_id,
10607 PGPeeringEventRef(
10608 std::make_shared<PGPeeringEvent>(
10609 epoch,
10610 epoch,
10611 NullEvt())));
10612
10613 std::lock_guard l{sdata_wait_lock};
10614 sdata_cond.notify_one();
10615 }
10616
10617 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10618 {
10619 std::lock_guard l(shard_lock);
10620 vector<spg_t> to_delete;
10621 for (auto& i : pg_slots) {
10622 if (i.first != parent &&
10623 i.first.get_ancestor(old_pg_num) == parent) {
10624 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10625 << dendl;
10626 _wake_pg_slot(i.first, i.second.get());
10627 to_delete.push_back(i.first);
10628 }
10629 }
10630 for (auto pgid : to_delete) {
10631 pg_slots.erase(pgid);
10632 }
10633 }
10634
10635 OSDShard::OSDShard(
10636 int id,
10637 CephContext *cct,
10638 OSD *osd)
10639 : shard_id(id),
10640 cct(cct),
10641 osd(osd),
10642 shard_name(string("OSDShard.") + stringify(id)),
10643 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10644 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10645 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10646 shard_lock_name(shard_name + "::shard_lock"),
10647 shard_lock{make_mutex(shard_lock_name)},
10648 scheduler(ceph::osd::scheduler::make_scheduler(
10649 cct, osd->num_shards, osd->store->is_rotational())),
10650 context_queue(sdata_wait_lock, sdata_cond)
10651 {
10652 dout(0) << "using op scheduler " << *scheduler << dendl;
10653 }
10654
10655
10656 // =============================================================
10657
10658 #undef dout_context
10659 #define dout_context osd->cct
10660 #undef dout_prefix
10661 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10662
10663 void OSD::ShardedOpWQ::_add_slot_waiter(
10664 spg_t pgid,
10665 OSDShardPGSlot *slot,
10666 OpSchedulerItem&& qi)
10667 {
10668 if (qi.is_peering()) {
10669 dout(20) << __func__ << " " << pgid
10670 << " peering, item epoch is "
10671 << qi.get_map_epoch()
10672 << ", will wait on " << qi << dendl;
10673 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10674 } else {
10675 dout(20) << __func__ << " " << pgid
10676 << " item epoch is "
10677 << qi.get_map_epoch()
10678 << ", will wait on " << qi << dendl;
10679 slot->waiting.push_back(std::move(qi));
10680 }
10681 }
10682
10683 #undef dout_prefix
10684 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10685
10686 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10687 {
10688 uint32_t shard_index = thread_index % osd->num_shards;
10689 auto& sdata = osd->shards[shard_index];
10690 ceph_assert(sdata);
10691
10692 // If every thread in every shard ran oncommit callbacks, they could
10693 // complete out of order. So we designate the thread with the smallest
10694 // thread_index (thread_index < num_shards) in each shard to run the
10695 // oncommit callbacks.
10696 bool is_smallest_thread_index = thread_index < osd->num_shards;
10697
10698 // check for work; if both the scheduler and context queue are empty, wait
10699 sdata->shard_lock.lock();
10700 if (sdata->scheduler->empty() &&
10701 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10702 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10703 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10704 // we raced with a context_queue addition, don't wait
10705 wait_lock.unlock();
10706 } else if (!sdata->stop_waiting) {
10707 dout(20) << __func__ << " empty q, waiting" << dendl;
10708 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10709 sdata->shard_lock.unlock();
10710 sdata->sdata_cond.wait(wait_lock);
10711 wait_lock.unlock();
10712 sdata->shard_lock.lock();
10713 if (sdata->scheduler->empty() &&
10714 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10715 sdata->shard_lock.unlock();
10716 return;
10717 }
10718 // found a work item; reapply default wq timeouts
10719 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10720 timeout_interval, suicide_interval);
10721 } else {
10722 dout(20) << __func__ << " need to return immediately" << dendl;
10723 wait_lock.unlock();
10724 sdata->shard_lock.unlock();
10725 return;
10726 }
10727 }
10728
10729 list<Context *> oncommits;
10730 if (is_smallest_thread_index) {
10731 sdata->context_queue.move_to(oncommits);
10732 }
10733
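// Loop until the scheduler yields an actual OpSchedulerItem; it may
// instead return a time at which the next item becomes ready.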
10734 WorkItem work_item;
10735 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10736 if (sdata->scheduler->empty()) {
10737 if (osd->is_stopping()) {
10738 sdata->shard_lock.unlock();
10739 for (auto c : oncommits) {
10740 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10741 delete c;
10742 }
10743 return; // OSD shutdown, discard.
10744 }
10745 sdata->shard_lock.unlock();
10746 handle_oncommits(oncommits);
10747 return;
10748 }
10749
10750 work_item = sdata->scheduler->dequeue();
10751 if (osd->is_stopping()) {
10752 sdata->shard_lock.unlock();
10753 for (auto c : oncommits) {
10754 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10755 delete c;
10756 }
10757 return; // OSD shutdown, discard.
10758 }
10759
10760 // If the work item is scheduled in the future, wait until
10761 // the time returned in the dequeue response before retrying.
10762 if (auto when_ready = std::get_if<double>(&work_item)) {
10763 if (is_smallest_thread_index) {
10764 sdata->shard_lock.unlock();
10765 handle_oncommits(oncommits);
10766 return;
10767 }
10768 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10769 auto future_time = ceph::real_clock::from_double(*when_ready);
10770 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
10771 sdata->shard_lock.unlock();
10772 ++sdata->waiting_threads;
10773 sdata->sdata_cond.wait_until(wait_lock, future_time);
10774 --sdata->waiting_threads;
10775 wait_lock.unlock();
10776 sdata->shard_lock.lock();
10777 }
10778 } // while
10779
10780 // Access the stored item
10781 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10782 if (osd->is_stopping()) {
10783 sdata->shard_lock.unlock();
10784 for (auto c : oncommits) {
10785 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10786 delete c;
10787 }
10788 return; // OSD shutdown, discard.
10789 }
10790
10791 const auto token = item.get_ordering_token();
10792 auto r = sdata->pg_slots.emplace(token, nullptr);
10793 if (r.second) {
10794 r.first->second = make_unique<OSDShardPGSlot>();
10795 }
10796 OSDShardPGSlot *slot = r.first->second.get();
10797 dout(20) << __func__ << " " << token
10798 << (r.second ? " (new)" : "")
10799 << " to_process " << slot->to_process
10800 << " waiting " << slot->waiting
10801 << " waiting_peering " << slot->waiting_peering
10802 << dendl;
10803 slot->to_process.push_back(std::move(item));
10804 dout(20) << __func__ << " " << slot->to_process.back()
10805 << " queued" << dendl;
10806
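// Resolve the slot's pg and take its lock. Because shard_lock must be
// dropped while acquiring the pg lock, we revalidate the slot afterwards
// and jump back here if it was re-attached in the meantime.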
10807 retry_pg:
10808 PGRef pg = slot->pg;
10809
10810 // lock pg (if we have it)
10811 if (pg) {
10812 // note the requeue seq now...
10813 uint64_t requeue_seq = slot->requeue_seq;
10814 ++slot->num_running;
10815
10816 sdata->shard_lock.unlock();
10817 osd->service.maybe_inject_dispatch_delay();
10818 pg->lock();
10819 osd->service.maybe_inject_dispatch_delay();
10820 sdata->shard_lock.lock();
10821
10822 auto q = sdata->pg_slots.find(token);
10823 if (q == sdata->pg_slots.end()) {
10824 // this can happen if we race with pg removal.
10825 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10826 pg->unlock();
10827 sdata->shard_lock.unlock();
10828 handle_oncommits(oncommits);
10829 return;
10830 }
10831 slot = q->second.get();
10832 --slot->num_running;
10833
10834 if (slot->to_process.empty()) {
10835 // raced with _wake_pg_slot or consume_map
10836 dout(20) << __func__ << " " << token
10837 << " nothing queued" << dendl;
10838 pg->unlock();
10839 sdata->shard_lock.unlock();
10840 handle_oncommits(oncommits);
10841 return;
10842 }
10843 if (requeue_seq != slot->requeue_seq) {
10844 dout(20) << __func__ << " " << token
10845 << " requeue_seq " << slot->requeue_seq << " > our "
10846 << requeue_seq << ", we raced with _wake_pg_slot"
10847 << dendl;
10848 pg->unlock();
10849 sdata->shard_lock.unlock();
10850 handle_oncommits(oncommits);
10851 return;
10852 }
10853 if (slot->pg != pg) {
10854 // this can happen if we race with pg removal.
10855 dout(20) << __func__ << " slot " << token << " no longer attached to "
10856 << pg << dendl;
10857 pg->unlock();
10858 goto retry_pg;
10859 }
10860 }
10861
10862 dout(20) << __func__ << " " << token
10863 << " to_process " << slot->to_process
10864 << " waiting " << slot->waiting
10865 << " waiting_peering " << slot->waiting_peering << dendl;
10866
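// the TPHandle lets the running item periodically reset its heartbeat
// timeout so long-running work is not flagged as hung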
10867 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10868 suicide_interval);
10869
10870 // take next item
10871 auto qi = std::move(slot->to_process.front());
10872 slot->to_process.pop_front();
10873 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10874 set<pair<spg_t,epoch_t>> new_children;
10875 OSDMapRef osdmap;
10876
10877 while (!pg) {
10878 // should this pg shard exist on this osd in this (or a later) epoch?
10879 osdmap = sdata->shard_osdmap;
10880 const PGCreateInfo *create_info = qi.creates_pg();
10881 if (!slot->waiting_for_split.empty()) {
10882 dout(20) << __func__ << " " << token
10883 << " splitting " << slot->waiting_for_split << dendl;
10884 _add_slot_waiter(token, slot, std::move(qi));
10885 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10886 dout(20) << __func__ << " " << token
10887 << " map " << qi.get_map_epoch() << " > "
10888 << osdmap->get_epoch() << dendl;
10889 _add_slot_waiter(token, slot, std::move(qi));
10890 } else if (qi.is_peering()) {
10891 if (!qi.peering_requires_pg()) {
10892 // for pg-less events, we run them under the ordering lock, since
10893 // we don't have the pg lock to keep them ordered.
10894 qi.run(osd, sdata, pg, tp_handle);
10895 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10896 if (create_info) {
10897 if (create_info->by_mon &&
10898 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10899 dout(20) << __func__ << " " << token
10900 << " no pg, no longer primary, ignoring mon create on "
10901 << qi << dendl;
10902 } else {
10903 dout(20) << __func__ << " " << token
10904 << " no pg, should create on " << qi << dendl;
10905 pg = osd->handle_pg_create_info(osdmap, create_info);
10906 if (pg) {
10907 // we created the pg! drop out and continue "normally"!
10908 sdata->_attach_pg(slot, pg.get());
10909 sdata->_wake_pg_slot(token, slot);
10910
10911 // identify split children between create epoch and shard epoch.
10912 osd->service.identify_splits_and_merges(
10913 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10914 sdata->_prime_splits(&new_children);
10915 // distribute remaining split children to other shards below!
10916 break;
10917 }
10918 dout(20) << __func__ << " ignored create on " << qi << dendl;
10919 }
10920 } else {
10921 dout(20) << __func__ << " " << token
10922 << " no pg, peering, !create, discarding " << qi << dendl;
10923 }
10924 } else {
10925 dout(20) << __func__ << " " << token
10926 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10927 << ", discarding " << qi
10928 << dendl;
10929 }
10930 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10931 dout(20) << __func__ << " " << token
10932 << " no pg, should exist e" << osdmap->get_epoch()
10933 << ", will wait on " << qi << dendl;
10934 _add_slot_waiter(token, slot, std::move(qi));
10935 } else {
10936 dout(20) << __func__ << " " << token
10937 << " no pg, shouldn't exist e" << osdmap->get_epoch()
10938 << ", dropping " << qi << dendl;
10939 // share map with client?
10940 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10941 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
10942 sdata->shard_osdmap,
10943 (*_op)->sent_epoch);
10944 }
10945 unsigned pushes_to_free = qi.get_reserved_pushes();
10946 if (pushes_to_free > 0) {
10947 sdata->shard_lock.unlock();
10948 osd->service.release_reserved_pushes(pushes_to_free);
10949 handle_oncommits(oncommits);
10950 return;
10951 }
10952 }
10953 sdata->shard_lock.unlock();
10954 handle_oncommits(oncommits);
10955 return;
10956 }
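// we now hold both shard_lock and the pg lock; a peering item whose
// epoch is ahead of this shard's map must still wait for the map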
10957 if (qi.is_peering()) {
10958 OSDMapRef osdmap = sdata->shard_osdmap;
10959 if (qi.get_map_epoch() > osdmap->get_epoch()) {
10960 _add_slot_waiter(token, slot, std::move(qi));
10961 sdata->shard_lock.unlock();
10962 pg->unlock();
10963 handle_oncommits(oncommits);
10964 return;
10965 }
10966 }
10967 sdata->shard_lock.unlock();
10968
10969 if (!new_children.empty()) {
10970 for (auto shard : osd->shards) {
10971 shard->prime_splits(osdmap, &new_children);
10972 }
10973 ceph_assert(new_children.empty());
10974 }
10975
10976 // osd_opwq_process marks the point at which an operation has been dequeued
10977 // and will begin to be handled by a worker thread.
10978 {
10979 #ifdef WITH_LTTNG
10980 osd_reqid_t reqid;
10981 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10982 reqid = (*_op)->get_reqid();
10983 }
10984 #endif
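// tracepoint() is assumed to compile to a no-op when LTTNG tracing is
// disabled, so reqid only needs to be populated under WITH_LTTNG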
10985 tracepoint(osd, opwq_process_start, reqid.name._type,
10986 reqid.name._num, reqid.tid, reqid.inc);
10987 }
10988
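// at debug level 30, dump the full queue state before running the item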
10989 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10990 Formatter *f = Formatter::create("json");
10991 f->open_object_section("q");
10992 dump(f);
10993 f->close_section();
10994 f->flush(*_dout);
10995 delete f;
10996 *_dout << dendl;
10997
10998 qi.run(osd, sdata, pg, tp_handle);
10999
11000 {
11001 #ifdef WITH_LTTNG
11002 osd_reqid_t reqid;
11003 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11004 reqid = (*_op)->get_reqid();
11005 }
11006 #endif
11007 tracepoint(osd, opwq_process_finish, reqid.name._type,
11008 reqid.name._num, reqid.tid, reqid.inc);
11009 }
11010
11011 handle_oncommits(oncommits);
11012 }
11013
11014 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11015 uint32_t shard_index =
11016 item.get_ordering_token().hash_to_shard(osd->shards.size());
11017
11018 dout(20) << __func__ << " " << item << dendl;
11019
11020 OSDShard* sdata = osd->shards[shard_index];
11021 ceph_assert(sdata);
11022
11023 bool empty = true;
11024 {
11025 std::lock_guard l{sdata->shard_lock};
11026 empty = sdata->scheduler->empty();
11027 sdata->scheduler->enqueue(std::move(item));
11028 }
11029
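// If the queue was empty, every worker may be asleep in the empty-queue
// wait, so wake them all; otherwise a single waiter (one blocked on a
// future-scheduled item) is enough.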
11030 {
11031 std::lock_guard l{sdata->sdata_wait_lock};
11032 if (empty) {
11033 sdata->sdata_cond.notify_all();
11034 } else if (sdata->waiting_threads) {
11035 sdata->sdata_cond.notify_one();
11036 }
11037 }
11038 }
11039
11040 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
11041 {
11042 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11043 auto& sdata = osd->shards[shard_index];
11044 ceph_assert(sdata);
11045 sdata->shard_lock.lock();
11046 auto p = sdata->pg_slots.find(item.get_ordering_token());
11047 if (p != sdata->pg_slots.end() &&
11048 !p->second->to_process.empty()) {
11049 // we may be racing with _process, which has dequeued a new item
11050 // from scheduler, put it on to_process, and is now busy taking the
11051 // pg lock. ensure this old requeued item is ordered before any
11052 // such newer item in to_process.
11053 p->second->to_process.push_front(std::move(item));
11054 item = std::move(p->second->to_process.back());
11055 p->second->to_process.pop_back();
11056 dout(20) << __func__
11057 << " " << p->second->to_process.front()
11058 << " shuffled w/ " << item << dendl;
11059 } else {
11060 dout(20) << __func__ << " " << item << dendl;
11061 }
11062 sdata->scheduler->enqueue_front(std::move(item));
11063 sdata->shard_lock.unlock();
11064 std::lock_guard l{sdata->sdata_wait_lock};
11065 sdata->sdata_cond.notify_one();
11066 }
11067
11068 namespace ceph::osd_cmds {
11069
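// Handler for the OSD's heap-profiler admin command: forwards the
// subcommand (and optional value) to tcmalloc's heap profiler.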
11070 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11071 std::ostream& os)
11072 {
11073 if (!ceph_using_tcmalloc()) {
11074 os << "could not issue heap profiler command -- not using tcmalloc!";
11075 return -EOPNOTSUPP;
11076 }
11077
11078 string cmd;
11079 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11080 os << "unable to get value for command \"heapcmd\"";
11081 return -EINVAL;
11082 }
11083
11084 std::vector<std::string> cmd_vec;
11085 get_str_vec(cmd, cmd_vec);
11086
11087 string val;
11088 if (cmd_getval(cmdmap, "value", val)) {
11089 cmd_vec.push_back(val);
11090 }
11091
11092 ceph_heap_profiler_handle_command(cmd_vec, os);
11093
11094 return 0;
11095 }
11096
11097 } // namespace ceph::osd_cmds