]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_time.h"
52 #include "common/version.h"
53 #include "common/pick_address.h"
54 #include "common/blkdev.h"
55 #include "common/numa.h"
56
57 #include "os/ObjectStore.h"
58 #ifdef HAVE_LIBFUSE
59 #include "os/FuseStore.h"
60 #endif
61
62 #include "PrimaryLogPG.h"
63
64 #include "msg/Messenger.h"
65 #include "msg/Message.h"
66
67 #include "mon/MonClient.h"
68
69 #include "messages/MLog.h"
70
71 #include "messages/MGenericMessage.h"
72 #include "messages/MOSDPing.h"
73 #include "messages/MOSDFailure.h"
74 #include "messages/MOSDMarkMeDown.h"
75 #include "messages/MOSDFull.h"
76 #include "messages/MOSDOp.h"
77 #include "messages/MOSDOpReply.h"
78 #include "messages/MOSDBackoff.h"
79 #include "messages/MOSDBeacon.h"
80 #include "messages/MOSDRepOp.h"
81 #include "messages/MOSDRepOpReply.h"
82 #include "messages/MOSDBoot.h"
83 #include "messages/MOSDPGTemp.h"
84 #include "messages/MOSDPGReadyToMerge.h"
85
86 #include "messages/MOSDMap.h"
87 #include "messages/MMonGetOSDMap.h"
88 #include "messages/MOSDPGNotify.h"
89 #include "messages/MOSDPGQuery.h"
90 #include "messages/MOSDPGLog.h"
91 #include "messages/MOSDPGRemove.h"
92 #include "messages/MOSDPGInfo.h"
93 #include "messages/MOSDPGCreate.h"
94 #include "messages/MOSDPGCreate2.h"
95 #include "messages/MOSDPGTrim.h"
96 #include "messages/MOSDPGScan.h"
97 #include "messages/MBackfillReserve.h"
98 #include "messages/MRecoveryReserve.h"
99 #include "messages/MOSDForceRecovery.h"
100 #include "messages/MOSDECSubOpWrite.h"
101 #include "messages/MOSDECSubOpWriteReply.h"
102 #include "messages/MOSDECSubOpRead.h"
103 #include "messages/MOSDECSubOpReadReply.h"
104 #include "messages/MOSDPGCreated.h"
105 #include "messages/MOSDPGUpdateLogMissing.h"
106 #include "messages/MOSDPGUpdateLogMissingReply.h"
107
108 #include "messages/MOSDPeeringOp.h"
109
110 #include "messages/MOSDAlive.h"
111
112 #include "messages/MOSDScrub.h"
113 #include "messages/MOSDScrub2.h"
114 #include "messages/MOSDRepScrub.h"
115
116 #include "messages/MMonCommand.h"
117 #include "messages/MCommand.h"
118 #include "messages/MCommandReply.h"
119
120 #include "messages/MPGStats.h"
121 #include "messages/MPGStatsAck.h"
122
123 #include "messages/MWatchNotify.h"
124 #include "messages/MOSDPGPush.h"
125 #include "messages/MOSDPGPushReply.h"
126 #include "messages/MOSDPGPull.h"
127
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
135
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
138
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
142
143 #include "osd/OpRequest.h"
144
145 #include "auth/AuthAuthorizeHandler.h"
146 #include "auth/RotatingKeyRing.h"
147
148 #include "objclass/objclass.h"
149
150 #include "common/cmdparse.h"
151 #include "include/str_list.h"
152 #include "include/util.h"
153
154 #include "include/ceph_assert.h"
155 #include "common/config.h"
156 #include "common/EventTrace.h"
157
158 #include "json_spirit/json_spirit_reader.h"
159 #include "json_spirit/json_spirit_writer.h"
160
161 #ifdef WITH_LTTNG
162 #define TRACEPOINT_DEFINE
163 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #include "tracing/osd.h"
165 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166 #undef TRACEPOINT_DEFINE
167 #else
168 #define tracepoint(...)
169 #endif
170
171 #define dout_context cct
172 #define dout_subsys ceph_subsys_osd
173 #undef dout_prefix
174 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
175
176
177 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
178 return *_dout << "osd." << whoami << " " << epoch << " ";
179 }
180
181 //Initial features in new superblock.
182 //Features here are also automatically upgraded
183 CompatSet OSD::get_osd_initial_compat_set() {
184 CompatSet::FeatureSet ceph_osd_feature_compat;
185 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
186 CompatSet::FeatureSet ceph_osd_feature_incompat;
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
202 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
203 ceph_osd_feature_incompat);
204 }
205
206 //Features are added here that this OSD supports.
207 CompatSet OSD::get_osd_compat_set() {
208 CompatSet compat = get_osd_initial_compat_set();
209 //Any features here can be set in code, but not in initial superblock
210 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
211 return compat;
212 }
213
// OSDService: per-OSD shared services object.  Everything here is
// initialized from the owning OSD; member order follows the declaration
// order in the header and must not be rearranged.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // tracked config values (auto-refresh on config change)
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"),
  scrubs_local(0),
  scrubs_remote(0),
  // cache-tiering agent state; the agent thread is created in init()
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promotion throttle (see promote_throttle_recalibrate())
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  // reservers for backfill/recovery/snap-trim slots
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches (full maps, full-map bufferlists, incremental bufferlists)
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  // fullness state (see check_full_status())
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // Create the configured number of objecter finisher threads; they are
  // started later in OSDService::init() and torn down in shutdown().
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
292
293 OSDService::~OSDService()
294 {
295 delete objecter;
296
297 for (auto f : objecter_finishers) {
298 delete f;
299 f = NULL;
300 }
301 }
302
303
304
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking (compiled in with PG_DEBUG_REFS):
// pgid_tracker counts outstanding references per spg_t and live_pgs
// remembers the PG object so its holders can be dumped.

// Record one more reference to pg; remember the PG on first reference.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference; forget the PG entirely when the count hits zero.
// Asserts that a reference actually exists.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Dump every tracked pgid with its refcount, then ask each live PG to
// dump who holds its references.  Diagnostic aid for leaked PG refs.
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
336
337
338
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map and report every split child (and, when merge_pgs is non-null,
// every merge participant) this OSD must know about, each tagged with the
// epoch at which the pg_num change takes effect.  Newly discovered PGs
// are re-queued and re-scanned so chains of splits/merges across several
// pg_num changes are all found.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // Pool must exist in the old map; otherwise there is no history to walk.
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // BFS over PGs: start from pgid; any parent/child/sibling discovered
  // below is pushed and scanned with the same epoch walk.
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change (epoch -> new pg_num) in (old, new] order.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge).  note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in this merge; record the target parent and
	    // all of its sources.
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge: it is the target; record it and its
	  // sources.
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
445
// Forward heartbeat-peer recalculation requests to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

// First phase of shutdown: stop the timers that can schedule new work
// (agent restarts, recovery sleeps, recovery requests).  Each timer is
// shut down under its own lock, matching how events are queued.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

// Drain and stop the finisher shared by the backfill/recovery reservers.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
474
// Final shutdown: stop the watch timer, the objecter and its finishers,
// then drop our references to the published and next osdmaps.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  // Finishers must drain before stopping so queued completions run.
  for (auto f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // Publish an empty map ref and clear next_osdmap so no new map epochs
  // are handed out.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
491
// Start the service threads created by the constructor: the reserver
// finisher, the objecter finishers, the watch/agent timers and the
// tiering agent thread.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();

  agent_thread.create("osd_srv_agent");

  // Optionally hold off recovery at startup (config: osd_recovery_delay_start).
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
511
// Late initialization: start the objecter once the initial osdmap is known.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
516
517 void OSDService::activate_map()
518 {
519 // wake/unwake the tiering agent
520 agent_lock.Lock();
521 agent_active =
522 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
523 osd->is_active();
524 agent_cond.Signal();
525 agent_lock.Unlock();
526 }
527
// Ask the monitor for osdmaps starting at epoch e (non-forced subscribe).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
532
// Timer callback used by agent_entry(): after the configured delay,
// ask the PG to re-evaluate its agent mode.  Holds a PGRef so the PG
// stays alive until the callback fires.
class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
541
// Main loop of the cache-tiering agent thread.  Repeatedly picks the
// highest-priority tier in agent_queue, selects a PG from it, and asks
// the PG to do flush/evict work.  agent_lock is held except around the
// (potentially slow) call into PG::agent_work().
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    // Sleep until a PG is queued.
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // Work on the highest evict-effort level (rbegin = largest key).
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Op budget: full limit normally; the lower "low ops" limit when no
    // PG is in high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // Round-robin across the PGs in the top tier; the iterator is
    // invalidated (agent_valid_iterator cleared) when the set changes.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does real work.
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      // (osd_agent_delay_time).
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
599
// Stop the tiering agent thread.  Expects all agent ops to be cancelled
// and all PGs to be dequeued already; aborts otherwise.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  // Wait for agent_entry() to observe the flag and exit.
  agent_thread.join();
}
619
620 // -------------------------------------
621
// Periodically recompute promote_probability_millis, the per-mille
// probability that a read triggers a cache-tier promotion, so that the
// achieved promotion rate tracks the configured object/sec and bytes/sec
// targets.  Also sets hard per-interval caps (promote_max_objects/bytes).
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: probability (per mille) implied by the object-rate and
    // byte-rate targets respectively.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    // No data (or zero interval): no throttling, promote everything.
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed probability, clamped to
  // [min_prob, 1000].
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
692
693 // -------------------------------------
694
695 float OSDService::get_failsafe_full_ratio()
696 {
697 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
698 if (full_ratio > 1.0) full_ratio /= 100.0;
699 return full_ratio;
700 }
701
// Classify the OSD's fullness (NONE..FAILSAFE) from the logical usage
// ratio and the physical usage pratio, using the thresholds published in
// the current osdmap.  `inject` is set to a marker string when the state
// comes from the failure-injection knobs rather than real usage.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // Enforce nearfull <= backfillfull <= full <= failsafe ordering.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Note: FAILSAFE and NEARFULL are judged on the *physical* ratio,
  // FULL and BACKFILLFULL on the (possibly adjusted) logical ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
748
// Update the cached fullness state (cur_state/cur_ratio/physical_ratio)
// from fresh usage ratios, logging a cluster-level error when the
// failsafe threshold is crossed in either direction.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn on state transitions; failsafe transitions go to the cluster log
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
780
781 bool OSDService::need_fullness_update()
782 {
783 OSDMapRef osdmap = get_osdmap();
784 s_names cur = NONE;
785 if (osdmap->exists(whoami)) {
786 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
787 cur = FULL;
788 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
789 cur = BACKFILLFULL;
790 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
791 cur = NEARFULL;
792 }
793 }
794 s_names want = NONE;
795 if (is_full())
796 want = FULL;
797 else if (is_backfillfull())
798 want = BACKFILLFULL;
799 else if (is_nearfull())
800 want = NEARFULL;
801 return want != cur;
802 }
803
// Test hook: report an artificial fullness of at least `type` while the
// injection knobs (set_injectfull) are armed.  Caller must hold
// full_status_lock.  Note this const method decrements injectfull —
// presumably the member is declared mutable in the header (not visible
// here) — so a positive count acts as "return full N more times".
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
818
// Return true when the OSD's cached fullness state is at least `type`
// (or an injection is active).  Logs the current usage when it is.
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
		       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}
832
// Like _check_full(), but evaluates a *hypothetical* state: would the
// OSD be at least `type` full if adjust_used extra bytes were consumed
// on top of adjusted_stat?  Used to vet prospective backfill targets.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  // Recompute the ratios with the extra usage folded in, then classify.
  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
854
// --- Fullness query helpers -------------------------------------------
// Thin wrappers over _check_full()/_tentative_full() for each severity,
// plus lock-protected accessors for the cached state and the injection
// knob setter.

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

// Would we be backfillfull after adding adjust_used bytes to stats?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

// Note: is_failsafe_full() tests equality, the rest test "at least".
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

// Arm the fullness injection used by _check_inject_full():
// report `type` for the next `count` checks (or always, if count == -1).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
910
// Record fresh store statfs results and OS alerts into osd_stat and the
// perf counters.  When fake_statfs_for_testing is set, total/available
// are synthesized from the configured fake total minus the bytes the
// PGs report, so multiple test OSDs can share one partition.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
953
954 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
955 int num_pgs)
956 {
957 utime_t now = ceph_clock_now();
958 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
959 std::lock_guard l(stat_lock);
960 osd_stat.hb_peers.swap(hb_peers);
961 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
962 osd_stat.num_pgs = num_pgs;
963 // Clean entries that aren't updated
964 // This is called often enough that we can just remove 1 at a time
965 for (auto i: osd_stat.hb_pingtime) {
966 if (i.second.last_update == 0)
967 continue;
968 if (stale_time && now.sec() - i.second.last_update > stale_time) {
969 dout(20) << __func__ << " time out heartbeat for osd " << i.first
970 << " last_update " << i.second.last_update << dendl;
971 osd_stat.hb_pingtime.erase(i.first);
972 break;
973 }
974 }
975 return osd_stat;
976 }
977
978 void OSDService::inc_osd_stat_repaired()
979 {
980 std::lock_guard l(stat_lock);
981 osd_stat.num_shards_repaired++;
982 return;
983 }
984
// Compute the logical usage ratio of new_stat after (a) pretending an
// extra adjust_used bytes are consumed and (b) letting each PG fold its
// pending backfill data into the stats.  *pratio receives the raw
// physical ratio before any adjustment.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  // Physical ratio: actual used / total, untouched by adjustments.
  *pratio =
   ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  // Shrink "available" by the hypothetical extra usage (clamped at 0),
  // which raises get_used() correspondingly.
  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
1012
1013 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
1014 {
1015 OSDMapRef osdmap = get_osdmap();
1016 for (auto shard : missing_on) {
1017 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
1018 return true;
1019 }
1020 return false;
1021 }
1022
1023 void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1024 {
1025 OSDMapRef next_map = get_nextmap_reserved();
1026 // service map is always newer/newest
1027 ceph_assert(from_epoch <= next_map->get_epoch());
1028
1029 if (next_map->is_down(peer) ||
1030 next_map->get_info(peer).up_from > from_epoch) {
1031 m->put();
1032 release_map(next_map);
1033 return;
1034 }
1035 ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
1036 next_map->get_cluster_addrs(peer));
1037 share_map_peer(peer, peer_con.get(), next_map);
1038 peer_con->send_message(m);
1039 release_map(next_map);
1040 }
1041
1042 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1043 {
1044 OSDMapRef next_map = get_nextmap_reserved();
1045 // service map is always newer/newest
1046 ceph_assert(from_epoch <= next_map->get_epoch());
1047
1048 if (next_map->is_down(peer) ||
1049 next_map->get_info(peer).up_from > from_epoch) {
1050 release_map(next_map);
1051 return NULL;
1052 }
1053 ConnectionRef con = osd->cluster_messenger->connect_to_osd(
1054 next_map->get_cluster_addrs(peer));
1055 release_map(next_map);
1056 return con;
1057 }
1058
1059 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1060 {
1061 OSDMapRef next_map = get_nextmap_reserved();
1062 // service map is always newer/newest
1063 ceph_assert(from_epoch <= next_map->get_epoch());
1064
1065 pair<ConnectionRef,ConnectionRef> ret;
1066 if (next_map->is_down(peer) ||
1067 next_map->get_info(peer).up_from > from_epoch) {
1068 release_map(next_map);
1069 return ret;
1070 }
1071 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1072 next_map->get_hb_back_addrs(peer));
1073 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1074 next_map->get_hb_front_addrs(peer));
1075 release_map(next_map);
1076 return ret;
1077 }
1078
1079 entity_name_t OSDService::get_cluster_msgr_name() const
1080 {
1081 return cluster_messenger->get_myname();
1082 }
1083
1084 void OSDService::queue_want_pg_temp(pg_t pgid,
1085 const vector<int>& want,
1086 bool forced)
1087 {
1088 std::lock_guard l(pg_temp_lock);
1089 auto p = pg_temp_pending.find(pgid);
1090 if (p == pg_temp_pending.end() ||
1091 p->second.acting != want ||
1092 forced) {
1093 pg_temp_wanted[pgid] = {want, forced};
1094 }
1095 }
1096
1097 void OSDService::remove_want_pg_temp(pg_t pgid)
1098 {
1099 std::lock_guard l(pg_temp_lock);
1100 pg_temp_wanted.erase(pgid);
1101 pg_temp_pending.erase(pgid);
1102 }
1103
1104 void OSDService::_sent_pg_temp()
1105 {
1106 #ifdef HAVE_STDLIB_MAP_SPLICING
1107 pg_temp_pending.merge(pg_temp_wanted);
1108 #else
1109 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1110 make_move_iterator(end(pg_temp_wanted)));
1111 #endif
1112 pg_temp_wanted.clear();
1113 }
1114
1115 void OSDService::requeue_pg_temp()
1116 {
1117 std::lock_guard l(pg_temp_lock);
1118 // wanted overrides pending. note that remove_want_pg_temp
1119 // clears the item out of both.
1120 unsigned old_wanted = pg_temp_wanted.size();
1121 unsigned old_pending = pg_temp_pending.size();
1122 _sent_pg_temp();
1123 pg_temp_wanted.swap(pg_temp_pending);
1124 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1125 << pg_temp_wanted.size() << dendl;
1126 }
1127
1128 std::ostream& operator<<(std::ostream& out,
1129 const OSDService::pg_temp_t& pg_temp)
1130 {
1131 out << pg_temp.acting;
1132 if (pg_temp.forced) {
1133 out << " (forced)";
1134 }
1135 return out;
1136 }
1137
1138 void OSDService::send_pg_temp()
1139 {
1140 std::lock_guard l(pg_temp_lock);
1141 if (pg_temp_wanted.empty())
1142 return;
1143 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1144 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1145 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1146 auto& m = ms[pg_temp.forced];
1147 if (!m) {
1148 m = new MOSDPGTemp(osdmap->get_epoch());
1149 m->forced = pg_temp.forced;
1150 }
1151 m->pg_temp.emplace(pgid, pg_temp.acting);
1152 }
1153 for (auto m : ms) {
1154 if (m) {
1155 monc->send_mon_message(m);
1156 }
1157 }
1158 _sent_pg_temp();
1159 }
1160
1161 void OSDService::send_pg_created(pg_t pgid)
1162 {
1163 std::lock_guard l(pg_created_lock);
1164 dout(20) << __func__ << dendl;
1165 auto o = get_osdmap();
1166 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1167 pg_created.insert(pgid);
1168 monc->send_mon_message(new MOSDPGCreated(pgid));
1169 }
1170 }
1171
1172 void OSDService::send_pg_created()
1173 {
1174 std::lock_guard l(pg_created_lock);
1175 dout(20) << __func__ << dendl;
1176 auto o = get_osdmap();
1177 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1178 for (auto pgid : pg_created) {
1179 monc->send_mon_message(new MOSDPGCreated(pgid));
1180 }
1181 }
1182 }
1183
1184 void OSDService::prune_pg_created()
1185 {
1186 std::lock_guard l(pg_created_lock);
1187 dout(20) << __func__ << dendl;
1188 auto o = get_osdmap();
1189 auto i = pg_created.begin();
1190 while (i != pg_created.end()) {
1191 auto p = o->get_pg_pool(i->pool());
1192 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1193 dout(20) << __func__ << " pruning " << *i << dendl;
1194 i = pg_created.erase(i);
1195 } else {
1196 dout(20) << __func__ << " keeping " << *i << dendl;
1197 ++i;
1198 }
1199 }
1200 }
1201
1202
1203 // --------------------------------------
1204 // dispatch
1205
1206 epoch_t OSDService::get_peer_epoch(int peer)
1207 {
1208 std::lock_guard l(peer_map_epoch_lock);
1209 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1210 if (p == peer_map_epoch.end())
1211 return 0;
1212 return p->second;
1213 }
1214
1215 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1216 {
1217 std::lock_guard l(peer_map_epoch_lock);
1218 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1219 if (p != peer_map_epoch.end()) {
1220 if (p->second < e) {
1221 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1222 p->second = e;
1223 } else {
1224 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1225 }
1226 return p->second;
1227 } else {
1228 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1229 peer_map_epoch[peer] = e;
1230 return e;
1231 }
1232 }
1233
1234 void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1235 {
1236 std::lock_guard l(peer_map_epoch_lock);
1237 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1238 if (p != peer_map_epoch.end()) {
1239 if (p->second <= as_of) {
1240 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1241 << " had " << p->second << dendl;
1242 peer_map_epoch.erase(p);
1243 } else {
1244 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1245 << " has " << p->second << " - not forgetting" << dendl;
1246 }
1247 }
1248 }
1249
1250 bool OSDService::should_share_map(entity_name_t name, Connection *con,
1251 epoch_t epoch, const OSDMapRef& osdmap,
1252 const epoch_t *sent_epoch_p)
1253 {
1254 dout(20) << "should_share_map "
1255 << name << " " << con->get_peer_addr()
1256 << " " << epoch << dendl;
1257
1258 // does client have old map?
1259 if (name.is_client()) {
1260 bool message_sendmap = epoch < osdmap->get_epoch();
1261 if (message_sendmap && sent_epoch_p) {
1262 dout(20) << "client session last_sent_epoch: "
1263 << *sent_epoch_p
1264 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1265 if (*sent_epoch_p < osdmap->get_epoch()) {
1266 return true;
1267 } // else we don't need to send it out again
1268 }
1269 }
1270
1271 if (con->get_messenger() == osd->cluster_messenger &&
1272 con != osd->cluster_messenger->get_loopback_connection() &&
1273 osdmap->is_up(name.num()) &&
1274 (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
1275 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
1276 // remember
1277 epoch_t has = std::max(get_peer_epoch(name.num()), epoch);
1278
1279 // share?
1280 if (has < osdmap->get_epoch()) {
1281 dout(10) << name << " " << con->get_peer_addr()
1282 << " has old map " << epoch << " < "
1283 << osdmap->get_epoch() << dendl;
1284 return true;
1285 }
1286 }
1287
1288 return false;
1289 }
1290
1291 void OSDService::share_map(
1292 entity_name_t name,
1293 Connection *con,
1294 epoch_t epoch,
1295 OSDMapRef& osdmap,
1296 epoch_t *sent_epoch_p)
1297 {
1298 dout(20) << "share_map "
1299 << name << " " << con->get_peer_addr()
1300 << " " << epoch << dendl;
1301
1302 if (!osd->is_active()) {
1303 /*It is safe not to proceed as OSD is not in healthy state*/
1304 return;
1305 }
1306
1307 bool want_shared = should_share_map(name, con, epoch,
1308 osdmap, sent_epoch_p);
1309
1310 if (want_shared){
1311 if (name.is_client()) {
1312 dout(10) << name << " has old map " << epoch
1313 << " < " << osdmap->get_epoch() << dendl;
1314 // we know the Session is valid or we wouldn't be sending
1315 if (sent_epoch_p) {
1316 *sent_epoch_p = osdmap->get_epoch();
1317 }
1318 send_incremental_map(epoch, con, osdmap);
1319 } else if (con->get_messenger() == osd->cluster_messenger &&
1320 osdmap->is_up(name.num()) &&
1321 (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
1322 osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
1323 dout(10) << name << " " << con->get_peer_addrs()
1324 << " has old map " << epoch << " < "
1325 << osdmap->get_epoch() << dendl;
1326 note_peer_epoch(name.num(), osdmap->get_epoch());
1327 send_incremental_map(epoch, con, osdmap);
1328 }
1329 }
1330 }
1331
1332 void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1333 {
1334 if (!map)
1335 map = get_osdmap();
1336
1337 // send map?
1338 epoch_t pe = get_peer_epoch(peer);
1339 if (pe) {
1340 if (pe < map->get_epoch()) {
1341 send_incremental_map(pe, con, map);
1342 note_peer_epoch(peer, map->get_epoch());
1343 } else
1344 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1345 } else {
1346 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1347 // no idea about peer's epoch.
1348 // ??? send recent ???
1349 // do nothing.
1350 }
1351 }
1352
1353 bool OSDService::can_inc_scrubs()
1354 {
1355 bool can_inc = false;
1356 std::lock_guard l(sched_scrub_lock);
1357
1358 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1359 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1360 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1361 can_inc = true;
1362 } else {
1363 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1364 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1365 }
1366
1367 return can_inc;
1368 }
1369
1370 bool OSDService::inc_scrubs_local()
1371 {
1372 bool result = false;
1373 std::lock_guard l{sched_scrub_lock};
1374 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1375 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1376 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1377 result = true;
1378 ++scrubs_local;
1379 } else {
1380 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1381 }
1382 return result;
1383 }
1384
1385 void OSDService::dec_scrubs_local()
1386 {
1387 std::lock_guard l{sched_scrub_lock};
1388 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1389 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1390 --scrubs_local;
1391 ceph_assert(scrubs_local >= 0);
1392 }
1393
1394 bool OSDService::inc_scrubs_remote()
1395 {
1396 bool result = false;
1397 std::lock_guard l{sched_scrub_lock};
1398 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1399 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1400 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1401 result = true;
1402 ++scrubs_remote;
1403 } else {
1404 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1405 }
1406 return result;
1407 }
1408
1409 void OSDService::dec_scrubs_remote()
1410 {
1411 std::lock_guard l{sched_scrub_lock};
1412 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1413 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1414 --scrubs_remote;
1415 ceph_assert(scrubs_remote >= 0);
1416 }
1417
1418 void OSDService::dump_scrub_reservations(Formatter *f)
1419 {
1420 std::lock_guard l{sched_scrub_lock};
1421 f->dump_int("scrubs_local", scrubs_local);
1422 f->dump_int("scrubs_remote", scrubs_remote);
1423 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
1424 }
1425
1426 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1427 epoch_t *_bind_epoch) const
1428 {
1429 std::lock_guard l(epoch_lock);
1430 if (_boot_epoch)
1431 *_boot_epoch = boot_epoch;
1432 if (_up_epoch)
1433 *_up_epoch = up_epoch;
1434 if (_bind_epoch)
1435 *_bind_epoch = bind_epoch;
1436 }
1437
1438 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1439 const epoch_t *_bind_epoch)
1440 {
1441 std::lock_guard l(epoch_lock);
1442 if (_boot_epoch) {
1443 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1444 boot_epoch = *_boot_epoch;
1445 }
1446 if (_up_epoch) {
1447 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1448 up_epoch = *_up_epoch;
1449 }
1450 if (_bind_epoch) {
1451 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1452 bind_epoch = *_bind_epoch;
1453 }
1454 }
1455
1456 bool OSDService::prepare_to_stop()
1457 {
1458 std::lock_guard l(is_stopping_lock);
1459 if (get_state() != NOT_STOPPING)
1460 return false;
1461
1462 OSDMapRef osdmap = get_osdmap();
1463 if (osdmap && osdmap->is_up(whoami)) {
1464 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1465 set_state(PREPARING_TO_STOP);
1466 monc->send_mon_message(
1467 new MOSDMarkMeDown(
1468 monc->get_fsid(),
1469 whoami,
1470 osdmap->get_addrs(whoami),
1471 osdmap->get_epoch(),
1472 true // request ack
1473 ));
1474 utime_t now = ceph_clock_now();
1475 utime_t timeout;
1476 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1477 while ((ceph_clock_now() < timeout) &&
1478 (get_state() != STOPPING)) {
1479 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1480 }
1481 }
1482 dout(0) << __func__ << " starting shutdown" << dendl;
1483 set_state(STOPPING);
1484 return true;
1485 }
1486
1487 void OSDService::got_stop_ack()
1488 {
1489 std::lock_guard l(is_stopping_lock);
1490 if (get_state() == PREPARING_TO_STOP) {
1491 dout(0) << __func__ << " starting shutdown" << dendl;
1492 set_state(STOPPING);
1493 is_stopping_cond.Signal();
1494 } else {
1495 dout(10) << __func__ << " ignoring msg" << dendl;
1496 }
1497 }
1498
1499 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1500 OSDSuperblock& sblock)
1501 {
1502 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1503 osdmap->get_encoding_features());
1504 m->oldest_map = max_oldest_map;
1505 m->newest_map = sblock.newest_map;
1506
1507 int max = cct->_conf->osd_map_message_max;
1508 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1509
1510 if (since < m->oldest_map) {
1511 // we don't have the next map the target wants, so start with a
1512 // full map.
1513 bufferlist bl;
1514 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
1515 << since << ", starting with full map" << dendl;
1516 since = m->oldest_map;
1517 if (!get_map_bl(since, bl)) {
1518 derr << __func__ << " missing full map " << since << dendl;
1519 goto panic;
1520 }
1521 max--;
1522 max_bytes -= bl.length();
1523 m->maps[since].claim(bl);
1524 }
1525 for (epoch_t e = since + 1; e <= to; ++e) {
1526 bufferlist bl;
1527 if (get_inc_map_bl(e, bl)) {
1528 m->incremental_maps[e].claim(bl);
1529 } else {
1530 derr << __func__ << " missing incremental map " << e << dendl;
1531 if (!get_map_bl(e, bl)) {
1532 derr << __func__ << " also missing full map " << e << dendl;
1533 goto panic;
1534 }
1535 m->maps[e].claim(bl);
1536 }
1537 max--;
1538 max_bytes -= bl.length();
1539 if (max <= 0 || max_bytes <= 0) {
1540 break;
1541 }
1542 }
1543 return m;
1544
1545 panic:
1546 if (!m->maps.empty() ||
1547 !m->incremental_maps.empty()) {
1548 // send what we have so far
1549 return m;
1550 }
1551 // send something
1552 bufferlist bl;
1553 if (get_inc_map_bl(m->newest_map, bl)) {
1554 m->incremental_maps[m->newest_map].claim(bl);
1555 } else {
1556 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1557 if (!get_map_bl(m->newest_map, bl)) {
1558 derr << __func__ << " unable to load latest full map " << m->newest_map
1559 << dendl;
1560 ceph_abort();
1561 }
1562 m->maps[m->newest_map].claim(bl);
1563 }
1564 return m;
1565 }
1566
1567 void OSDService::send_map(MOSDMap *m, Connection *con)
1568 {
1569 con->send_message(m);
1570 }
1571
1572 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1573 OSDMapRef& osdmap)
1574 {
1575 epoch_t to = osdmap->get_epoch();
1576 dout(10) << "send_incremental_map " << since << " -> " << to
1577 << " to " << con << " " << con->get_peer_addr() << dendl;
1578
1579 MOSDMap *m = NULL;
1580 while (!m) {
1581 OSDSuperblock sblock(get_superblock());
1582 if (since < sblock.oldest_map) {
1583 // just send latest full map
1584 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1585 osdmap->get_encoding_features());
1586 m->oldest_map = max_oldest_map;
1587 m->newest_map = sblock.newest_map;
1588 get_map_bl(to, m->maps[to]);
1589 send_map(m, con);
1590 return;
1591 }
1592
1593 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1594 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1595 << ", only sending most recent" << dendl;
1596 since = to - cct->_conf->osd_map_share_max_epochs;
1597 }
1598
1599 m = build_incremental_map_msg(since, to, sblock);
1600 }
1601 send_map(m, con);
1602 }
1603
1604 bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1605 {
1606 bool found = map_bl_cache.lookup(e, &bl);
1607 if (found) {
1608 if (logger)
1609 logger->inc(l_osd_map_bl_cache_hit);
1610 return true;
1611 }
1612 if (logger)
1613 logger->inc(l_osd_map_bl_cache_miss);
1614 found = store->read(meta_ch,
1615 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1616 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1617 if (found) {
1618 _add_map_bl(e, bl);
1619 }
1620 return found;
1621 }
1622
1623 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1624 {
1625 std::lock_guard l(map_cache_lock);
1626 bool found = map_bl_inc_cache.lookup(e, &bl);
1627 if (found) {
1628 if (logger)
1629 logger->inc(l_osd_map_bl_cache_hit);
1630 return true;
1631 }
1632 if (logger)
1633 logger->inc(l_osd_map_bl_cache_miss);
1634 found = store->read(meta_ch,
1635 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1636 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1637 if (found) {
1638 _add_map_inc_bl(e, bl);
1639 }
1640 return found;
1641 }
1642
1643 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1644 {
1645 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1646 // cache a contiguous buffer
1647 if (bl.get_num_buffers() > 1) {
1648 bl.rebuild();
1649 }
1650 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1651 map_bl_cache.add(e, bl);
1652 }
1653
1654 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1655 {
1656 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1657 // cache a contiguous buffer
1658 if (bl.get_num_buffers() > 1) {
1659 bl.rebuild();
1660 }
1661 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1662 map_bl_inc_cache.add(e, bl);
1663 }
1664
1665 int OSDService::get_deleted_pool_pg_num(int64_t pool)
1666 {
1667 std::lock_guard l(map_cache_lock);
1668 auto p = deleted_pool_pg_nums.find(pool);
1669 if (p != deleted_pool_pg_nums.end()) {
1670 return p->second;
1671 }
1672 dout(20) << __func__ << " " << pool << " loading" << dendl;
1673 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1674 bufferlist bl;
1675 int r = store->read(meta_ch, oid, 0, 0, bl);
1676 ceph_assert(r >= 0);
1677 auto blp = bl.cbegin();
1678 pg_pool_t pi;
1679 ::decode(pi, blp);
1680 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1681 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1682 return pi.get_pg_num();
1683 }
1684
1685 OSDMapRef OSDService::_add_map(OSDMap *o)
1686 {
1687 epoch_t e = o->get_epoch();
1688
1689 if (cct->_conf->osd_map_dedup) {
1690 // Dedup against an existing map at a nearby epoch
1691 OSDMapRef for_dedup = map_cache.lower_bound(e);
1692 if (for_dedup) {
1693 OSDMap::dedup(for_dedup.get(), o);
1694 }
1695 }
1696 bool existed;
1697 OSDMapRef l = map_cache.add(e, o, &existed);
1698 if (existed) {
1699 delete o;
1700 }
1701 return l;
1702 }
1703
1704 OSDMapRef OSDService::try_get_map(epoch_t epoch)
1705 {
1706 std::lock_guard l(map_cache_lock);
1707 OSDMapRef retval = map_cache.lookup(epoch);
1708 if (retval) {
1709 dout(30) << "get_map " << epoch << " -cached" << dendl;
1710 if (logger) {
1711 logger->inc(l_osd_map_cache_hit);
1712 }
1713 return retval;
1714 }
1715 if (logger) {
1716 logger->inc(l_osd_map_cache_miss);
1717 epoch_t lb = map_cache.cached_key_lower_bound();
1718 if (epoch < lb) {
1719 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1720 logger->inc(l_osd_map_cache_miss_low);
1721 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1722 }
1723 }
1724
1725 OSDMap *map = new OSDMap;
1726 if (epoch > 0) {
1727 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1728 bufferlist bl;
1729 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1730 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1731 delete map;
1732 return OSDMapRef();
1733 }
1734 map->decode(bl);
1735 } else {
1736 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1737 }
1738 return _add_map(map);
1739 }
1740
1741 // ops
1742
1743
1744 void OSDService::reply_op_error(OpRequestRef op, int err)
1745 {
1746 reply_op_error(op, err, eversion_t(), 0);
1747 }
1748
1749 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1750 version_t uv)
1751 {
1752 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1753 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1754 int flags;
1755 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1756
1757 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
1758 reply->set_reply_versions(v, uv);
1759 m->get_connection()->send_message(reply);
1760 }
1761
1762 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1763 {
1764 if (!cct->_conf->osd_debug_misdirected_ops) {
1765 return;
1766 }
1767
1768 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1769 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1770
1771 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
1772
1773 if (pg->is_ec_pg()) {
1774 /**
1775 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1776 * can get this result:
1777 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1778 * [CRUSH_ITEM_NONE, 2, 3]/3
1779 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1780 * [3, 2, 3]/3
1781 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1782 * -- misdirected op
1783 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1784 * it and fulfils it
1785 *
1786 * We can't compute the op target based on the sending map epoch due to
1787 * splitting. The simplest thing is to detect such cases here and drop
1788 * them without an error (the client will resend anyway).
1789 */
1790 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
1791 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1792 if (!opmap) {
1793 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1794 << m->get_map_epoch() << ", dropping" << dendl;
1795 return;
1796 }
1797 pg_t _pgid = m->get_raw_pg();
1798 spg_t pgid;
1799 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1800 _pgid = opmap->raw_pg_to_pg(_pgid);
1801 if (opmap->get_primary_shard(_pgid, &pgid) &&
1802 pgid.shard != pg->pg_id.shard) {
1803 dout(7) << __func__ << ": " << *pg << " primary changed since "
1804 << m->get_map_epoch() << ", dropping" << dendl;
1805 return;
1806 }
1807 }
1808
1809 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1810 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1811 << " pg " << m->get_raw_pg()
1812 << " to osd." << whoami
1813 << " not " << pg->get_acting()
1814 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1815 }
1816
1817 void OSDService::enqueue_back(OpQueueItem&& qi)
1818 {
1819 osd->op_shardedwq.queue(std::move(qi));
1820 }
1821
1822 void OSDService::enqueue_front(OpQueueItem&& qi)
1823 {
1824 osd->op_shardedwq.queue_front(std::move(qi));
1825 }
1826
1827 void OSDService::queue_recovery_context(
1828 PG *pg,
1829 GenContext<ThreadPool::TPHandle&> *c)
1830 {
1831 epoch_t e = get_osdmap_epoch();
1832 enqueue_back(
1833 OpQueueItem(
1834 unique_ptr<OpQueueItem::OpQueueable>(
1835 new PGRecoveryContext(pg->get_pgid(), c, e)),
1836 cct->_conf->osd_recovery_cost,
1837 cct->_conf->osd_recovery_priority,
1838 ceph_clock_now(),
1839 0,
1840 e));
1841 }
1842
1843 void OSDService::queue_for_snap_trim(PG *pg)
1844 {
1845 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1846 enqueue_back(
1847 OpQueueItem(
1848 unique_ptr<OpQueueItem::OpQueueable>(
1849 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1850 cct->_conf->osd_snap_trim_cost,
1851 cct->_conf->osd_snap_trim_priority,
1852 ceph_clock_now(),
1853 0,
1854 pg->get_osdmap_epoch()));
1855 }
1856
1857 void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1858 {
1859 unsigned scrub_queue_priority = pg->scrubber.priority;
1860 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1861 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1862 }
1863 const auto epoch = pg->get_osdmap_epoch();
1864 enqueue_back(
1865 OpQueueItem(
1866 unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1867 cct->_conf->osd_scrub_cost,
1868 scrub_queue_priority,
1869 ceph_clock_now(),
1870 0,
1871 epoch));
1872 }
1873
1874 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1875 {
1876 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1877 enqueue_back(
1878 OpQueueItem(
1879 unique_ptr<OpQueueItem::OpQueueable>(
1880 new PGDelete(pgid, e)),
1881 cct->_conf->osd_pg_delete_cost,
1882 cct->_conf->osd_pg_delete_priority,
1883 ceph_clock_now(),
1884 0,
1885 e));
1886 }
1887
1888 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1889 {
1890 return osd->try_finish_pg_delete(pg, old_pg_num);
1891 }
1892
1893 // ---
1894
1895 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1896 {
1897 std::lock_guard l(merge_lock);
1898 dout(10) << __func__ << " " << pg->pg_id << dendl;
1899 ready_to_merge_source[pg->pg_id.pgid] = version;
1900 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1901 _send_ready_to_merge();
1902 }
1903
1904 void OSDService::set_ready_to_merge_target(PG *pg,
1905 eversion_t version,
1906 epoch_t last_epoch_started,
1907 epoch_t last_epoch_clean)
1908 {
1909 std::lock_guard l(merge_lock);
1910 dout(10) << __func__ << " " << pg->pg_id << dendl;
1911 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1912 make_tuple(version,
1913 last_epoch_started,
1914 last_epoch_clean)));
1915 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1916 _send_ready_to_merge();
1917 }
1918
1919 void OSDService::set_not_ready_to_merge_source(pg_t source)
1920 {
1921 std::lock_guard l(merge_lock);
1922 dout(10) << __func__ << " " << source << dendl;
1923 not_ready_to_merge_source.insert(source);
1924 assert(ready_to_merge_source.count(source) == 0);
1925 _send_ready_to_merge();
1926 }
1927
1928 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1929 {
1930 std::lock_guard l(merge_lock);
1931 dout(10) << __func__ << " " << target << " source " << source << dendl;
1932 not_ready_to_merge_target[target] = source;
1933 assert(ready_to_merge_target.count(target) == 0);
1934 _send_ready_to_merge();
1935 }
1936
1937 void OSDService::send_ready_to_merge()
1938 {
1939 std::lock_guard l(merge_lock);
1940 _send_ready_to_merge();
1941 }
1942
1943 void OSDService::_send_ready_to_merge()
1944 {
1945 dout(20) << __func__
1946 << " ready_to_merge_source " << ready_to_merge_source
1947 << " not_ready_to_merge_source " << not_ready_to_merge_source
1948 << " ready_to_merge_target " << ready_to_merge_target
1949 << " not_ready_to_merge_target " << not_ready_to_merge_target
1950 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1951 << dendl;
1952 for (auto src : not_ready_to_merge_source) {
1953 if (sent_ready_to_merge_source.count(src) == 0) {
1954 monc->send_mon_message(new MOSDPGReadyToMerge(
1955 src,
1956 {}, {}, 0, 0,
1957 false,
1958 osdmap->get_epoch()));
1959 sent_ready_to_merge_source.insert(src);
1960 }
1961 }
1962 for (auto p : not_ready_to_merge_target) {
1963 if (sent_ready_to_merge_source.count(p.second) == 0) {
1964 monc->send_mon_message(new MOSDPGReadyToMerge(
1965 p.second,
1966 {}, {}, 0, 0,
1967 false,
1968 osdmap->get_epoch()));
1969 sent_ready_to_merge_source.insert(p.second);
1970 }
1971 }
1972 for (auto src : ready_to_merge_source) {
1973 if (not_ready_to_merge_source.count(src.first) ||
1974 not_ready_to_merge_target.count(src.first.get_parent())) {
1975 continue;
1976 }
1977 auto p = ready_to_merge_target.find(src.first.get_parent());
1978 if (p != ready_to_merge_target.end() &&
1979 sent_ready_to_merge_source.count(src.first) == 0) {
1980 monc->send_mon_message(new MOSDPGReadyToMerge(
1981 src.first, // source pgid
1982 src.second, // src version
1983 std::get<0>(p->second), // target version
1984 std::get<1>(p->second), // PG's last_epoch_started
1985 std::get<2>(p->second), // PG's last_epoch_clean
1986 true,
1987 osdmap->get_epoch()));
1988 sent_ready_to_merge_source.insert(src.first);
1989 }
1990 }
1991 }
1992
1993 void OSDService::clear_ready_to_merge(PG *pg)
1994 {
1995 std::lock_guard l(merge_lock);
1996 dout(10) << __func__ << " " << pg->pg_id << dendl;
1997 ready_to_merge_source.erase(pg->pg_id.pgid);
1998 ready_to_merge_target.erase(pg->pg_id.pgid);
1999 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2000 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2001 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2002 }
2003
2004 void OSDService::clear_sent_ready_to_merge()
2005 {
2006 std::lock_guard l(merge_lock);
2007 sent_ready_to_merge_source.clear();
2008 }
2009
2010 void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
2011 {
2012 std::lock_guard l(merge_lock);
2013 auto i = sent_ready_to_merge_source.begin();
2014 while (i != sent_ready_to_merge_source.end()) {
2015 if (!osdmap->pg_exists(*i)) {
2016 dout(10) << __func__ << " " << *i << dendl;
2017 i = sent_ready_to_merge_source.erase(i);
2018 } else {
2019 ++i;
2020 }
2021 }
2022 }
2023
2024 // ---
2025
// Push a recovery work item for the given PG onto the back of the op
// queue, carrying the number of recovery pushes already reserved for it.
// p.first is the queueing epoch, p.second the PG.
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  // caller must already hold recovery_lock
  ceph_assert(recovery_lock.is_locked_by_me());
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,      // queue cost of the item
      cct->_conf->osd_recovery_priority,  // queue priority of the item
      ceph_clock_now(),
      0,        // owner id — presumably "no client session"; confirm against OpQueueItem ctor
      p.first));  // epoch the item was queued at
}
2042
2043 // ====================================================================
2044 // OSD
2045
2046 #undef dout_prefix
2047 #define dout_prefix *_dout
2048
2049 // Commands shared between OSD's console and admin console:
2050 namespace ceph {
2051 namespace osd_cmds {
2052
2053 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
2054
2055 }} // namespace ceph::osd_cmds
2056
2057 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
2058 {
2059 int ret;
2060
2061 OSDSuperblock sb;
2062 bufferlist sbbl;
2063 ObjectStore::CollectionHandle ch;
2064
2065 // if we are fed a uuid for this osd, use it.
2066 store->set_fsid(cct->_conf->osd_uuid);
2067
2068 ret = store->mkfs();
2069 if (ret) {
2070 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret) << dendl;
2072 goto free_store;
2073 }
2074
2075 store->set_cache_shards(1); // doesn't matter for mkfs!
2076
2077 ret = store->mount();
2078 if (ret) {
2079 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret) << dendl;
2081 goto free_store;
2082 }
2083
2084 ch = store->open_collection(coll_t::meta());
2085 if (ch) {
2086 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2087 if (ret < 0) {
2088 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2089 goto free_store;
2090 }
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl;
2093 auto p = sbbl.cbegin();
2094 decode(sb, p);
2095 if (whoami != sb.whoami) {
2096 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2097 << dendl;
2098 ret = -EINVAL;
2099 goto umount_store;
2100 }
2101 if (fsid != sb.cluster_fsid) {
2102 derr << "provided cluster fsid " << fsid
2103 << " != superblock's " << sb.cluster_fsid << dendl;
2104 ret = -EINVAL;
2105 goto umount_store;
2106 }
2107 } else {
2108 // create superblock
2109 sb.cluster_fsid = fsid;
2110 sb.osd_fsid = store->get_fsid();
2111 sb.whoami = whoami;
2112 sb.compat_features = get_osd_initial_compat_set();
2113
2114 bufferlist bl;
2115 encode(sb, bl);
2116
2117 ObjectStore::CollectionHandle ch = store->create_new_collection(
2118 coll_t::meta());
2119 ObjectStore::Transaction t;
2120 t.create_collection(coll_t::meta(), 0);
2121 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2122 ret = store->queue_transaction(ch, std::move(t));
2123 if (ret) {
2124 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2125 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2126 goto umount_store;
2127 }
2128 }
2129
2130 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
2131 if (ret) {
2132 derr << "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret) << dendl;
2134 goto umount_store;
2135 }
2136
2137 umount_store:
2138 if (ch) {
2139 ch.reset();
2140 }
2141 store->umount();
2142 free_store:
2143 delete store;
2144 return ret;
2145 }
2146
2147 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
2148 {
2149 char val[80];
2150 int r;
2151
2152 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2153 r = store->write_meta("magic", val);
2154 if (r < 0)
2155 return r;
2156
2157 snprintf(val, sizeof(val), "%d", whoami);
2158 r = store->write_meta("whoami", val);
2159 if (r < 0)
2160 return r;
2161
2162 cluster_fsid.print(val);
2163 r = store->write_meta("ceph_fsid", val);
2164 if (r < 0)
2165 return r;
2166
2167 string key = cct->_conf.get_val<string>("key");
2168 if (key.size()) {
2169 r = store->write_meta("osd_key", key);
2170 if (r < 0)
2171 return r;
2172 } else {
2173 string keyfile = cct->_conf.get_val<string>("keyfile");
2174 if (!keyfile.empty()) {
2175 bufferlist keybl;
2176 string err;
2177 r = keybl.read_file(keyfile.c_str(), &err);
2178 if (r < 0) {
2179 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2180 << err << ": " << cpp_strerror(r) << dendl;
2181 return r;
2182 }
2183 r = store->write_meta("osd_key", keybl.to_str());
2184 if (r < 0)
2185 return r;
2186 }
2187 }
2188
2189 r = store->write_meta("ready", "ready");
2190 if (r < 0)
2191 return r;
2192
2193 return 0;
2194 }
2195
2196 int OSD::peek_meta(ObjectStore *store,
2197 std::string *magic,
2198 uuid_d *cluster_fsid,
2199 uuid_d *osd_fsid,
2200 int *whoami,
2201 int *require_osd_release)
2202 {
2203 string val;
2204
2205 int r = store->read_meta("magic", &val);
2206 if (r < 0)
2207 return r;
2208 *magic = val;
2209
2210 r = store->read_meta("whoami", &val);
2211 if (r < 0)
2212 return r;
2213 *whoami = atoi(val.c_str());
2214
2215 r = store->read_meta("ceph_fsid", &val);
2216 if (r < 0)
2217 return r;
2218 r = cluster_fsid->parse(val.c_str());
2219 if (!r)
2220 return -EINVAL;
2221
2222 r = store->read_meta("fsid", &val);
2223 if (r < 0) {
2224 *osd_fsid = uuid_d();
2225 } else {
2226 r = osd_fsid->parse(val.c_str());
2227 if (!r)
2228 return -EINVAL;
2229 }
2230
2231 r = store->read_meta("require_osd_release", &val);
2232 if (r >= 0) {
2233 *require_osd_release = atoi(val.c_str());
2234 }
2235
2236 return 0;
2237 }
2238
2239
2240 #undef dout_prefix
2241 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2242
2243 // cons/des
2244
// OSD constructor: wires up the messengers, mon/mgr clients, thread
// pools, work queues and per-shard op queues. Heavy initialization
// (mounting the store, joining the cluster) happens later in init().
// NOTE: member-initializer order must match declaration order in OSD.h.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  // second tick timer that does not require osd_lock
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  // Export the GSSAPI client keytab path so libkrb5 can find it.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
        The default client keytab is used, if it is present and readable,
        to automatically obtain initial credentials for GSSAPI client
        applications. The principal name of the first entry in the client
        keytab is used by default when obtaining initial credentials.
        1. The KRB5_CLIENT_KTNAME environment variable.
        2. The default_client_keytab_name profile variable in [libdefaults].
        3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // configure op-tracker thresholds from current config values
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2362
2363 OSD::~OSD()
2364 {
2365 while (!shards.empty()) {
2366 delete shards.back();
2367 shards.pop_back();
2368 }
2369 delete class_handler;
2370 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2371 cct->get_perfcounters_collection()->remove(logger);
2372 delete recoverystate_perf;
2373 delete logger;
2374 delete store;
2375 }
2376
2377 double OSD::get_tick_interval() const
2378 {
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta = 0.05;
2381 return (OSD_TICK_INTERVAL *
2382 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2383 }
2384
2385 void cls_initialize(ClassHandler *ch);
2386
// Signal handler entry: only SIGINT/SIGTERM are expected; both trigger
// a clean shutdown of the daemon.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2393
2394 int OSD::pre_init()
2395 {
2396 std::lock_guard lock(osd_lock);
2397 if (is_stopping())
2398 return 0;
2399
2400 if (store->test_mount_in_use()) {
2401 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2402 << "currently in use. (Is ceph-osd already running?)" << dendl;
2403 return -EBUSY;
2404 }
2405
2406 cct->_conf.add_observer(this);
2407 return 0;
2408 }
2409
// Determine the NUMA node shared by the object store and both network
// interfaces, and pin all OSD threads to that node's CPUs. The
// automatic choice only applies when store, public and cluster network
// all agree; the osd_numa_node config option overrides it. Always
// returns 0 (failures are logged, not fatal).
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	// all three agree -> candidate for automatic pinning
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2: the interface's ports span multiple numa nodes (per message below)
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
	   << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // resolve the node's CPU set and apply it to every thread
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2490
2491 // asok
2492
2493 class OSDSocketHook : public AdminSocketHook {
2494 OSD *osd;
2495 public:
2496 explicit OSDSocketHook(OSD *o) : osd(o) {}
2497 bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
2498 std::string_view format, bufferlist& out) override {
2499 stringstream ss;
2500 bool r = true;
2501 try {
2502 r = osd->asok_command(admin_command, cmdmap, format, ss);
2503 } catch (const bad_cmd_get& e) {
2504 ss << e.what();
2505 r = true;
2506 }
2507 out.append(ss);
2508 return r;
2509 }
2510 };
2511
2512 std::set<int64_t> OSD::get_mapped_pools()
2513 {
2514 std::set<int64_t> pools;
2515 std::vector<spg_t> pgids;
2516 _get_pgids(&pgids);
2517 for (const auto &pgid : pgids) {
2518 pools.insert(pgid.pool());
2519 }
2520 return pools;
2521 }
2522
2523 bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
2524 std::string_view format, ostream& ss)
2525 {
2526 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2527 if (admin_command == "status") {
2528 f->open_object_section("status");
2529 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2530 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2531 f->dump_unsigned("whoami", superblock.whoami);
2532 f->dump_string("state", get_state_name(get_state()));
2533 f->dump_unsigned("oldest_map", superblock.oldest_map);
2534 f->dump_unsigned("newest_map", superblock.newest_map);
2535 f->dump_unsigned("num_pgs", num_pgs);
2536 f->close_section();
2537 } else if (admin_command == "flush_journal") {
2538 store->flush_journal();
2539 } else if (admin_command == "dump_ops_in_flight" ||
2540 admin_command == "ops" ||
2541 admin_command == "dump_blocked_ops" ||
2542 admin_command == "dump_historic_ops" ||
2543 admin_command == "dump_historic_ops_by_duration" ||
2544 admin_command == "dump_historic_slow_ops") {
2545
2546 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2547 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2548 will start to track new ops received afterwards.";
2549
2550 set<string> filters;
2551 vector<string> filter_str;
2552 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2553 copy(filter_str.begin(), filter_str.end(),
2554 inserter(filters, filters.end()));
2555 }
2556
2557 if (admin_command == "dump_ops_in_flight" ||
2558 admin_command == "ops") {
2559 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2560 ss << error_str;
2561 }
2562 }
2563 if (admin_command == "dump_blocked_ops") {
2564 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2565 ss << error_str;
2566 }
2567 }
2568 if (admin_command == "dump_historic_ops") {
2569 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2570 ss << error_str;
2571 }
2572 }
2573 if (admin_command == "dump_historic_ops_by_duration") {
2574 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2575 ss << error_str;
2576 }
2577 }
2578 if (admin_command == "dump_historic_slow_ops") {
2579 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2580 ss << error_str;
2581 }
2582 }
2583 } else if (admin_command == "dump_op_pq_state") {
2584 f->open_object_section("pq");
2585 op_shardedwq.dump(f);
2586 f->close_section();
2587 } else if (admin_command == "dump_blacklist") {
2588 list<pair<entity_addr_t,utime_t> > bl;
2589 OSDMapRef curmap = service.get_osdmap();
2590
2591 f->open_array_section("blacklist");
2592 curmap->get_blacklist(&bl);
2593 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2594 it != bl.end(); ++it) {
2595 f->open_object_section("entry");
2596 f->open_object_section("entity_addr_t");
2597 it->first.dump(f);
2598 f->close_section(); //entity_addr_t
2599 it->second.localtime(f->dump_stream("expire_time"));
2600 f->close_section(); //entry
2601 }
2602 f->close_section(); //blacklist
2603 } else if (admin_command == "dump_watchers") {
2604 list<obj_watch_item_t> watchers;
2605 // scan pg's
2606 vector<PGRef> pgs;
2607 _get_pgs(&pgs);
2608 for (auto& pg : pgs) {
2609 list<obj_watch_item_t> pg_watchers;
2610 pg->get_watchers(&pg_watchers);
2611 watchers.splice(watchers.end(), pg_watchers);
2612 }
2613
2614 f->open_array_section("watchers");
2615 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2616 it != watchers.end(); ++it) {
2617
2618 f->open_object_section("watch");
2619
2620 f->dump_string("namespace", it->obj.nspace);
2621 f->dump_string("object", it->obj.oid.name);
2622
2623 f->open_object_section("entity_name");
2624 it->wi.name.dump(f);
2625 f->close_section(); //entity_name_t
2626
2627 f->dump_unsigned("cookie", it->wi.cookie);
2628 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2629
2630 f->open_object_section("entity_addr_t");
2631 it->wi.addr.dump(f);
2632 f->close_section(); //entity_addr_t
2633
2634 f->close_section(); //watch
2635 }
2636
2637 f->close_section(); //watchers
2638 } else if (admin_command == "dump_recovery_reservations") {
2639 f->open_object_section("reservations");
2640 f->open_object_section("local_reservations");
2641 service.local_reserver.dump(f);
2642 f->close_section();
2643 f->open_object_section("remote_reservations");
2644 service.remote_reserver.dump(f);
2645 f->close_section();
2646 f->close_section();
2647 } else if (admin_command == "dump_scrub_reservations") {
2648 f->open_object_section("scrub_reservations");
2649 service.dump_scrub_reservations(f);
2650 f->close_section();
2651 } else if (admin_command == "get_latest_osdmap") {
2652 get_latest_osdmap();
2653 } else if (admin_command == "heap") {
2654 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2655
2656 // Note: Failed heap profile commands won't necessarily trigger an error:
2657 f->open_object_section("result");
2658 f->dump_string("error", cpp_strerror(result));
2659 f->dump_bool("success", result >= 0);
2660 f->close_section();
2661 } else if (admin_command == "set_heap_property") {
2662 string property;
2663 int64_t value = 0;
2664 string error;
2665 bool success = false;
2666 if (!cmd_getval(cct, cmdmap, "property", property)) {
2667 error = "unable to get property";
2668 success = false;
2669 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2670 error = "unable to get value";
2671 success = false;
2672 } else if (value < 0) {
2673 error = "negative value not allowed";
2674 success = false;
2675 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2676 error = "invalid property";
2677 success = false;
2678 } else {
2679 success = true;
2680 }
2681 f->open_object_section("result");
2682 f->dump_string("error", error);
2683 f->dump_bool("success", success);
2684 f->close_section();
2685 } else if (admin_command == "get_heap_property") {
2686 string property;
2687 size_t value = 0;
2688 string error;
2689 bool success = false;
2690 if (!cmd_getval(cct, cmdmap, "property", property)) {
2691 error = "unable to get property";
2692 success = false;
2693 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2694 error = "invalid property";
2695 success = false;
2696 } else {
2697 success = true;
2698 }
2699 f->open_object_section("result");
2700 f->dump_string("error", error);
2701 f->dump_bool("success", success);
2702 f->dump_int("value", value);
2703 f->close_section();
2704 } else if (admin_command == "dump_objectstore_kv_stats") {
2705 store->get_db_statistics(f);
2706 } else if (admin_command == "dump_scrubs") {
2707 service.dumps_scrub(f);
2708 } else if (admin_command == "calc_objectstore_db_histogram") {
2709 store->generate_db_histogram(f);
2710 } else if (admin_command == "flush_store_cache") {
2711 store->flush_cache(&ss);
2712 } else if (admin_command == "dump_pgstate_history") {
2713 f->open_object_section("pgstate_history");
2714 vector<PGRef> pgs;
2715 _get_pgs(&pgs);
2716 for (auto& pg : pgs) {
2717 f->dump_stream("pg") << pg->pg_id;
2718 pg->dump_pgstate_history(f);
2719 }
2720 f->close_section();
2721 } else if (admin_command == "compact") {
2722 dout(1) << "triggering manual compaction" << dendl;
2723 auto start = ceph::coarse_mono_clock::now();
2724 store->compact();
2725 auto end = ceph::coarse_mono_clock::now();
2726 double duration = std::chrono::duration<double>(end-start).count();
2727 dout(1) << "finished manual compaction in "
2728 << duration
2729 << " seconds" << dendl;
2730 f->open_object_section("compact_result");
2731 f->dump_float("elapsed_time", duration);
2732 f->close_section();
2733 } else if (admin_command == "get_mapped_pools") {
2734 f->open_array_section("mapped_pools");
2735 set<int64_t> poollist = get_mapped_pools();
2736 for (auto pool : poollist) {
2737 f->dump_int("pool_id", pool);
2738 }
2739 f->close_section();
2740 } else if (admin_command == "smart") {
2741 string devid;
2742 cmd_getval(cct, cmdmap, "devid", devid);
2743 probe_smart(devid, ss);
2744 } else if (admin_command == "list_devices") {
2745 set<string> devnames;
2746 store->get_devices(&devnames);
2747 f->open_object_section("list_devices");
2748 for (auto dev : devnames) {
2749 if (dev.find("dm-") == 0) {
2750 continue;
2751 }
2752 f->dump_string("device", "/dev/" + dev);
2753 }
2754 f->close_section();
2755 } else if (admin_command == "send_beacon") {
2756 if (is_active()) {
2757 send_beacon(ceph::coarse_mono_clock::now());
2758 }
2759 } else if (admin_command == "dump_osd_network") {
2760 int64_t value = 0;
2761 if (!(cmd_getval(cct, cmdmap, "value", value))) {
2762 // Convert milliseconds to microseconds
2763 value = static_cast<int64_t>(g_conf().get_val<double>("mon_warn_on_slow_ping_time")) * 1000;
2764 if (value == 0) {
2765 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2766 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2767 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2768 }
2769 } else {
2770 // Convert user input to microseconds
2771 value *= 1000;
2772 }
2773 if (value < 0) value = 0;
2774
2775 struct osd_ping_time_t {
2776 uint32_t pingtime;
2777 int to;
2778 bool back;
2779 std::array<uint32_t,3> times;
2780 std::array<uint32_t,3> min;
2781 std::array<uint32_t,3> max;
2782 uint32_t last;
2783 uint32_t last_update;
2784
2785 bool operator<(const osd_ping_time_t& rhs) const {
2786 if (pingtime < rhs.pingtime)
2787 return true;
2788 if (pingtime > rhs.pingtime)
2789 return false;
2790 if (to < rhs.to)
2791 return true;
2792 if (to > rhs.to)
2793 return false;
2794 return back;
2795 }
2796 };
2797
2798 set<osd_ping_time_t> sorted;
2799 // Get pingtimes under lock and not on the stack
2800 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
2801 service.get_hb_pingtime(pingtimes);
2802 for (auto j : *pingtimes) {
2803 if (j.second.last_update == 0)
2804 continue;
2805 osd_ping_time_t item;
2806 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2807 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
2808 if (item.pingtime >= value) {
2809 item.to = j.first;
2810 item.times[0] = j.second.back_pingtime[0];
2811 item.times[1] = j.second.back_pingtime[1];
2812 item.times[2] = j.second.back_pingtime[2];
2813 item.min[0] = j.second.back_min[0];
2814 item.min[1] = j.second.back_min[1];
2815 item.min[2] = j.second.back_min[2];
2816 item.max[0] = j.second.back_max[0];
2817 item.max[1] = j.second.back_max[1];
2818 item.max[2] = j.second.back_max[2];
2819 item.last = j.second.back_last;
2820 item.back = true;
2821 item.last_update = j.second.last_update;
2822 sorted.emplace(item);
2823 }
2824 if (j.second.front_last == 0)
2825 continue;
2826 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2827 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
2828 if (item.pingtime >= value) {
2829 item.to = j.first;
2830 item.times[0] = j.second.front_pingtime[0];
2831 item.times[1] = j.second.front_pingtime[1];
2832 item.times[2] = j.second.front_pingtime[2];
2833 item.min[0] = j.second.front_min[0];
2834 item.min[1] = j.second.front_min[1];
2835 item.min[2] = j.second.front_min[2];
2836 item.max[0] = j.second.front_max[0];
2837 item.max[1] = j.second.front_max[1];
2838 item.max[2] = j.second.front_max[2];
2839 item.last = j.second.front_last;
2840 item.last_update = j.second.last_update;
2841 item.back = false;
2842 sorted.emplace(item);
2843 }
2844 }
2845 delete pingtimes;
2846 //
2847 // Network ping times (1min 5min 15min)
2848 f->open_object_section("network_ping_times");
2849 f->dump_int("threshold", value / 1000);
2850 f->open_array_section("entries");
2851 for (auto &sitem : boost::adaptors::reverse(sorted)) {
2852 ceph_assert(sitem.pingtime >= value);
2853 f->open_object_section("entry");
2854
2855 const time_t lu(sitem.last_update);
2856 char buffer[26];
2857 string lustr(ctime_r(&lu, buffer));
2858 lustr.pop_back(); // Remove trailing \n
2859 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
2860 f->dump_string("last update", lustr);
2861 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
2862 f->dump_int("from osd", whoami);
2863 f->dump_int("to osd", sitem.to);
2864 f->dump_string("interface", (sitem.back ? "back" : "front"));
2865 f->open_object_section("average");
2866 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
2867 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
2868 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
2869 f->close_section(); // average
2870 f->open_object_section("min");
2871 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2872 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2873 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2874 f->close_section(); // min
2875 f->open_object_section("max");
2876 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2877 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2878 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2879 f->close_section(); // max
2880 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
2881 f->close_section(); // entry
2882 }
2883 f->close_section(); // entries
2884 f->close_section(); // network_ping_times
2885 } else {
2886 ceph_abort_msg("broken asok registration");
2887 }
2888 f->flush(ss);
2889 delete f;
2890 return true;
2891 }
2892
// Admin-socket hook for test/debug operations ("setomapval", error
// injection, etc.); forwards each command to test_ops() and appends the
// textual result (or a bad_cmd_get error) to the output buffer.
class TestOpsSocketHook : public AdminSocketHook {
  OSDService *service;
  ObjectStore *store;
public:
  TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
  bool call(std::string_view command, const cmdmap_t& cmdmap,
	    std::string_view format, bufferlist& out) override {
    stringstream ss;
    try {
      test_ops(service, store, command, cmdmap, ss);
    } catch (const bad_cmd_get& e) {
      // report argument errors as command output rather than failing
      ss << e.what();
    }
    out.append(ss);
    return true;
  }
  // implemented further down in this file
  void test_ops(OSDService *service, ObjectStore *store,
		std::string_view command, const cmdmap_t& cmdmap, ostream &ss);

};
2913
// Timer callback driving OSD::tick() (scheduled on tick_timer, which
// runs under osd_lock).
class OSD::C_Tick : public Context {
  OSD *osd;
public:
  explicit C_Tick(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick();
  }
};
2922
// Timer callback driving OSD::tick_without_osd_lock() (scheduled on
// tick_timer_without_osd_lock, which does not take osd_lock).
class OSD::C_Tick_WithoutOSDLock : public Context {
  OSD *osd;
public:
  explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick_without_osd_lock();
  }
};
2931
// Start or stop the FUSE view of the object store at $osd_data/fuse,
// reconciling the running state with the osd_objectstore_fuse option.
// With stop==true the mount is always torn down. Returns 0 on success
// or a negative errno; a no-op when built without libfuse.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    // currently mounted but should not be: stop and remove mountpoint
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    // not mounted but should be: create mountpoint and start FuseStore
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {  // pre-existing mountpoint dir is fine
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif  // HAVE_LIBFUSE
  return 0;
}
2973
2974 int OSD::get_num_op_shards()
2975 {
2976 if (cct->_conf->osd_op_num_shards)
2977 return cct->_conf->osd_op_num_shards;
2978 if (store_is_rotational)
2979 return cct->_conf->osd_op_num_shards_hdd;
2980 else
2981 return cct->_conf->osd_op_num_shards_ssd;
2982 }
2983
2984 int OSD::get_num_op_threads()
2985 {
2986 if (cct->_conf->osd_op_num_threads_per_shard)
2987 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2988 if (store_is_rotational)
2989 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2990 else
2991 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2992 }
2993
2994 float OSD::get_osd_recovery_sleep()
2995 {
2996 if (cct->_conf->osd_recovery_sleep)
2997 return cct->_conf->osd_recovery_sleep;
2998 if (!store_is_rotational && !journal_is_rotational)
2999 return cct->_conf->osd_recovery_sleep_ssd;
3000 else if (store_is_rotational && !journal_is_rotational)
3001 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3002 else
3003 return cct->_conf->osd_recovery_sleep_hdd;
3004 }
3005
3006 float OSD::get_osd_delete_sleep()
3007 {
3008 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3009 if (osd_delete_sleep > 0)
3010 return osd_delete_sleep;
3011 if (!store_is_rotational && !journal_is_rotational)
3012 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3013 if (store_is_rotational && !journal_is_rotational)
3014 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3015 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3016 }
3017
3018 float OSD::get_osd_snap_trim_sleep()
3019 {
3020 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3021 if (osd_snap_trim_sleep > 0)
3022 return osd_snap_trim_sleep;
3023 if (!store_is_rotational && !journal_is_rotational)
3024 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3025 if (store_is_rotational && !journal_is_rotational)
3026 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3027 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3028 }
3029
// Bring the OSD daemon up: mount the object store, validate the
// superblock/compat features, load PGs, wire up messengers and the
// mon/mgr clients, authenticate, and kick off the boot process.
// Returns 0 on success (or if the daemon is already stopping); on
// failure unwinds via the `out` label, which unmounts and deletes the
// store.  Note that unrecoverable post-unlock failures (auth, crush
// updates) call exit(1) instead of returning.
int OSD::init()
{
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    // persisted min-release gate; empty/unparseable meta yields 0 via atoi
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = atoi(val.c_str());
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store); // call pre_init() first!

  // shard count must be set before mount so store caches are sized correctly
  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // journal rotational status is only known after mount
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
	  << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    // build a maximal-length key to probe the backend's filename limits
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << " osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against an on-disk format newer than this binary supports
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // the superblock must belong to this OSD id
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	// a PG collection for a deleted pool with no saved pg_pool_t means a
	// pre-mimic deletion was interrupted; we cannot proceed safely
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // rados class (cls_*) plugin infrastructure
  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
		      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // hook mgr stat/perf-query callbacks before mgrc starts
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
      set_perf_queries(queries);
    },
    [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
      get_perf_reports(reports);
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime split/merge state for loaded PGs against the current map
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      pg->lock();
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// prime_splits consumes the entries it handles
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());
      }
      pg->unlock();
    }
  }

  osd_op_tp.start();
  command_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock while blocking on the monitor for auth/crush updates
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.Lock();
  // we may have begun shutting down while the lock was dropped
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error unwind: drop fuse mount, unmount and release the object store
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3415
3416 void OSD::final_init()
3417 {
3418 AdminSocket *admin_socket = cct->get_admin_socket();
3419 asok_hook = new OSDSocketHook(this);
3420 int r = admin_socket->register_command("status", "status", asok_hook,
3421 "high-level status of OSD");
3422 ceph_assert(r == 0);
3423 r = admin_socket->register_command("flush_journal", "flush_journal",
3424 asok_hook,
3425 "flush the journal to permanent store");
3426 ceph_assert(r == 0);
3427 r = admin_socket->register_command("dump_ops_in_flight",
3428 "dump_ops_in_flight " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3430 asok_hook,
3431 "show the ops currently in flight");
3432 ceph_assert(r == 0);
3433 r = admin_socket->register_command("ops",
3434 "ops " \
3435 "name=filterstr,type=CephString,n=N,req=false",
3436 asok_hook,
3437 "show the ops currently in flight");
3438 ceph_assert(r == 0);
3439 r = admin_socket->register_command("dump_blocked_ops",
3440 "dump_blocked_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3442 asok_hook,
3443 "show the blocked ops currently in flight");
3444 ceph_assert(r == 0);
3445 r = admin_socket->register_command("dump_historic_ops",
3446 "dump_historic_ops " \
3447 "name=filterstr,type=CephString,n=N,req=false",
3448 asok_hook,
3449 "show recent ops");
3450 ceph_assert(r == 0);
3451 r = admin_socket->register_command("dump_historic_slow_ops",
3452 "dump_historic_slow_ops " \
3453 "name=filterstr,type=CephString,n=N,req=false",
3454 asok_hook,
3455 "show slowest recent ops");
3456 ceph_assert(r == 0);
3457 r = admin_socket->register_command("dump_historic_ops_by_duration",
3458 "dump_historic_ops_by_duration " \
3459 "name=filterstr,type=CephString,n=N,req=false",
3460 asok_hook,
3461 "show slowest recent ops, sorted by duration");
3462 ceph_assert(r == 0);
3463 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3464 asok_hook,
3465 "dump op priority queue state");
3466 ceph_assert(r == 0);
3467 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3468 asok_hook,
3469 "dump blacklisted clients and times");
3470 ceph_assert(r == 0);
3471 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3472 asok_hook,
3473 "show clients which have active watches,"
3474 " and on which objects");
3475 ceph_assert(r == 0);
3476 r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations",
3477 asok_hook,
3478 "show recovery reservations");
3479 ceph_assert(r == 0);
3480 r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3481 asok_hook,
3482 "show scrub reservations");
3483 ceph_assert(r == 0);
3484 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3485 asok_hook,
3486 "force osd to update the latest map from "
3487 "the mon");
3488 ceph_assert(r == 0);
3489
3490 r = admin_socket->register_command( "heap",
3491 "heap " \
3492 "name=heapcmd,type=CephString " \
3493 "name=value,type=CephString,req=false",
3494 asok_hook,
3495 "show heap usage info (available only if "
3496 "compiled with tcmalloc)");
3497 ceph_assert(r == 0);
3498
3499 r = admin_socket->register_command("set_heap_property",
3500 "set_heap_property " \
3501 "name=property,type=CephString " \
3502 "name=value,type=CephInt",
3503 asok_hook,
3504 "update malloc extension heap property");
3505 ceph_assert(r == 0);
3506
3507 r = admin_socket->register_command("get_heap_property",
3508 "get_heap_property " \
3509 "name=property,type=CephString",
3510 asok_hook,
3511 "get malloc extension heap property");
3512 ceph_assert(r == 0);
3513
3514 r = admin_socket->register_command("dump_objectstore_kv_stats",
3515 "dump_objectstore_kv_stats",
3516 asok_hook,
3517 "print statistics of kvdb which used by bluestore");
3518 ceph_assert(r == 0);
3519
3520 r = admin_socket->register_command("dump_scrubs",
3521 "dump_scrubs",
3522 asok_hook,
3523 "print scheduled scrubs");
3524 ceph_assert(r == 0);
3525
3526 r = admin_socket->register_command("calc_objectstore_db_histogram",
3527 "calc_objectstore_db_histogram",
3528 asok_hook,
3529 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3530 ceph_assert(r == 0);
3531
3532 r = admin_socket->register_command("flush_store_cache",
3533 "flush_store_cache",
3534 asok_hook,
3535 "Flush bluestore internal cache");
3536 ceph_assert(r == 0);
3537 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3538 asok_hook,
3539 "show recent state history");
3540 ceph_assert(r == 0);
3541
3542 r = admin_socket->register_command("compact", "compact",
3543 asok_hook,
3544 "Commpact object store's omap."
3545 " WARNING: Compaction probably slows your requests");
3546 ceph_assert(r == 0);
3547
3548 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3549 asok_hook,
3550 "dump pools whose PG(s) are mapped to this OSD.");
3551
3552 ceph_assert(r == 0);
3553
3554 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3555 asok_hook,
3556 "probe OSD devices for SMART data.");
3557
3558 ceph_assert(r == 0);
3559
3560 r = admin_socket->register_command("list_devices", "list_devices",
3561 asok_hook,
3562 "list OSD devices.");
3563 r = admin_socket->register_command("send_beacon", "send_beacon",
3564 asok_hook,
3565 "send OSD beacon to mon immediately");
3566
3567 r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3568 "Dump osd heartbeat network ping times");
3569 ceph_assert(r == 0);
3570
3571 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3572 // Note: pools are CephString instead of CephPoolname because
3573 // these commands traditionally support both pool names and numbers
3574 r = admin_socket->register_command(
3575 "setomapval",
3576 "setomapval " \
3577 "name=pool,type=CephString " \
3578 "name=objname,type=CephObjectname " \
3579 "name=key,type=CephString "\
3580 "name=val,type=CephString",
3581 test_ops_hook,
3582 "set omap key");
3583 ceph_assert(r == 0);
3584 r = admin_socket->register_command(
3585 "rmomapkey",
3586 "rmomapkey " \
3587 "name=pool,type=CephString " \
3588 "name=objname,type=CephObjectname " \
3589 "name=key,type=CephString",
3590 test_ops_hook,
3591 "remove omap key");
3592 ceph_assert(r == 0);
3593 r = admin_socket->register_command(
3594 "setomapheader",
3595 "setomapheader " \
3596 "name=pool,type=CephString " \
3597 "name=objname,type=CephObjectname " \
3598 "name=header,type=CephString",
3599 test_ops_hook,
3600 "set omap header");
3601 ceph_assert(r == 0);
3602
3603 r = admin_socket->register_command(
3604 "getomap",
3605 "getomap " \
3606 "name=pool,type=CephString " \
3607 "name=objname,type=CephObjectname",
3608 test_ops_hook,
3609 "output entire object map");
3610 ceph_assert(r == 0);
3611
3612 r = admin_socket->register_command(
3613 "truncobj",
3614 "truncobj " \
3615 "name=pool,type=CephString " \
3616 "name=objname,type=CephObjectname " \
3617 "name=len,type=CephInt",
3618 test_ops_hook,
3619 "truncate object to length");
3620 ceph_assert(r == 0);
3621
3622 r = admin_socket->register_command(
3623 "injectdataerr",
3624 "injectdataerr " \
3625 "name=pool,type=CephString " \
3626 "name=objname,type=CephObjectname " \
3627 "name=shardid,type=CephInt,req=false,range=0|255",
3628 test_ops_hook,
3629 "inject data error to an object");
3630 ceph_assert(r == 0);
3631
3632 r = admin_socket->register_command(
3633 "injectmdataerr",
3634 "injectmdataerr " \
3635 "name=pool,type=CephString " \
3636 "name=objname,type=CephObjectname " \
3637 "name=shardid,type=CephInt,req=false,range=0|255",
3638 test_ops_hook,
3639 "inject metadata error to an object");
3640 ceph_assert(r == 0);
3641 r = admin_socket->register_command(
3642 "set_recovery_delay",
3643 "set_recovery_delay " \
3644 "name=utime,type=CephInt,req=false",
3645 test_ops_hook,
3646 "Delay osd recovery by specified seconds");
3647 ceph_assert(r == 0);
3648 r = admin_socket->register_command(
3649 "trigger_scrub",
3650 "trigger_scrub " \
3651 "name=pgid,type=CephString " \
3652 "name=time,type=CephInt,req=false",
3653 test_ops_hook,
3654 "Trigger a scheduled scrub ");
3655 ceph_assert(r == 0);
3656 r = admin_socket->register_command(
3657 "trigger_deep_scrub",
3658 "trigger_deep_scrub " \
3659 "name=pgid,type=CephString " \
3660 "name=time,type=CephInt,req=false",
3661 test_ops_hook,
3662 "Trigger a scheduled deep scrub ");
3663 ceph_assert(r == 0);
3664 r = admin_socket->register_command(
3665 "injectfull",
3666 "injectfull " \
3667 "name=type,type=CephString,req=false " \
3668 "name=count,type=CephInt,req=false ",
3669 test_ops_hook,
3670 "Inject a full disk (optional count times)");
3671 ceph_assert(r == 0);
3672 }
3673
3674 void OSD::create_logger()
3675 {
3676 dout(10) << "create_logger" << dendl;
3677
3678 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3679
3680 // Latency axis configuration for op histograms, values are in nanoseconds
3681 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3682 "Latency (usec)",
3683 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3684 0, ///< Start at 0
3685 100000, ///< Quantization unit is 100usec
3686 32, ///< Enough to cover much longer than slow requests
3687 };
3688
3689 // Op size axis configuration for op histograms, values are in bytes
3690 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3691 "Request size (bytes)",
3692 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3693 0, ///< Start at 0
3694 512, ///< Quantization unit is 512 bytes
3695 32, ///< Enough to cover requests larger than GB
3696 };
3697
3698
3699 // All the basic OSD operation stats are to be considered useful
3700 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3701
3702 osd_plb.add_u64(
3703 l_osd_op_wip, "op_wip",
3704 "Replication operations currently being processed (primary)");
3705 osd_plb.add_u64_counter(
3706 l_osd_op, "op",
3707 "Client operations",
3708 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3709 osd_plb.add_u64_counter(
3710 l_osd_op_inb, "op_in_bytes",
3711 "Client operations total write size",
3712 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
3713 osd_plb.add_u64_counter(
3714 l_osd_op_outb, "op_out_bytes",
3715 "Client operations total read size",
3716 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
3717 osd_plb.add_time_avg(
3718 l_osd_op_lat, "op_latency",
3719 "Latency of client operations (including queue time)",
3720 "l", 9);
3721 osd_plb.add_time_avg(
3722 l_osd_op_process_lat, "op_process_latency",
3723 "Latency of client operations (excluding queue time)");
3724 osd_plb.add_time_avg(
3725 l_osd_op_prepare_lat, "op_prepare_latency",
3726 "Latency of client operations (excluding queue time and wait for finished)");
3727
3728 osd_plb.add_u64_counter(
3729 l_osd_op_r, "op_r", "Client read operations");
3730 osd_plb.add_u64_counter(
3731 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3732 osd_plb.add_time_avg(
3733 l_osd_op_r_lat, "op_r_latency",
3734 "Latency of read operation (including queue time)");
3735 osd_plb.add_u64_counter_histogram(
3736 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3737 op_hist_x_axis_config, op_hist_y_axis_config,
3738 "Histogram of operation latency (including queue time) + data read");
3739 osd_plb.add_time_avg(
3740 l_osd_op_r_process_lat, "op_r_process_latency",
3741 "Latency of read operation (excluding queue time)");
3742 osd_plb.add_time_avg(
3743 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3744 "Latency of read operations (excluding queue time and wait for finished)");
3745 osd_plb.add_u64_counter(
3746 l_osd_op_w, "op_w", "Client write operations");
3747 osd_plb.add_u64_counter(
3748 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3749 osd_plb.add_time_avg(
3750 l_osd_op_w_lat, "op_w_latency",
3751 "Latency of write operation (including queue time)");
3752 osd_plb.add_u64_counter_histogram(
3753 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3754 op_hist_x_axis_config, op_hist_y_axis_config,
3755 "Histogram of operation latency (including queue time) + data written");
3756 osd_plb.add_time_avg(
3757 l_osd_op_w_process_lat, "op_w_process_latency",
3758 "Latency of write operation (excluding queue time)");
3759 osd_plb.add_time_avg(
3760 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3761 "Latency of write operations (excluding queue time and wait for finished)");
3762 osd_plb.add_u64_counter(
3763 l_osd_op_rw, "op_rw",
3764 "Client read-modify-write operations");
3765 osd_plb.add_u64_counter(
3766 l_osd_op_rw_inb, "op_rw_in_bytes",
3767 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3768 osd_plb.add_u64_counter(
3769 l_osd_op_rw_outb,"op_rw_out_bytes",
3770 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3771 osd_plb.add_time_avg(
3772 l_osd_op_rw_lat, "op_rw_latency",
3773 "Latency of read-modify-write operation (including queue time)");
3774 osd_plb.add_u64_counter_histogram(
3775 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3776 op_hist_x_axis_config, op_hist_y_axis_config,
3777 "Histogram of rw operation latency (including queue time) + data written");
3778 osd_plb.add_u64_counter_histogram(
3779 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3780 op_hist_x_axis_config, op_hist_y_axis_config,
3781 "Histogram of rw operation latency (including queue time) + data read");
3782 osd_plb.add_time_avg(
3783 l_osd_op_rw_process_lat, "op_rw_process_latency",
3784 "Latency of read-modify-write operation (excluding queue time)");
3785 osd_plb.add_time_avg(
3786 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3787 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3788
3789 // Now we move on to some more obscure stats, revert to assuming things
3790 // are low priority unless otherwise specified.
3791 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3792
3793 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3794 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3795 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3796 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3797
3798 osd_plb.add_u64_counter(
3799 l_osd_sop, "subop", "Suboperations");
3800 osd_plb.add_u64_counter(
3801 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
3802 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3803
3804 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3805 osd_plb.add_u64_counter(
3806 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
3807 osd_plb.add_time_avg(
3808 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3809 osd_plb.add_u64_counter(
3810 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3811 osd_plb.add_time_avg(
3812 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3813 osd_plb.add_u64_counter(
3814 l_osd_sop_push, "subop_push", "Suboperations push messages");
3815 osd_plb.add_u64_counter(
3816 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
3817 osd_plb.add_time_avg(
3818 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3819
3820 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3821 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3822 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));
3823
3824 osd_plb.add_u64_counter(
3825 l_osd_rop, "recovery_ops",
3826 "Started recovery operations",
3827 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3828
3829 osd_plb.add_u64_counter(
3830 l_osd_rbytes, "recovery_bytes",
3831 "recovery bytes",
3832 "rbt", PerfCountersBuilder::PRIO_INTERESTING);
3833
3834 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3835 osd_plb.add_u64(
3836 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3837 osd_plb.add_u64(
3838 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3839 "Total number getting crc from crc_cache with adjusting");
3840 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3841 "Total number of crc cache misses");
3842
3843 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3844 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3845 osd_plb.add_u64(
3846 l_osd_pg_primary, "numpg_primary",
3847 "Placement groups for which this osd is primary");
3848 osd_plb.add_u64(
3849 l_osd_pg_replica, "numpg_replica",
3850 "Placement groups for which this osd is replica");
3851 osd_plb.add_u64(
3852 l_osd_pg_stray, "numpg_stray",
3853 "Placement groups ready to be deleted from this osd");
3854 osd_plb.add_u64(
3855 l_osd_pg_removing, "numpg_removing",
3856 "Placement groups queued for local deletion", "pgsr",
3857 PerfCountersBuilder::PRIO_USEFUL);
3858 osd_plb.add_u64(
3859 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3860 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3861 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3862 osd_plb.add_u64_counter(
3863 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3864 osd_plb.add_u64_counter(
3865 l_osd_waiting_for_map, "messages_delayed_for_map",
3866 "Operations waiting for OSD map");
3867
3868 osd_plb.add_u64_counter(
3869 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3870 osd_plb.add_u64_counter(
3871 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3872 osd_plb.add_u64_counter(
3873 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3874 "osdmap cache miss below cache lower bound");
3875 osd_plb.add_u64_avg(
3876 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3877 "osdmap cache miss, avg distance below cache lower bound");
3878 osd_plb.add_u64_counter(
3879 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3880 "OSDMap buffer cache hits");
3881 osd_plb.add_u64_counter(
3882 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3883 "OSDMap buffer cache misses");
3884
3885 osd_plb.add_u64(
3886 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3887 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3888 osd_plb.add_u64(
3889 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3890 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
3891 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));
3892
3893 osd_plb.add_u64_counter(
3894 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3895
3896 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3897 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3898 osd_plb.add_u64_counter(
3899 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3900 osd_plb.add_u64_counter(
3901 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3902 osd_plb.add_u64_counter(
3903 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3904 "Failed tier flush attempts");
3905 osd_plb.add_u64_counter(
3906 l_osd_tier_evict, "tier_evict", "Tier evictions");
3907 osd_plb.add_u64_counter(
3908 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3909 osd_plb.add_u64_counter(
3910 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3911 osd_plb.add_u64_counter(
3912 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3913 osd_plb.add_u64_counter(
3914 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3915 osd_plb.add_u64_counter(
3916 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3917 osd_plb.add_u64_counter(
3918 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3919
3920 osd_plb.add_u64_counter(
3921 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3922 osd_plb.add_u64_counter(
3923 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3924 osd_plb.add_u64_counter(
3925 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3926 osd_plb.add_u64_counter(
3927 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3928
3929 osd_plb.add_u64_counter(
3930 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3931 osd_plb.add_u64_counter(
3932 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3933
3934 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3935 osd_plb.add_time_avg(
3936 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3937 osd_plb.add_time_avg(
3938 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3939 osd_plb.add_time_avg(
3940 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3941
3942 osd_plb.add_u64_counter(
3943 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3944 osd_plb.add_u64_counter(
3945 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3946 "PG updated its info using fastinfo attr");
3947 osd_plb.add_u64_counter(
3948 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3949
3950 logger = osd_plb.create_perf_counters();
3951 cct->get_perfcounters_collection()->add(logger);
3952 }
3953
// Build and register the "recoverystate_perf" PerfCounters instance: one
// time-averaged latency counter per state of the PG recovery/peering state
// machine, spanning the [rs_first, rs_last) counter-id range.  The result
// is stored in OSD::recoverystate_perf and added to the process-wide
// perf-counters collection.
3954 void OSD::create_recoverystate_perf()
3955 {
3956 dout(10) << "create_recoverystate_perf" << dendl;
3957
3958 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3959
// One add_time_avg() per recovery-state-machine state; each tracks the
// average time a PG spends in that state.
3960 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3961 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3962 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3963 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3964 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3965 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3966 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3967 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3968 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3969 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3970 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3971 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3972 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3973 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3974 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3975 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3976 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3977 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3978 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3979 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3980 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3981 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3982 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3983 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3984 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3985 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3986 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3987 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3988 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3989 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3990 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3991
3992 recoverystate_perf = rs_perf.create_perf_counters();
3993 cct->get_perfcounters_collection()->add(recoverystate_perf);
3994 }
3995
// Cleanly stop the OSD: drain the op queues, shut down every PG, stop the
// heartbeat/op/command threads and timers, record a clean unmount epoch in
// the superblock, unmount the store and shut down all messengers.
// Returns 0 on success (or if a shutdown is already in progress), otherwise
// the error from queueing the superblock write transaction.
// NOTE: the teardown steps below are order-sensitive (queues are drained
// before PGs are torn down, threads are stopped before their state is
// freed); do not reorder casually.
3996 int OSD::shutdown()
3997 {
// Fast path: skip the orderly teardown entirely and exit the process.
3998 if (cct->_conf->osd_fast_shutdown) {
3999 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4000 cct->_log->flush();
4001 _exit(0);
4002 }
4003
4004 if (!service.prepare_to_stop())
4005 return 0; // already shutting down
4006 osd_lock.Lock();
4007 if (is_stopping()) {
4008 osd_lock.Unlock();
4009 return 0;
4010 }
4011 dout(0) << "shutdown" << dendl;
4012
4013 set_state(STATE_STOPPING);
4014
// Optionally crank up debug logging so the remainder of the shutdown is
// fully traced.
4015 // Debugging
4016 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4017 cct->_conf.set_val("debug_osd", "100");
4018 cct->_conf.set_val("debug_journal", "100");
4019 cct->_conf.set_val("debug_filestore", "100");
4020 cct->_conf.set_val("debug_bluestore", "100");
4021 cct->_conf.set_val("debug_ms", "100");
4022 cct->_conf.apply_changes(nullptr);
4023 }
4024
4025 // stop MgrClient earlier as it's more like an internal consumer of OSD
4026 mgrc.shutdown();
4027
4028 service.start_shutdown();
4029
4030 // stop sending work to pgs. this just prevents any new work in _process
4031 // from racing with on_shutdown and potentially entering the pg after.
4032 op_shardedwq.drain();
4033
4034 // Shutdown PGs
4035 {
4036 vector<PGRef> pgs;
4037 _get_pgs(&pgs);
4038 for (auto pg : pgs) {
4039 pg->shutdown();
4040 }
4041 }
4042
4043 // drain op queue again (in case PGs requeued something)
4044 op_shardedwq.drain();
4045 {
4046 finished.clear(); // zap waiters (bleh, this is messy)
4047 waiting_for_osdmap.clear();
4048 }
4049
4050 // unregister commands
4051 cct->get_admin_socket()->unregister_commands(asok_hook);
4052 delete asok_hook;
4053 asok_hook = NULL;
4054
4055 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4056 delete test_ops_hook;
4057 test_ops_hook = NULL;
4058
// Drop osd_lock while stopping the heartbeat thread and thread pools;
// they may need the lock to finish in-flight work.
4059 osd_lock.Unlock();
4060
4061 heartbeat_lock.Lock();
4062 heartbeat_stop = true;
4063 heartbeat_cond.Signal();
4064 heartbeat_lock.Unlock();
4065 heartbeat_thread.join();
4066
4067 osd_op_tp.drain();
4068 osd_op_tp.stop();
4069 dout(10) << "op sharded tp stopped" << dendl;
4070
4071 command_tp.drain();
4072 command_tp.stop();
4073 dout(10) << "command tp stopped" << dendl;
4074
4075 dout(10) << "stopping agent" << dendl;
4076 service.agent_stop();
4077
4078 boot_finisher.wait_for_empty();
4079
4080 osd_lock.Lock();
4081
4082 boot_finisher.stop();
4083 reset_heartbeat_peers(true);
4084
4085 tick_timer.shutdown();
4086
4087 {
4088 std::lock_guard l(tick_timer_lock);
4089 tick_timer_without_osd_lock.shutdown();
4090 }
4091
// Persist the clean-unmount marker; `r` below is also this function's
// return value.
4092 // note unmount epoch
4093 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
4094 superblock.mounted = service.get_boot_epoch();
4095 superblock.clean_thru = osdmap->get_epoch();
4096 ObjectStore::Transaction t;
4097 write_superblock(t);
4098 int r = store->queue_transaction(service.meta_ch, std::move(t));
4099 if (r) {
4100 derr << "OSD::shutdown: error writing superblock: "
4101 << cpp_strerror(r) << dendl;
4102 }
4103
4104
4105 service.shutdown_reserver();
4106
// Detach every PG from its shard slot and drop our collection handles,
// looping until no PGs remain.  A PG still referenced elsewhere is a
// leak; optionally abort on it for debugging.
4107 // Remove PGs
4108 #ifdef PG_DEBUG_REFS
4109 service.dump_live_pgids();
4110 #endif
4111 while (true) {
4112 vector<PGRef> pgs;
4113 _get_pgs(&pgs, true);
4114 if (pgs.empty()) {
4115 break;
4116 }
4117 for (auto& pg : pgs) {
4118 if (pg->is_deleted()) {
4119 continue;
4120 }
4121 dout(20) << " kicking pg " << pg << dendl;
4122 pg->lock();
4123 if (pg->get_num_ref() != 1) {
4124 derr << "pgid " << pg->get_pgid() << " has ref count of "
4125 << pg->get_num_ref() << dendl;
4126 #ifdef PG_DEBUG_REFS
4127 pg->dump_live_ids();
4128 #endif
4129 if (cct->_conf->osd_shutdown_pgref_assert) {
4130 ceph_abort();
4131 }
4132 }
4133 pg->ch.reset();
4134 pg->unlock();
4135 }
4136 }
4137 #ifdef PG_DEBUG_REFS
4138 service.dump_live_pgids();
4139 #endif
4140
// remove_observer must be called without osd_lock held.
4141 osd_lock.Unlock();
4142 cct->_conf.remove_observer(this);
4143 osd_lock.Lock();
4144
4145 service.meta_ch.reset();
4146
4147 dout(10) << "syncing store" << dendl;
4148 enable_disable_fuse(true);
4149
4150 if (cct->_conf->osd_journal_flush_on_shutdown) {
4151 dout(10) << "flushing journal" << dendl;
4152 store->flush_journal();
4153 }
4154
4155 monc->shutdown();
4156 osd_lock.Unlock();
4157
// Drop our OSDMap references (global and per-shard) before tearing down
// the service and the store.
4158 map_lock.get_write();
4159 osdmap = OSDMapRef();
4160 map_lock.put_write();
4161
4162 for (auto s : shards) {
4163 std::lock_guard l(s->osdmap_lock);
4164 s->shard_osdmap = OSDMapRef();
4165 }
4166 service.shutdown();
4167
// Re-acquire osd_lock (held via guard until return) while unmounting and
// deleting the object store.
4168 std::lock_guard lock(osd_lock);
4169 store->umount();
4170 delete store;
4171 store = nullptr;
4172 dout(10) << "Store synced" << dendl;
4173
4174 op_tracker.on_shutdown();
4175
4176 class_handler->shutdown();
4177 client_messenger->shutdown();
4178 cluster_messenger->shutdown();
4179 hb_front_client_messenger->shutdown();
4180 hb_back_client_messenger->shutdown();
4181 objecter_messenger->shutdown();
4182 hb_front_server_messenger->shutdown();
4183 hb_back_server_messenger->shutdown();
4184
4185 return r;
4186 }
4187
// Synchronously send a monitor command.  If the command fails with -ENOENT
// (this OSD id is not yet known to the cluster), issue a one-time
// "osd create" for our id/fsid and retry the original command.
// Returns 0 on success, or a negative errno from the mon.
4188 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4189 {
4190 bool created = false;
4191 while (true) {
4192 dout(10) << __func__ << " cmd: " << cmd << dendl;
4193 vector<string> vcmd{cmd};
4194 bufferlist inbl;
4195 C_SaferCond w;
4196 string outs;
4197 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
// Block until the mon replies.
4198 int r = w.wait();
4199 if (r < 0) {
// Only attempt the implicit "osd create" once; -ENOENT a second time
// falls through to the generic failure path below.
4200 if (r == -ENOENT && !created) {
4201 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4202 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4203 vector<string> vnewcmd{newcmd};
4204 bufferlist inbl;
4205 C_SaferCond w;
4206 string outs;
4207 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
// NOTE: this inner `r` intentionally shadows the outer one; it holds
// the result of the "osd create" command only.
4208 int r = w.wait();
4209 if (r < 0) {
4210 derr << __func__ << " fail: osd does not exist and created failed: "
4211 << cpp_strerror(r) << dendl;
4212 return r;
4213 }
4214 created = true;
// Retry the original command now that the osd id exists.
4215 continue;
4216 }
4217 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4218 return r;
4219 }
4220 break;
4221 }
4222
4223 return 0;
4224 }
4225
// Register/refresh this OSD's position and weight in the CRUSH map via an
// "osd crush create-or-move" mon command.  The weight comes from
// osd_crush_initial_weight when set (>= 0), otherwise it is derived from
// the store's total capacity in TiB (floored at 0.00001 so a tiny device
// never gets weight 0).  No-op when osd_crush_update_on_start is false.
// Returns 0 on success or a negative errno.
4226 int OSD::update_crush_location()
4227 {
4228 if (!cct->_conf->osd_crush_update_on_start) {
4229 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4230 return 0;
4231 }
4232
// Format the weight as a fixed 4-decimal string for the JSON command.
4233 char weight[32];
4234 if (cct->_conf->osd_crush_initial_weight >= 0) {
4235 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4236 } else {
4237 struct store_statfs_t st;
4238 osd_alert_list_t alerts;
4239 int r = store->statfs(&st, &alerts);
4240 if (r < 0) {
4241 derr << "statfs: " << cpp_strerror(r) << dendl;
4242 return r;
4243 }
4244 snprintf(weight, sizeof(weight), "%.4lf",
4245 std::max(.00001,
4246 double(st.total) /
4247 double(1ull << 40 /* TB */)));
4248 }
4249
// The configured crush location (e.g. host=..., rack=...) becomes the
// "args" array of the command.
4250 std::multimap<string,string> loc = cct->crush_location.get_location();
4251 dout(10) << __func__ << " crush location is " << loc << dendl;
4252
4253 string cmd =
4254 string("{\"prefix\": \"osd crush create-or-move\", ") +
4255 string("\"id\": ") + stringify(whoami) + string(", ") +
4256 string("\"weight\":") + weight + string(", ") +
4257 string("\"args\": [");
4258 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4259 if (p != loc.begin())
4260 cmd += ", ";
4261 cmd += "\"" + p->first + "=" + p->second + "\"";
4262 }
4263 cmd += "]}";
4264
4265 return mon_cmd_maybe_osd_create(cmd);
4266 }
4267
// Publish this OSD's device class (hdd/ssd/...) to the CRUSH map via an
// "osd crush set-device-class" mon command.  The class is read from the
// locally stored "crush_device_class" metadata, falling back to the
// store's default class.  No-op if osd_class_update_on_start is false or
// no class is known.  -EBUSY from the mon means the OSD is already bound
// to a class and is treated as success.
4268 int OSD::update_crush_device_class()
4269 {
4270 if (!cct->_conf->osd_class_update_on_start) {
4271 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4272 return 0;
4273 }
4274
4275 string device_class;
4276 int r = store->read_meta("crush_device_class", &device_class);
4277 if (r < 0 || device_class.empty()) {
4278 device_class = store->get_default_device_class();
4279 }
4280
4281 if (device_class.empty()) {
4282 dout(20) << __func__ << " no device class stored locally" << dendl;
4283 return 0;
4284 }
4285
4286 string cmd =
4287 string("{\"prefix\": \"osd crush set-device-class\", ") +
4288 string("\"class\": \"") + device_class + string("\", ") +
4289 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4290
4291 r = mon_cmd_maybe_osd_create(cmd);
4292 if (r == -EBUSY) {
4293 // good, already bound to a device-class
4294 return 0;
4295 } else {
4296 return r;
4297 }
4298 }
4299
// Append a write of the encoded OSD superblock (to the meta collection's
// superblock object) onto transaction `t`.  The caller is responsible for
// queueing the transaction.  Always stamps the baseline incompat feature
// first so the superblock is never written without it.
4300 void OSD::write_superblock(ObjectStore::Transaction& t)
4301 {
4302 dout(10) << "write_superblock " << superblock << dendl;
4303
4304 //hack: at minimum it's using the baseline feature set
4305 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4306 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4307
4308 bufferlist bl;
4309 encode(superblock, bl);
4310 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4311 }
4312
// Read and decode the on-disk superblock object into OSD::superblock.
// Returns 0 on success or the negative errno from the store read.
// (A decode failure on corrupt data would throw; it is not mapped to an
// error return here.)
4313 int OSD::read_superblock()
4314 {
4315 bufferlist bl;
// length 0 == read the whole object.
4316 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4317 if (r < 0)
4318 return r;
4319
4320 auto p = bl.cbegin();
4321 decode(superblock, p);
4322
4323 dout(10) << "read_superblock " << superblock << dendl;
4324
4325 return 0;
4326 }
4327
// Scan every PG collection on the store and remove leftover temporary
// objects (plus pool==-1 objects, which pre-Jewel "Hammer" used for
// temps).  Relies on temp objects sorting before regular objects in
// collection_list() output: the scan stops at the first non-temp object.
// Deletions are batched into transactions of at most
// osd_target_transaction_size removes each.
4328 void OSD::clear_temp_objects()
4329 {
4330 dout(10) << __func__ << dendl;
4331 vector<coll_t> ls;
4332 store->list_collections(ls);
4333 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4334 spg_t pgid;
4335 if (!p->is_pg(&pgid))
4336 continue;
4337
4338 // list temp objects
4339 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4340
4341 vector<ghobject_t> temps;
4342 ghobject_t next;
4343 while (1) {
4344 vector<ghobject_t> objects;
// NOTE(review): the collection handle is re-opened on every listing
// pass; presumably cheap, but it could be hoisted above the loop.
4345 auto ch = store->open_collection(*p);
4346 ceph_assert(ch);
4347 store->collection_list(ch, next, ghobject_t::get_max(),
4348 store->get_ideal_list_max(),
4349 &objects, &next);
4350 if (objects.empty())
4351 break;
4352 vector<ghobject_t>::iterator q;
4353 for (q = objects.begin(); q != objects.end(); ++q) {
4354 // Hammer set pool for temps to -1, so check for clean-up
4355 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4356 temps.push_back(*q);
4357 } else {
4358 break;
4359 }
4360 }
4361 // If we saw a non-temp object and hit the break above we can
4362 // break out of the while loop too.
4363 if (q != objects.end())
4364 break;
4365 }
// Queue the batched removals, flushing a transaction every
// osd_target_transaction_size deletes.
4366 if (!temps.empty()) {
4367 ObjectStore::Transaction t;
4368 int removed = 0;
4369 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4370 dout(20) << " removing " << *p << " object " << *q << dendl;
4371 t.remove(*p, *q);
4372 if (++removed > cct->_conf->osd_target_transaction_size) {
4373 store->queue_transaction(service.meta_ch, std::move(t));
4374 t = ObjectStore::Transaction();
4375 removed = 0;
4376 }
4377 }
4378 if (removed) {
4379 store->queue_transaction(service.meta_ch, std::move(t));
4380 }
4381 }
4382 }
4383 }
4384
// Delete every object in collection `tmp` (a defunct/legacy PG collection
// for `pgid`), removing each object's snap-mapper entry as well, then
// remove the collection itself.  Work is chunked into transactions of
// osd_target_transaction_size objects, and the function blocks until the
// final transaction has committed.
4385 void OSD::recursive_remove_collection(CephContext* cct,
4386 ObjectStore *store, spg_t pgid,
4387 coll_t tmp)
4388 {
// The snap mapper's records live in the meta collection (coll_t()),
// keyed by the snapmapper oid.
4389 OSDriver driver(
4390 store,
4391 coll_t(),
4392 make_snapmapper_oid());
4393
4394 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4395 ObjectStore::Transaction t;
4396 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4397
4398 ghobject_t next;
4399 int max = cct->_conf->osd_target_transaction_size;
4400 vector<ghobject_t> objects;
4401 objects.reserve(max);
4402 while (true) {
4403 objects.clear();
// List the next batch of up to `max` objects, resuming at `next`.
4404 store->collection_list(ch, next, ghobject_t::get_max(),
4405 max, &objects, &next);
4406 generic_dout(10) << __func__ << " " << objects << dendl;
4407 if (objects.empty())
4408 break;
4409 for (auto& p: objects) {
4410 OSDriver::OSTransaction _t(driver.get_transaction(&t));
// -ENOENT is fine (object had no snap-mapper entry); anything else
// is corruption.
4411 int r = mapper.remove_oid(p.hobj, &_t);
4412 if (r != 0 && r != -ENOENT)
4413 ceph_abort();
4414 t.remove(tmp, p);
4415 }
4416 int r = store->queue_transaction(ch, std::move(t));
4417 ceph_assert(r == 0);
4418 t = ObjectStore::Transaction();
4419 }
4420 t.remove_collection(tmp);
4421 int r = store->queue_transaction(ch, std::move(t));
4422 ceph_assert(r == 0);
4423
// Wait for the removal to commit before returning.
4424 C_SaferCond waiter;
4425 if (!ch->flush_commit(&waiter)) {
4426 waiter.wait();
4427 }
4428 }
4429
4430
4431 // ======================================================
4432 // PG's
4433
// Construct a PG object for `pgid`.  Pool info comes from `createmap` if
// the pool still exists there; otherwise it is decoded from the on-disk
// "final pool info" tombstone written when the pool was deleted.  Returns
// nullptr if the tombstone is missing or predates the ec_profile field
// (dev release v13.0.2); aborts on an unknown pool type.
4434 PG* OSD::_make_pg(
4435 OSDMapRef createmap,
4436 spg_t pgid)
4437 {
4438 dout(10) << __func__ << " " << pgid << dendl;
4439 pg_pool_t pi;
4440 map<string,string> ec_profile;
4441 string name;
4442 if (createmap->have_pg_pool(pgid.pool())) {
4443 pi = *createmap->get_pg_pool(pgid.pool());
4444 name = createmap->get_pool_name(pgid.pool());
4445 if (pi.is_erasure()) {
4446 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4447 }
4448 } else {
4449 // pool was deleted; grab final pg_pool_t off disk.
4450 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4451 bufferlist bl;
4452 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4453 if (r < 0) {
4454 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4455 << dendl;
4456 return nullptr;
4457 }
4458 ceph_assert(r >= 0);
// Decode order must match the tombstone's encode order:
// pg_pool_t, pool name, then (if present) the ec profile.
4459 auto p = bl.cbegin();
4460 decode(pi, p);
4461 decode(name, p);
4462 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4463 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4464 << " tombstone" << dendl;
4465 return nullptr;
4466 }
4467 decode(ec_profile, p);
4468 }
4469 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4470 PG *pg;
// Both replicated and EC pools are backed by PrimaryLogPG.
4471 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4472 pi.type == pg_pool_t::TYPE_ERASURE)
4473 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4474 else
4475 ceph_abort();
4476 return pg;
4477 }
4478
// Collect references to all live (non-deleted) PGs across every shard
// into *v, replacing its previous contents.  With clear_too=true each
// collected PG is also detached from its shard slot (used during
// shutdown).  Each shard's slot map is walked under that shard's lock.
4479 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4480 {
4481 v->clear();
4482 v->reserve(get_num_pgs());
4483 for (auto& s : shards) {
4484 std::lock_guard l(s->shard_lock);
4485 for (auto& j : s->pg_slots) {
4486 if (j.second->pg &&
4487 !j.second->pg->is_deleted()) {
4488 v->push_back(j.second->pg);
4489 if (clear_too) {
4490 s->_detach_pg(j.second.get());
4491 }
4492 }
4493 }
4494 }
4495 }
4496
4497 void OSD::_get_pgids(vector<spg_t> *v)
4498 {
4499 v->clear();
4500 v->reserve(get_num_pgs());
4501 for (auto& s : shards) {
4502 std::lock_guard l(s->shard_lock);
4503 for (auto& j : s->pg_slots) {
4504 if (j.second->pg &&
4505 !j.second->pg->is_deleted()) {
4506 v->push_back(j.first);
4507 }
4508 }
4509 }
4510 }
4511
// Create a new slot for `pg` in its owning shard (chosen by hashing the
// pgid over num_shards) and attach the PG to it.  Asserts that no slot
// already exists for this pgid.
4512 void OSD::register_pg(PGRef pg)
4513 {
4514 spg_t pgid = pg->get_pgid();
4515 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4516 auto sdata = shards[shard_index];
4517 std::lock_guard l(sdata->shard_lock);
// emplace must succeed: registering an already-registered pgid is a bug.
4518 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4519 ceph_assert(r.second);
4520 auto *slot = r.first->second.get();
4521 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4522 sdata->_attach_pg(slot, pg.get());
4523 }
4524
// Final step of PG deletion: detach `pg` from its shard slot, un-prime any
// split children that were pre-created from it (based on old_pg_num), and
// decrement the primary/replica/stray PG perf counter.  Returns false if
// the slot is already gone/empty or the PG is still waiting on a merge
// epoch (deletion must not race with a merge).
4525 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4526 {
4527 auto sdata = pg->osd_shard;
4528 ceph_assert(sdata);
4529 {
4530 std::lock_guard l(sdata->shard_lock);
4531 auto p = sdata->pg_slots.find(pg->pg_id);
4532 if (p == sdata->pg_slots.end() ||
4533 !p->second->pg) {
4534 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4535 return false;
4536 }
4537 if (p->second->waiting_for_merge_epoch) {
4538 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4539 return false;
4540 }
4541 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4542 sdata->_detach_pg(p->second.get());
4543 }
4544
// Children of this pg (under the old pg_num) may have been primed on any
// shard; clear them everywhere.
4545 for (auto shard : shards) {
4546 shard->unprime_split_children(pg->pg_id, old_pg_num);
4547 }
4548
4549 // update pg count now since we might not get an osdmap any time soon.
4550 if (pg->is_primary())
4551 service.logger->dec(l_osd_pg_primary);
4552 else if (pg->is_replica())
4553 service.logger->dec(l_osd_pg_replica);
4554 else
4555 service.logger->dec(l_osd_pg_stray);
4556
4557 return true;
4558 }
4559
4560 PGRef OSD::_lookup_pg(spg_t pgid)
4561 {
4562 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4563 auto sdata = shards[shard_index];
4564 std::lock_guard l(sdata->shard_lock);
4565 auto p = sdata->pg_slots.find(pgid);
4566 if (p == sdata->pg_slots.end()) {
4567 return nullptr;
4568 }
4569 return p->second->pg;
4570 }
4571
4572 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4573 {
4574 PGRef pg = _lookup_pg(pgid);
4575 if (!pg) {
4576 return nullptr;
4577 }
4578 pg->lock();
4579 if (!pg->is_deleted()) {
4580 return pg;
4581 }
4582 pg->unlock();
4583 return nullptr;
4584 }
4585
// Public wrapper around _lookup_lock_pg(): returns the PG locked, or
// nullptr if it does not exist (or is deleted).  Caller must unlock.
4586 PGRef OSD::lookup_lock_pg(spg_t pgid)
4587 {
4588 return _lookup_lock_pg(pgid);
4589 }
4590
// Called at startup (with osd_lock held) to load every PG from the object
// store: reads the persisted pg_num history, walks all collections,
// removes temp/flagged-for-removal collections, instantiates each PG from
// the osdmap of its stored epoch, reads its state, and registers it with
// its shard.
4591 void OSD::load_pgs()
4592 {
4593 ceph_assert(osd_lock.is_locked());
4594 dout(0) << "load_pgs" << dendl;
4595
// Load the persisted pg_num change history (used for split/merge
// tracking); absence just leaves pg_num_history empty.
4596 {
4597 auto pghist = make_pg_num_history_oid();
4598 bufferlist bl;
4599 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4600 if (r >= 0 && bl.length() > 0) {
4601 auto p = bl.cbegin();
4602 decode(pg_num_history, p);
4603 }
4604 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4605 }
4606
4607 vector<coll_t> ls;
4608 int r = store->list_collections(ls);
4609 if (r < 0) {
4610 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4611 }
4612
4613 int num = 0;
4614 for (vector<coll_t>::iterator it = ls.begin();
4615 it != ls.end();
4616 ++it) {
4617 spg_t pgid;
// Temp collections and PGs flagged for removal are garbage from a
// previous run; delete them outright.
4618 if (it->is_temp(&pgid) ||
4619 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
4620 dout(10) << "load_pgs " << *it
4621 << " removing, legacy or flagged for removal pg" << dendl;
4622 recursive_remove_collection(cct, store, pgid, *it);
4623 continue;
4624 }
4625
4626 if (!it->is_pg(&pgid)) {
4627 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4628 continue;
4629 }
4630
4631 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4632 epoch_t map_epoch = 0;
4633 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
4634 if (r < 0) {
4635 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4636 << dendl;
4637 continue;
4638 }
4639
// Build the PG against the osdmap it was last persisted under (or the
// current map if no epoch was recorded).
4640 PGRef pg;
4641 if (map_epoch > 0) {
4642 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4643 if (!pgosdmap) {
4644 if (!osdmap->have_pg_pool(pgid.pool())) {
4645 derr << __func__ << ": could not find map for epoch " << map_epoch
4646 << " on pg " << pgid << ", but the pool is not present in the "
4647 << "current map, so this is probably a result of bug 10617. "
4648 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4649 << "to clean it up later." << dendl;
4650 continue;
4651 } else {
4652 derr << __func__ << ": have pgid " << pgid << " at epoch "
4653 << map_epoch << ", but missing map. Crashing."
4654 << dendl;
4655 ceph_abort_msg("Missing map in load_pgs");
4656 }
4657 }
4658 pg = _make_pg(pgosdmap, pgid);
4659 } else {
4660 pg = _make_pg(osdmap, pgid);
4661 }
// _make_pg returns nullptr when the pool tombstone is missing/partial;
// the collection is unusable, so remove it.
4662 if (!pg) {
4663 recursive_remove_collection(cct, store, pgid, *it);
4664 continue;
4665 }
4666
4667 // there can be no waiters here, so we don't call _wake_pg_slot
4668
4669 pg->lock();
4670 pg->ch = store->open_collection(pg->coll);
4671
4672 // read pg state, log
4673 pg->read_state(store);
4674
// A PG whose on-disk state says it no longer exists is removed.
4675 if (pg->dne()) {
4676 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4677 pg->ch = nullptr;
4678 pg->unlock();
4679 recursive_remove_collection(cct, store, pgid, *it);
4680 continue;
4681 }
// Route this collection's commit completions to the owning shard's
// context queue.
4682 {
4683 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4684 assert(NULL != shards[shard_index]);
4685 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4686 }
4687
4688 pg->reg_next_scrub();
4689
4690 dout(10) << __func__ << " loaded " << *pg << dendl;
4691 pg->unlock();
4692
4693 register_pg(pg);
4694 ++num;
4695 }
4696 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
4697 }
4698
4699
// Instantiate a brand-new PG described by `info`: create its collection,
// initialize its metadata/history, and run it through the initial peering
// events.  Returns the new PG, or nullptr when creation is withheld (max
// PG limit hit, pool gone, or a stale mon create message).
4700 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4701 const PGCreateInfo *info)
4702 {
4703 spg_t pgid = info->pgid;
4704
4705 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4706 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4707 return nullptr;
4708 }
4709
4710 PG::RecoveryCtx rctx = create_context();
4711
// The map at the creation epoch, which may be older than `osdmap`.
4712 OSDMapRef startmap = get_map(info->epoch);
4713
4714 if (info->by_mon) {
4715 int64_t pool_id = pgid.pgid.pool();
4716 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4717 if (!pool) {
4718 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4719 return nullptr;
4720 }
4721 if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
4722 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4723 // this ensures we do not process old creating messages after the
4724 // pool's initial pgs have been created (and pg are subsequently
4725 // allowed to split or merge).
4726 dout(20) << __func__ << " dropping " << pgid
4727 << "create, pool does not have CREATING flag set" << dendl;
4728 return nullptr;
4729 }
4730 }
4731
4732 int up_primary, acting_primary;
4733 vector<int> up, acting;
4734 startmap->pg_to_up_acting_osds(
4735 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4736
4737 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
// EC-overwrite pools on non-bluestore backends cannot detect bitrot via
// deep scrub; warn loudly but proceed.
4738 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4739 store->get_type() != "bluestore") {
4740 clog->warn() << "pg " << pgid
4741 << " is at risk of silent data corruption: "
4742 << "the pool allows ec overwrites but is not stored in "
4743 << "bluestore, so deep scrubbing will not detect bitrot";
4744 }
4745 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4746 PG::_init(*rctx.transaction, pgid, pp);
4747
// For EC pools the role must match the shard id; otherwise we are not a
// member of this PG's acting set.
4748 int role = startmap->calc_pg_role(whoami, acting, acting.size());
4749 if (!pp->is_replicated() && role != pgid.shard) {
4750 role = -1;
4751 }
4752
4753 PGRef pg = _make_pg(startmap, pgid);
4754 pg->ch = store->create_new_collection(pg->coll);
4755
// Route this collection's commit completions to the owning shard's
// context queue.
4756 {
4757 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4758 assert(NULL != shards[shard_index]);
4759 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4760 }
4761
4762 pg->lock(true);
4763
4764 // we are holding the shard lock
4765 ceph_assert(!pg->is_deleted());
4766
4767 pg->init(
4768 role,
4769 up,
4770 up_primary,
4771 acting,
4772 acting_primary,
4773 info->history,
4774 info->past_intervals,
4775 false,
4776 rctx.transaction);
4777
4778 pg->init_collection_pool_opts();
4779
// New primaries inherit any currently-active dynamic perf-stat queries.
4780 if (pg->is_primary()) {
4781 Mutex::Locker locker(m_perf_queries_lock);
4782 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4783 }
4784
4785 pg->handle_initialize(&rctx);
4786 pg->handle_activate_map(&rctx);
4787
4788 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4789
4790 dout(10) << __func__ << " new pg " << *pg << dendl;
4791 return pg;
4792 }
4793
// Enforce the hard per-OSD PG limit (mon_max_pg_per_osd *
// osd_max_pg_per_osd_hard_ratio).  Returns false when the new PG may be
// created now; returns true when creation must be deferred, in which case
// the request is parked on the pending-creates queues (a counter for
// mon-initiated creates, a pgid set for peer-initiated ones) to be
// retried by resume_creating_pg().
4794 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4795 spg_t pgid,
4796 bool is_mon_create)
4797 {
4798 const auto max_pgs_per_osd =
4799 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4800 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4801
4802 if (num_pgs < max_pgs_per_osd) {
4803 return false;
4804 }
4805
4806 std::lock_guard l(pending_creates_lock);
4807 if (is_mon_create) {
4808 pending_creates_from_mon++;
4809 } else {
// Remember whether we would have been the primary so pg_temp can be
// twiddled appropriately later.
4810 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4811 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4812 }
4813 dout(1) << __func__ << " withhold creation of pg " << pgid
4814 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4815 return true;
4816 }
4817
4818 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4819 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4820 // to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  // Produce a pg_temp mapping that is guaranteed to differ from `acting`
  // so that peering restarts: a multi-OSD set collapses to just its first
  // member, while an empty or single-OSD set gets a trailing -1 appended
  // (an empty pg_temp would be ignored — see the comment above).
  if (acting.size() <= 1) {
    std::vector<int32_t> tweaked(acting.begin(), acting.end());
    tweaked.push_back(-1);
    return tweaked;
  }
  return {acting[0]};
}
4830
// Retry PG creations previously withheld by maybe_wait_for_max_pg().
// Computes how many PG slots are spare under the hard limit, consumes the
// pending mon-create count first, then re-triggers peering for pending
// peer-creates by queueing a twiddled pg_temp.  Finally refreshes the
// mon subscriptions (pg_creates / osdmap) needed to make progress.
4831 void OSD::resume_creating_pg()
4832 {
4833 bool do_sub_pg_creates = false;
4834 bool have_pending_creates = false;
4835 {
4836 const auto max_pgs_per_osd =
4837 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4838 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4839 if (max_pgs_per_osd <= num_pgs) {
4840 // this could happen if admin decreases this setting before a PG is removed
4841 return;
4842 }
4843 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4844 std::lock_guard l(pending_creates_lock);
// Mon-initiated creates are consumed first, up to the spare capacity.
4845 if (pending_creates_from_mon > 0) {
4846 dout(20) << __func__ << " pending_creates_from_mon "
4847 << pending_creates_from_mon << dendl;
4848 do_sub_pg_creates = true;
4849 if (pending_creates_from_mon >= spare_pgs) {
4850 spare_pgs = pending_creates_from_mon = 0;
4851 } else {
4852 spare_pgs -= pending_creates_from_mon;
4853 pending_creates_from_mon = 0;
4854 }
4855 }
// For peer-initiated creates, force a re-peering by publishing a
// twiddled pg_temp for each pgid we can now afford.
4856 auto pg = pending_creates_from_osd.cbegin();
4857 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
4858 dout(20) << __func__ << " pg " << pg->first << dendl;
4859 vector<int> acting;
4860 osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
4861 service.queue_want_pg_temp(pg->first, twiddle(acting), true);
4862 pg = pending_creates_from_osd.erase(pg);
4863 do_sub_pg_creates = true;
4864 spare_pgs--;
4865 }
4866 have_pending_creates = (pending_creates_from_mon > 0 ||
4867 !pending_creates_from_osd.empty());
4868 }
4869
4870 bool do_renew_subs = false;
4871 if (do_sub_pg_creates) {
4872 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4873 dout(4) << __func__ << ": resolicit pg creates from mon since "
4874 << last_pg_create_epoch << dendl;
4875 do_renew_subs = true;
4876 }
4877 }
4878 version_t start = osdmap->get_epoch() + 1;
4879 if (have_pending_creates) {
4880 // don't miss any new osdmap deleting PGs
4881 if (monc->sub_want("osdmap", start, 0)) {
4882 dout(4) << __func__ << ": resolicit osdmap from mon since "
4883 << start << dendl;
4884 do_renew_subs = true;
4885 }
4886 } else if (do_sub_pg_creates) {
4887 // no need to subscribe the osdmap continuously anymore
4888 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4889 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
4890 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
4891 << start << dendl;
4892 do_renew_subs = true;
4893 }
4894 }
4895
4896 if (do_renew_subs) {
4897 monc->renew_subs();
4898 }
4899
// Flush any pg_temp requests queued above to the mon.
4900 service.send_pg_temp();
4901 }
4902
// Construct the initial pg_history_t and PastIntervals for a newly created
// PG by replaying every osdmap epoch from `created` up to the current one
// and recording where the PG's interval, up set, or primary changed.
//
// pgid          - the PG being created
// created       - epoch the PG (and its pool) was created in
// created_stamp - creation time, used to seed the scrub stamps
// h             - out: populated history
// pi            - out: populated past intervals
4903 void OSD::build_initial_pg_history(
4904 spg_t pgid,
4905 epoch_t created,
4906 utime_t created_stamp,
4907 pg_history_t *h,
4908 PastIntervals *pi)
4909 {
4910 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4911 h->epoch_created = created;
4912 h->epoch_pool_created = created;
4913 h->same_interval_since = created;
4914 h->same_up_since = created;
4915 h->same_primary_since = created;
4916 h->last_scrub_stamp = created_stamp;
4917 h->last_deep_scrub_stamp = created_stamp;
4918 h->last_clean_scrub_stamp = created_stamp;
4919
// mapping as of the creation epoch; updated as we walk forward
4920 OSDMapRef lastmap = service.get_map(created);
4921 int up_primary, acting_primary;
4922 vector<int> up, acting;
4923 lastmap->pg_to_up_acting_osds(
4924 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4925
4926 ostringstream debug;
// NOTE: the loop-local `osdmap` deliberately shadows the OSD member of
// the same name; inside the loop it is the map for epoch e.
4927 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4928 OSDMapRef osdmap = service.get_map(e);
4929 int new_up_primary, new_acting_primary;
4930 vector<int> new_up, new_acting;
4931 osdmap->pg_to_up_acting_osds(
4932 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4933
4934 // this is a bit imprecise, but sufficient?
4935 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4936 const pg_pool_t *pi;
4937 bool operator()(const set<pg_shard_t> &have) const {
4938 return have.size() >= pi->min_size;
4939 }
4940 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4941 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4942
// detect interval boundaries and append them to *pi
4943 bool new_interval = PastIntervals::check_new_interval(
4944 acting_primary,
4945 new_acting_primary,
4946 acting, new_acting,
4947 up_primary,
4948 new_up_primary,
4949 up, new_up,
4950 h->same_interval_since,
4951 h->last_epoch_clean,
4952 osdmap,
4953 lastmap,
4954 pgid.pgid,
4955 &min_size_predicate,
4956 pi,
4957 &debug);
4958 if (new_interval) {
4959 h->same_interval_since = e;
4960 if (up != new_up) {
4961 h->same_up_since = e;
4962 }
4963 if (acting_primary != new_acting_primary) {
4964 h->same_primary_since = e;
4965 }
// record pool pg_num splits that happened in this epoch
4966 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4967 osdmap->get_pg_num(pgid.pgid.pool()),
4968 nullptr)) {
4969 h->last_epoch_split = e;
4970 }
4971 up = new_up;
4972 acting = new_acting;
4973 up_primary = new_up_primary;
4974 acting_primary = new_acting_primary;
4975 }
4976 lastmap = osdmap;
4977 }
4978 dout(20) << __func__ << " " << debug.str() << dendl;
4979 dout(10) << __func__ << " " << *h << " " << *pi
4980 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4981 pi->get_bounds()) << ")"
4982 << dendl;
4983 }
4984
// Register osd.p as a heartbeat peer (no-op for ourselves).  For a new
// peer this opens the back (cluster) and, when configured, front (public)
// heartbeat connections, and attaches a shared refcounted HeartbeatSession
// as the connections' priv so heartbeat_reset() can map a failed
// Connection back to the peer.  Existing peers just get their epoch
// refreshed so maybe_update_heartbeat_peers() won't treat them as extras.
// Caller must hold heartbeat_lock.
4985 void OSD::_add_heartbeat_peer(int p)
4986 {
4987 if (p == whoami)
4988 return;
4989 HeartbeatInfo *hi;
4990
4991 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4992 if (i == heartbeat_peers.end()) {
// (back, front) connections; front may be absent if there is no
// separate public heartbeat network
4993 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4994 if (!cons.first)
4995 return;
4996 hi = &heartbeat_peers[p];
4997 hi->peer = p;
// second arg false: we hand our initial ref to the smart pointer
4998 RefCountedPtr s{new HeartbeatSession{p}, false};
4999 hi->hb_interval_start = ceph_clock_now();
5000 hi->con_back = cons.first.get();
5001 hi->con_back->set_priv(s);
5002 if (cons.second) {
5003 hi->con_front = cons.second.get();
5004 hi->con_front->set_priv(s);
5005 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5006 << " " << hi->con_back->get_peer_addr()
5007 << " " << hi->con_front->get_peer_addr()
5008 << dendl;
5009 } else {
5010 hi->con_front.reset(NULL);
5011 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5012 << " " << hi->con_back->get_peer_addr()
5013 << dendl;
5014 }
5015 } else {
5016 hi = &i->second;
5017 }
// stamp with the current epoch; peers with an older epoch are pruning
// candidates in maybe_update_heartbeat_peers()
5018 hi->epoch = osdmap->get_epoch();
5019 }
5020
5021 void OSD::_remove_heartbeat_peer(int n)
5022 {
5023 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5024 ceph_assert(q != heartbeat_peers.end());
5025 dout(20) << " removing heartbeat peer osd." << n
5026 << " " << q->second.con_back->get_peer_addr()
5027 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5028 << dendl;
5029 q->second.con_back->mark_down();
5030 if (q->second.con_front) {
5031 q->second.con_front->mark_down();
5032 }
5033 heartbeat_peers.erase(q);
5034 }
5035
5036 void OSD::need_heartbeat_peer_update()
5037 {
5038 if (is_stopping())
5039 return;
5040 dout(20) << "need_heartbeat_peer_update" << dendl;
5041 heartbeat_set_peers_need_update();
5042 }
5043
// Rebuild the heartbeat peer set when it has been flagged stale (or force
// a resample every osd_heartbeat_grace seconds).  The final set is:
// PG peers + ring neighbors + enough OSDs from distinct failure-domain
// subtrees for mon failure reporting, topped up to
// osd_heartbeat_min_peers and trimmed of extras and down OSDs.
// Caller must hold osd_lock.
5044 void OSD::maybe_update_heartbeat_peers()
5045 {
5046 ceph_assert(osd_lock.is_locked());
5047
5048 if (is_waiting_for_healthy() || is_active()) {
5049 utime_t now = ceph_clock_now();
5050 if (last_heartbeat_resample == utime_t()) {
5051 last_heartbeat_resample = now;
5052 heartbeat_set_peers_need_update();
5053 } else if (!heartbeat_peers_need_update()) {
// periodically force a refresh even if nothing requested one
5054 utime_t dur = now - last_heartbeat_resample;
5055 if (dur > cct->_conf->osd_heartbeat_grace) {
5056 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5057 heartbeat_set_peers_need_update();
5058 last_heartbeat_resample = now;
5059 // automatically clean up any stale heartbeat peers
5060 // if we are unhealthy, then clean all
5061 reset_heartbeat_peers(is_waiting_for_healthy());
5062 }
5063 }
5064 }
5065
5066 if (!heartbeat_peers_need_update())
5067 return;
5068 heartbeat_clear_peers_need_update();
5069
5070 std::lock_guard l(heartbeat_lock);
5071
5072 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5073
5074
5075 // build heartbeat from set
5076 if (is_active()) {
5077 vector<PGRef> pgs;
5078 _get_pgs(&pgs);
5079 for (auto& pg : pgs) {
5080 pg->with_heartbeat_peers([&](int peer) {
5081 if (osdmap->is_up(peer)) {
5082 _add_heartbeat_peer(peer);
5083 }
5084 });
5085 }
5086 }
5087
5088 // include next and previous up osds to ensure we have a fully-connected set
5089 set<int> want, extras;
5090 const int next = osdmap->get_next_up_osd_after(whoami);
5091 if (next >= 0)
5092 want.insert(next);
5093 int prev = osdmap->get_previous_up_osd_before(whoami);
5094 if (prev >= 0 && prev != next)
5095 want.insert(prev);
5096
5097 // make sure we have at least **min_down** osds coming from different
5098 // subtree level (e.g., hosts) for fast failure detection.
5099 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5100 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5101 osdmap->get_random_up_osds_by_subtree(
5102 whoami, subtree, min_down, want, &want);
5103
// `want` peers are mandatory; they are also recorded in extras so the
// trimming loop below knows they were added by us, not by PGs
5104 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5105 dout(10) << " adding neighbor peer osd." << *p << dendl;
5106 extras.insert(*p);
5107 _add_heartbeat_peer(*p);
5108 }
5109
5110 // remove down peers; enumerate extras
5111 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5112 while (p != heartbeat_peers.end()) {
5113 if (!osdmap->is_up(p->first)) {
// advance before erasing; _remove_heartbeat_peer invalidates p's entry
5114 int o = p->first;
5115 ++p;
5116 _remove_heartbeat_peer(o);
5117 continue;
5118 }
// entries not refreshed this epoch were not re-added above: extras
5119 if (p->second.epoch < osdmap->get_epoch()) {
5120 extras.insert(p->first);
5121 }
5122 ++p;
5123 }
5124
5125 // too few?
// walk the up-OSD ring starting at `next` until we reach the minimum
5126 for (int n = next; n >= 0; ) {
5127 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5128 break;
5129 if (!extras.count(n) && !want.count(n) && n != whoami) {
5130 dout(10) << " adding random peer osd." << n << dendl;
5131 extras.insert(n);
5132 _add_heartbeat_peer(n);
5133 }
5134 n = osdmap->get_next_up_osd_after(n);
5135 if (n == next)
5136 break; // came full circle; stop
5137 }
5138
5139 // too many?
5140 for (set<int>::iterator p = extras.begin();
5141 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5142 ++p) {
5143 if (want.count(*p))
5144 continue;
5145 _remove_heartbeat_peer(*p);
5146 }
5147
5148 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5149 }
5150
// Drop heartbeat peers: all of them when `all` is true, otherwise only
// those that have not been heard from within osd_heartbeat_stale seconds.
// Removed peers have their connections closed and any queued failure
// report against them cancelled.  Caller must hold osd_lock.
5151 void OSD::reset_heartbeat_peers(bool all)
5152 {
5153 ceph_assert(osd_lock.is_locked());
5154 dout(10) << "reset_heartbeat_peers" << dendl;
// cutoff: anything last seen before (now - osd_heartbeat_stale) is stale
5155 utime_t stale = ceph_clock_now();
5156 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5157 std::lock_guard l(heartbeat_lock);
5158 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5159 HeartbeatInfo& hi = it->second;
5160 if (all || hi.is_stale(stale)) {
5161 hi.con_back->mark_down();
5162 if (hi.con_front) {
5163 hi.con_front->mark_down();
5164 }
5165 // stop sending failure_report to mon too
5166 failure_queue.erase(it->first);
// post-increment erase keeps the iterator valid
5167 heartbeat_peers.erase(it++);
5168 } else {
5169 it++;
5170 }
5171 }
5172 }
5173
// Handle a heartbeat message (PING / PING_REPLY / YOU_DIED) from another
// OSD.  PING: reply and opportunistically share our osdmap.  PING_REPLY:
// account the ack against ping_history, maintain per-peer ping-time
// statistics, and cancel failure reports for peers that are healthy
// again.  YOU_DIED: fetch a newer osdmap to learn we were marked down.
// Consumes (puts) m.  Runs with heartbeat_lock held for the duration.
5174 void OSD::handle_osd_ping(MOSDPing *m)
5175 {
// ignore heartbeats from other clusters
5176 if (superblock.cluster_fsid != m->fsid) {
5177 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5178 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
5179 m->put();
5180 return;
5181 }
5182
5183 int from = m->get_source().num();
5184
5185 heartbeat_lock.Lock();
5186 if (is_stopping()) {
5187 heartbeat_lock.Unlock();
5188 m->put();
5189 return;
5190 }
5191
5192 OSDMapRef curmap = service.get_osdmap();
5193 if (!curmap) {
5194 heartbeat_lock.Unlock();
5195 m->put();
5196 return;
5197 }
5198
5199 switch (m->op) {
5200
5201 case MOSDPing::PING:
5202 {
// test hook: probabilistically drop a run of incoming pings
// (osd_debug_drop_ping_probability / _duration)
5203 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5204 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5205 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5206 if (heartbeat_drop->second == 0) {
5207 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5208 } else {
5209 --heartbeat_drop->second;
5210 dout(5) << "Dropping heartbeat from " << from
5211 << ", " << heartbeat_drop->second
5212 << " remaining to drop" << dendl;
5213 break;
5214 }
5215 } else if (cct->_conf->osd_debug_drop_ping_probability >
5216 ((((double)(rand()%100))/100.0))) {
5217 heartbeat_drop =
5218 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5219 cct->_conf->osd_debug_drop_ping_duration)).first;
5220 dout(5) << "Dropping heartbeat from " << from
5221 << ", " << heartbeat_drop->second
5222 << " remaining to drop" << dendl;
5223 break;
5224 }
5225 }
5226
// don't vouch for our own liveness if our internal worker threads
// are stuck; silence makes the peer report us instead
5227 if (!cct->get_heartbeat_map()->is_healthy()) {
5228 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
5229 break;
5230 }
5231
// echo the sender's stamp back so it can match our reply to its ping
5232 Message *r = new MOSDPing(monc->get_fsid(),
5233 curmap->get_epoch(),
5234 MOSDPing::PING_REPLY, m->stamp,
5235 cct->_conf->osd_heartbeat_min_size);
5236 m->get_connection()->send_message(r);
5237
5238 if (curmap->is_up(from)) {
5239 service.note_peer_epoch(from, m->map_epoch);
5240 if (is_active()) {
5241 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5242 if (con) {
5243 service.share_map_peer(from, con.get());
5244 }
5245 }
5246 } else if (!curmap->exists(from) ||
5247 curmap->get_down_at(from) > m->map_epoch) {
5248 // tell them they have died
5249 Message *r = new MOSDPing(monc->get_fsid(),
5250 curmap->get_epoch(),
5251 MOSDPing::YOU_DIED,
5252 m->stamp,
5253 cct->_conf->osd_heartbeat_min_size);
5254 m->get_connection()->send_message(r);
5255 }
5256 }
5257 break;
5258
5259 case MOSDPing::PING_REPLY:
5260 {
5261 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5262 if (i != heartbeat_peers.end()) {
// ping_history maps send-stamp -> (deadline, acks still expected);
// each ping expects one ack per connection (back and front)
5263 auto acked = i->second.ping_history.find(m->stamp);
5264 if (acked != i->second.ping_history.end()) {
5265 utime_t now = ceph_clock_now();
5266 int &unacknowledged = acked->second.second;
5267 if (m->get_connection() == i->second.con_back) {
5268 dout(25) << "handle_osd_ping got reply from osd." << from
5269 << " first_tx " << i->second.first_tx
5270 << " last_tx " << i->second.last_tx
5271 << " last_rx_back " << i->second.last_rx_back << " -> " << now
5272 << " last_rx_front " << i->second.last_rx_front
5273 << dendl;
5274 i->second.last_rx_back = now;
5275 ceph_assert(unacknowledged > 0);
5276 --unacknowledged;
5277 // if there is no front con, set both stamps.
5278 if (i->second.con_front == NULL) {
5279 i->second.last_rx_front = now;
5280 ceph_assert(unacknowledged > 0);
5281 --unacknowledged;
5282 }
5283 } else if (m->get_connection() == i->second.con_front) {
5284 dout(25) << "handle_osd_ping got reply from osd." << from
5285 << " first_tx " << i->second.first_tx
5286 << " last_tx " << i->second.last_tx
5287 << " last_rx_back " << i->second.last_rx_back
5288 << " last_rx_front " << i->second.last_rx_front << " -> " << now
5289 << dendl;
5290 i->second.last_rx_front = now;
5291 ceph_assert(unacknowledged > 0);
5292 --unacknowledged;
5293 }
5294
5295 if (unacknowledged == 0) {
5296 // succeeded in getting all replies
5297 dout(25) << "handle_osd_ping got all replies from osd." << from
5298 << " , erase pending ping(sent at " << m->stamp << ")"
5299 << " and older pending ping(s)"
5300 << dendl;
5301
// accumulate round-trip times (in usec) for this averaging window
5302 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5303 ++i->second.hb_average_count;
5304 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
5305 i->second.hb_total_back += back_pingtime;
5306 if (back_pingtime < i->second.hb_min_back)
5307 i->second.hb_min_back = back_pingtime;
5308 if (back_pingtime > i->second.hb_max_back)
5309 i->second.hb_max_back = back_pingtime;
5310 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
5311 i->second.hb_total_front += front_pingtime;
5312 if (front_pingtime < i->second.hb_min_front)
5313 i->second.hb_min_front = front_pingtime;
5314 if (front_pingtime > i->second.hb_max_front)
5315 i->second.hb_max_front = front_pingtime;
5316
// NOTE(review): the `if` below is unreachable — the assert on the
// previous line already guarantees hb_interval_start != utime_t()
5317 ceph_assert(i->second.hb_interval_start != utime_t());
5318 if (i->second.hb_interval_start == utime_t())
5319 i->second.hb_interval_start = now;
// averaging window: 60s, or debug_heartbeat_testing_span in tests
5320 int64_t hb_avg_time_period = 60;
5321 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5322 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5323 }
5324 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
// window complete: snapshot avg/min/max, then reset accumulators
5325 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5326 uint32_t back_min = i->second.hb_min_back;
5327 uint32_t back_max = i->second.hb_max_back;
5328 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5329 uint32_t front_min = i->second.hb_min_front;
5330 uint32_t front_max = i->second.hb_max_front;
5331
5332 // Reset for new interval
5333 i->second.hb_average_count = 0;
5334 i->second.hb_interval_start = now;
5335 i->second.hb_total_back = i->second.hb_max_back = 0;
5336 i->second.hb_min_back = UINT_MAX;
5337 i->second.hb_total_front = i->second.hb_max_front = 0;
5338 i->second.hb_min_front = UINT_MAX;
5339
5340 // Record per osd interace ping times
5341 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5342 if (i->second.hb_back_pingtime.size() == 0) {
// first completed window: seed the whole ring buffer with it
5343 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5344 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5345 i->second.hb_back_pingtime.push_back(back_avg);
5346 i->second.hb_back_min.push_back(back_min);
5347 i->second.hb_back_max.push_back(back_max);
5348 i->second.hb_front_pingtime.push_back(front_avg);
5349 i->second.hb_front_min.push_back(front_min);
5350 i->second.hb_front_max.push_back(front_max);
5351 ++i->second.hb_index;
5352 }
5353 } else {
// ring buffer; hb_vector_size is a power of two, so & masks the index
5354 int index = i->second.hb_index & (hb_vector_size - 1);
5355 i->second.hb_back_pingtime[index] = back_avg;
5356 i->second.hb_back_min[index] = back_min;
5357 i->second.hb_back_max[index] = back_max;
5358 i->second.hb_front_pingtime[index] = front_avg;
5359 i->second.hb_front_min[index] = front_min;
5360 i->second.hb_front_max[index] = front_max;
5361 ++i->second.hb_index;
5362 }
5363
// publish 1/5/15-window aggregates into osd_stat for mon/mgr reporting
5364 {
5365 std::lock_guard l(service.stat_lock);
5366 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5367 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5368
5369 uint32_t total = 0;
5370 uint32_t min = UINT_MAX;
5371 uint32_t max = 0;
5372 uint32_t count = 0;
5373 uint32_t which = 0;
5374 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5375 for (int32_t k = size - 1 ; k >= 0; --k) {
5376 ++count;
5377 int index = (i->second.hb_index + k) % size;
5378 total += i->second.hb_back_pingtime[index];
5379 if (i->second.hb_back_min[index] < min)
5380 min = i->second.hb_back_min[index];
5381 if (i->second.hb_back_max[index] > max)
5382 max = i->second.hb_back_max[index];
5383 if (count == 1 || count == 5 || count == 15) {
5384 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5385 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5386 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5387 which++;
5388 if (count == 15)
5389 break;
5390 }
5391 }
5392
5393 if (i->second.con_front != NULL) {
5394 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5395
5396 total = 0;
5397 min = UINT_MAX;
5398 max = 0;
5399 count = 0;
5400 which = 0;
5401 for (int32_t k = size - 1 ; k >= 0; --k) {
5402 ++count;
5403 int index = (i->second.hb_index + k) % size;
5404 total += i->second.hb_front_pingtime[index];
5405 if (i->second.hb_front_min[index] < min)
5406 min = i->second.hb_front_min[index];
5407 if (i->second.hb_front_max[index] > max)
5408 max = i->second.hb_front_max[index];
5409 if (count == 1 || count == 5 || count == 15) {
5410 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5411 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5412 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5413 which++;
5414 if (count == 15)
5415 break;
5416 }
5417 }
5418 }
5419 }
5420 } else {
// window not complete yet: only refresh the instantaneous values
5421 std::lock_guard l(service.stat_lock);
5422 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5423 if (i->second.con_front != NULL)
5424 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5425 }
// this ping and all older ones are now fully acknowledged
5426 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5427 }
5428
5429 if (i->second.is_healthy(now)) {
5430 // Cancel false reports
5431 auto failure_queue_entry = failure_queue.find(from);
5432 if (failure_queue_entry != failure_queue.end()) {
5433 dout(10) << "handle_osd_ping canceling queued "
5434 << "failure report for osd." << from << dendl;
5435 failure_queue.erase(failure_queue_entry);
5436 }
5437
// a report already sent to the mon needs an explicit retraction
5438 auto failure_pending_entry = failure_pending.find(from);
5439 if (failure_pending_entry != failure_pending.end()) {
5440 dout(10) << "handle_osd_ping canceling in-flight "
5441 << "failure report for osd." << from << dendl;
5442 send_still_alive(curmap->get_epoch(),
5443 from,
5444 failure_pending_entry->second.second);
5445 failure_pending.erase(failure_pending_entry);
5446 }
5447 }
5448 } else {
5449 // old replies, deprecated by newly sent pings.
5450 dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
5451 << ") is found, treat as covered by newly sent pings "
5452 << "and ignore"
5453 << dendl;
5454 }
5455 }
5456
5457 if (m->map_epoch &&
5458 curmap->is_up(from)) {
5459 service.note_peer_epoch(from, m->map_epoch);
5460 if (is_active()) {
5461 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5462 if (con) {
5463 service.share_map_peer(from, con.get());
5464 }
5465 }
5466 }
5467 }
5468 break;
5469
5470 case MOSDPing::YOU_DIED:
// a peer's map says we are down; fetch newer maps so we notice
5471 dout(10) << "handle_osd_ping " << m->get_source_inst()
5472 << " says i am down in " << m->map_epoch << dendl;
5473 osdmap_subscribe(curmap->get_epoch()+1, false);
5474 break;
5475 }
5476
5477 heartbeat_lock.Unlock();
5478 m->put();
5479 }
5480
// Body of the heartbeat thread: repeatedly send pings via heartbeat()
// and sleep a randomized fraction of osd_heartbeat_interval (fixed
// interval when debug_disable_randomized_ping is set).  The lock_guard
// holds heartbeat_lock for the whole loop; WaitInterval releases it
// while sleeping and reacquires it on wakeup.
5481 void OSD::heartbeat_entry()
5482 {
5483 std::lock_guard l(heartbeat_lock);
5484 if (is_stopping())
5485 return;
5486 while (!heartbeat_stop) {
5487 heartbeat();
5488
// sleep 0.5-1.5x nominal interval to avoid synchronized ping bursts
5489 double wait;
5490 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5491 wait = (float)cct->_conf->osd_heartbeat_interval;
5492 } else {
5493 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5494 }
5495 utime_t w;
5496 w.set_from_double(wait);
5497 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5498 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5499 if (is_stopping())
5500 return;
5501 dout(30) << "heartbeat_entry woke up" << dendl;
5502 }
5503 }
5504
5505 void OSD::heartbeat_check()
5506 {
5507 ceph_assert(heartbeat_lock.is_locked());
5508 utime_t now = ceph_clock_now();
5509
5510 // check for incoming heartbeats (move me elsewhere?)
5511 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5512 p != heartbeat_peers.end();
5513 ++p) {
5514
5515 if (p->second.first_tx == utime_t()) {
5516 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5517 << " yet, skipping" << dendl;
5518 continue;
5519 }
5520
5521 dout(25) << "heartbeat_check osd." << p->first
5522 << " first_tx " << p->second.first_tx
5523 << " last_tx " << p->second.last_tx
5524 << " last_rx_back " << p->second.last_rx_back
5525 << " last_rx_front " << p->second.last_rx_front
5526 << dendl;
5527 if (p->second.is_unhealthy(now)) {
5528 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5529 if (p->second.last_rx_back == utime_t() ||
5530 p->second.last_rx_front == utime_t()) {
5531 derr << "heartbeat_check: no reply from "
5532 << p->second.con_front->get_peer_addr().get_sockaddr()
5533 << " osd." << p->first
5534 << " ever on either front or back, first ping sent "
5535 << p->second.first_tx
5536 << " (oldest deadline " << oldest_deadline << ")"
5537 << dendl;
5538 // fail
5539 failure_queue[p->first] = p->second.first_tx;
5540 } else {
5541 derr << "heartbeat_check: no reply from "
5542 << p->second.con_front->get_peer_addr().get_sockaddr()
5543 << " osd." << p->first << " since back " << p->second.last_rx_back
5544 << " front " << p->second.last_rx_front
5545 << " (oldest deadline " << oldest_deadline << ")"
5546 << dendl;
5547 // fail
5548 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5549 }
5550 }
5551 }
5552 }
5553
// Send one round of pings to every heartbeat peer (back and, when
// present, front connection), refresh load/usage stats, and record each
// ping in ping_history with its ack deadline so heartbeat_check() can
// detect silence.  Caller (heartbeat_entry) must hold heartbeat_lock.
5554 void OSD::heartbeat()
5555 {
5556 ceph_assert(heartbeat_lock.is_locked_by_me());
5557 dout(30) << "heartbeat" << dendl;
5558
5559 // get CPU load avg
// daily_loadavg is an exponential-style average over ~one day's worth
// of heartbeat samples (86400s / interval)
5560 double loadavgs[1];
5561 int hb_interval = cct->_conf->osd_heartbeat_interval;
5562 int n_samples = 86400;
5563 if (hb_interval > 1) {
5564 n_samples /= hb_interval;
5565 if (n_samples < 1)
5566 n_samples = 1;
5567 }
5568
5569 if (getloadavg(loadavgs, 1) == 1) {
5570 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5571 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5572 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5573 }
5574
5575 dout(30) << "heartbeat checking stats" << dendl;
5576
5577 // refresh peer list and osd stats
5578 vector<int> hb_peers;
5579 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5580 p != heartbeat_peers.end();
5581 ++p)
5582 hb_peers.push_back(p->first);
5583
5584 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5585 dout(5) << __func__ << " " << new_stat << dendl;
5586 ceph_assert(new_stat.statfs.total);
5587
// re-evaluate nearfull/full state from the fresh stats
5588 float pratio;
5589 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5590
5591 service.check_full_status(ratio, pratio);
5592
5593 utime_t now = ceph_clock_now();
5594 utime_t deadline = now;
5595 deadline += cct->_conf->osd_heartbeat_grace;
5596
5597 // send heartbeats
5598 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5599 i != heartbeat_peers.end();
5600 ++i) {
5601 int peer = i->first;
5602 i->second.last_tx = now;
5603 if (i->second.first_tx == utime_t())
5604 i->second.first_tx = now;
// expect HEARTBEAT_MAX_CONN acks (one per connection) by `deadline`
5605 i->second.ping_history[now] = make_pair(deadline,
5606 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5607 if (i->second.hb_interval_start == utime_t())
5608 i->second.hb_interval_start = now;
5609 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5610 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5611 service.get_osdmap_epoch(),
5612 MOSDPing::PING, now,
5613 cct->_conf->osd_heartbeat_min_size));
5614
5615 if (i->second.con_front)
5616 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5617 service.get_osdmap_epoch(),
5618 MOSDPing::PING, now,
5619 cct->_conf->osd_heartbeat_min_size));
5620 }
5621
5622 logger->set(l_osd_hb_to, heartbeat_peers.size());
5623
5624 // hmm.. am i all alone?
// with no peers we can't learn of map changes from pings, so poll the
// mon for a newer map periodically
5625 dout(30) << "heartbeat lonely?" << dendl;
5626 if (heartbeat_peers.empty()) {
5627 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5628 last_mon_heartbeat = now;
5629 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5630 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5631 }
5632 }
5633
5634 dout(30) << "heartbeat done" << dendl;
5635 }
5636
// Messenger callback for a reset heartbeat connection.  Uses the
// HeartbeatSession stashed in the connection's priv to find the peer,
// then replaces BOTH of the peer's heartbeat connections with fresh ones
// (clearing ping_history, since in-flight pings are lost).  If no new
// connection can be opened — the peer vanished in a newer osdmap — the
// peer is dropped entirely.  Always returns true (event handled).
5637 bool OSD::heartbeat_reset(Connection *con)
5638 {
5639 std::lock_guard l(heartbeat_lock);
5640 auto s = con->get_priv();
// detach the session from the dead connection
5641 con->set_priv(nullptr);
5642 if (s) {
5643 if (is_stopping()) {
5644 return true;
5645 }
5646 auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
5647 auto p = heartbeat_peers.find(heartbeat_session->peer);
// only act if this connection is still the peer's current back/front
// con; otherwise it is a stale connection we already replaced
5648 if (p != heartbeat_peers.end() &&
5649 (p->second.con_back == con ||
5650 p->second.con_front == con)) {
5651 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5652 << ", reopening" << dendl;
// tear down the surviving sibling connection too so both get rebuilt
5653 if (con != p->second.con_back) {
5654 p->second.con_back->mark_down();
5655 }
5656 p->second.con_back.reset(NULL);
5657 if (p->second.con_front && con != p->second.con_front) {
5658 p->second.con_front->mark_down();
5659 }
5660 p->second.con_front.reset(NULL);
5661 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5662 if (newcon.first) {
5663 p->second.con_back = newcon.first.get();
5664 p->second.con_back->set_priv(s);
5665 if (newcon.second) {
5666 p->second.con_front = newcon.second.get();
5667 p->second.con_front->set_priv(s);
5668 }
5669 p->second.ping_history.clear();
5670 } else {
5671 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5672 << ", raced with osdmap update, closing out peer" << dendl;
5673 heartbeat_peers.erase(p);
5674 }
5675 } else {
5676 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5677 }
5678 }
5679 return true;
5680 }
5681
5682
5683
5684 // =========================================
5685
// Periodic timer callback that runs WITH osd_lock held: refresh the
// heartbeat peer set, retry boot while waiting to become healthy, poll
// the mon for new maps while booting, and drain deferred waiters.
// Re-arms itself via tick_timer.
5686 void OSD::tick()
5687 {
5688 ceph_assert(osd_lock.is_locked());
5689 dout(10) << "tick" << dendl;
5690
5691 if (is_active() || is_waiting_for_healthy()) {
5692 maybe_update_heartbeat_peers();
5693 }
5694
// keep retrying boot until our internal heartbeats look healthy
5695 if (is_waiting_for_healthy()) {
5696 start_boot();
5697 }
5698
5699 if (is_waiting_for_healthy() || is_booting()) {
5700 std::lock_guard l(heartbeat_lock);
5701 utime_t now = ceph_clock_now();
5702 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5703 last_mon_heartbeat = now;
5704 dout(1) << __func__ << " checking mon for new map" << dendl;
5705 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5706 }
5707 }
5708
5709 do_waiters();
5710
// schedule the next tick
5711 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5712 }
5713
// Periodic timer callback that deliberately runs WITHOUT osd_lock:
// refresh statfs/crc counters, run heartbeat_check, send periodic mon
// reports (fullness, failures), request newer maps if shards are
// waiting on one, and drive scrub scheduling / beacons / mgr health.
// Re-arms itself via tick_timer_without_osd_lock.
5714 void OSD::tick_without_osd_lock()
5715 {
5716 ceph_assert(tick_timer_lock.is_locked());
5717 dout(10) << "tick_without_osd_lock" << dendl;
5718
5719 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5720 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5721 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5722
5723 // refresh osd stats
5724 struct store_statfs_t stbuf;
5725 osd_alert_list_t alerts;
5726 int r = store->statfs(&stbuf, &alerts);
5727 ceph_assert(r == 0);
5728 service.set_statfs(stbuf, alerts);
5729
5730 // osd_lock is not being held, which means the OSD state
5731 // might change when doing the monitor report
5732 if (is_active() || is_waiting_for_healthy()) {
5733 heartbeat_lock.Lock();
5734 heartbeat_check();
5735 heartbeat_lock.Unlock();
5736
// mon reporting happens under map_lock (read) + mon_report_lock
5737 map_lock.get_read();
5738 std::lock_guard l(mon_report_lock);
5739
5740 // mon report?
5741 utime_t now = ceph_clock_now();
5742 if (service.need_fullness_update() ||
5743 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
5744 last_mon_report = now;
5745 send_full_update();
5746 send_failures();
5747 }
5748 map_lock.put_read();
5749
// if any op shard queued work for a future epoch, go fetch it
5750 epoch_t max_waiting_epoch = 0;
5751 for (auto s : shards) {
5752 max_waiting_epoch = std::max(max_waiting_epoch,
5753 s->get_max_waiting_epoch());
5754 }
5755 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
5756 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
5757 << ", requesting new map" << dendl;
5758 osdmap_subscribe(superblock.newest_map + 1, false);
5759 }
5760 }
5761
5762 if (is_active()) {
5763 if (!scrub_random_backoff()) {
5764 sched_scrub();
5765 }
5766 service.promote_throttle_recalibrate();
5767 resume_creating_pg();
// send a beacon if osd_beacon_report_interval has elapsed
5768 bool need_send_beacon = false;
5769 const auto now = ceph::coarse_mono_clock::now();
5770 {
5771 // borrow lec lock to pretect last_sent_beacon from changing
5772 std::lock_guard l{min_last_epoch_clean_lock};
5773 const auto elapsed = now - last_sent_beacon;
5774 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5775 cct->_conf->osd_beacon_report_interval) {
5776 need_send_beacon = true;
5777 }
5778 }
5779 if (need_send_beacon) {
5780 send_beacon(now);
5781 }
5782 }
5783
5784 mgrc.update_daemon_health(get_health_metrics());
5785 service.kick_recovery_queue();
// schedule the next lock-free tick
5786 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5787 new C_Tick_WithoutOSDLock(this));
5788 }
5789
// Usage:
//  setomapval <pool-id> [namespace/]<obj-name> <key> <val>
//  rmomapkey <pool-id> [namespace/]<obj-name> <key>
//  setomapheader <pool-id> [namespace/]<obj-name> <header>
//  getomap <pool> [namespace/]<obj-name>
//  truncobj <pool-id> [namespace/]<obj-name> <newlen>
//  injectmdataerr [namespace/]<obj-name> [shardid]
//  injectdataerr [namespace/]<obj-name> [shardid]
//
//  set_recovery_delay [utime]
//
// Admin-socket entry point for test/debug commands.  These bypass the
// normal op path and mutate the object store (or OSD service state)
// directly, so they are strictly for testing.  Errors and results are
// reported through 'ss'; the function returns nothing.
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
                                 std::string_view command,
                                 const cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(service->cct, cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      ss << "Invalid pool '" << poolstr << "''";
      return;
    }

    // objname may carry a "namespace/" prefix; split it off if present.
    string objname, nspace;
    cmd_getval(service->cct, cmdmap, "objname", objname);
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    // Optional shard id (for EC pools); defaults to NO_SHARD.
    int64_t shardid;
    cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    // Only the error-injection commands are meaningful on EC pools;
    // the omap/truncate commands assume a replicated layout.
    if (curmap->pg_is_ec(rawpg)) {
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
        ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
        return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(service->cct, cmdmap, "key", key);
      cmd_getval(service->cct, cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "rmomapkey") {
      string key;
      set<string> keys;
      cmd_getval(service->cct, cmdmap, "key", key);

      keys.insert(key);
      t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "setomapheader") {
      bufferlist newheader;
      string headerstr;

      cmd_getval(service->cct, cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      auto ch = store->open_collection(coll_t(pgid));
      if (!ch) {
        ss << "unable to open collection for " << pgid;
        r = -ENOENT;
      } else {
        r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
        if (r >= 0) {
          ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
          for (map<string, bufferlist>::iterator it = keyvals.begin();
               it != keyvals.end(); ++it)
            ss << " key=" << (*it).first << " val="
               << string((*it).second.c_str(), (*it).second.length());
        } else {
          ss << "error=" << r;
        }
      }
    } else if (command == "truncobj") {
      int64_t trunclen;
      cmd_getval(service->cct, cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "injectdataerr") {
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    // Route the value through the config subsystem so normal observers fire.
    int64_t delay;
    cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
    ostringstream oss;
    oss << delay;
    int r = service->cct->_conf.set_val("osd_recovery_delay_start",
                                        oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
         << "osd_recovery_delay_start to '" << delay << "': error "
         << r;
      return;
    }
    service->cct->_conf.apply_changes(nullptr);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
    spg_t pgid;
    bool deep = (command == "trigger_deep_scrub");
    OSDMapRef curmap = service->get_osdmap();

    string pgidstr;

    cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "Invalid pgid specified";
      return;
    }

    // Optional age (seconds); 0 means "use the pool/global max interval".
    int64_t time;
    cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);

    // _lookup_lock_pg returns the PG locked; unlock on every exit below.
    PGRef pg = service->osd->_lookup_lock_pg(pgid);
    if (pg == nullptr) {
      ss << "Can't find pg " << pgid;
      return;
    }

    if (pg->is_primary()) {
      pg->unreg_next_scrub();
      const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
      double pool_scrub_max_interval = 0;
      double scrub_max_interval;
      if (deep) {
        p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
      } else {
        p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
      }
      // Instead of marking must_scrub force a schedule scrub
      utime_t stamp = ceph_clock_now();
      if (time == 0)
        stamp -= scrub_max_interval;
      else
        stamp -= (float)time;
      stamp -= 100.0;  // push back last scrub more for good measure
      if (deep) {
        pg->set_last_deep_scrub_stamp(stamp);
      } else {
        pg->set_last_scrub_stamp(stamp);
      }
      pg->reg_next_scrub();
      pg->publish_stats_to_osd();
      ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
    } else {
      ss << "Not primary";
    }
    pg->unlock();
    return;
  }
  if (command == "injectfull") {
    int64_t count;
    string type;
    OSDService::s_names state;
    cmd_getval(service->cct, cmdmap, "type", type, string("full"));
    cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
    // A zero count (or explicit "none") clears the injected fullness.
    if (type == "none" || count == 0) {
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
6030
6031 // =========================================
6032
// Messenger callback: a connection we initiated is (re)established.
// We only care about the mon connection here: a new mon session means
// all previously reported state (fullness, pg_temp, failures, ...) has
// been forgotten by the mon and must be resent.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      // restart the boot handshake from the version probe
      start_boot();
    } else if (is_booting()) {
      _send_boot();  // resend boot message
    } else {
      // lock order: osd_lock -> map_lock (read) -> mon_report_lock
      map_lock.get_read();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.put_read();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6076
6077 void OSD::ms_handle_fast_connect(Connection *con)
6078 {
6079 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6080 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6081 auto priv = con->get_priv();
6082 auto s = static_cast<Session*>(priv.get());
6083 if (!s) {
6084 s = new Session{cct, con};
6085 con->set_priv(RefCountedPtr{s, false});
6086 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6087 << " addr=" << s->con->get_peer_addr() << dendl;
6088 // we don't connect to clients
6089 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6090 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6091 }
6092 }
6093 }
6094
6095 void OSD::ms_handle_fast_accept(Connection *con)
6096 {
6097 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6098 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6099 auto priv = con->get_priv();
6100 auto s = static_cast<Session*>(priv.get());
6101 if (!s) {
6102 s = new Session{cct, con};
6103 con->set_priv(RefCountedPtr{s, false});
6104 dout(10) << "new session (incoming)" << s << " con=" << con
6105 << " addr=" << con->get_peer_addr()
6106 << " must have raced with connect" << dendl;
6107 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6108 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6109 }
6110 }
6111 }
6112
// Messenger callback: the remote side reset the connection.  Detach the
// session from the connection and clean up session-scoped state.
// Returns true if we owned a session on this connection.
bool OSD::ms_handle_reset(Connection *con)
{
  auto s = con->get_priv();
  auto session = static_cast<Session*>(s.get());
  dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(SessionRef{session});
  return true;
}
6129
// Messenger callback: the peer actively refused our connection (e.g.
// ECONNREFUSED).  If fast-fail is enabled and the peer is an OSD that
// the map still considers up, report it failed to the mon immediately
// rather than waiting for the heartbeat grace to expire.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto priv = con->get_priv();
  auto session = static_cast<Session*>(priv.get());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6163
6164 struct C_OSD_GetVersion : public Context {
6165 OSD *osd;
6166 uint64_t oldest, newest;
6167 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6168 void finish(int r) override {
6169 if (r >= 0)
6170 osd->_got_mon_epochs(oldest, newest);
6171 }
6172 };
6173
// Begin the boot sequence: if we look healthy, enter PREBOOT and ask the
// mon which osdmap epochs it has; the C_OSD_GetVersion completion then
// continues via _got_mon_epochs -> _preboot.  If we are unhealthy, defer
// and keep heartbeating until enough peers respond.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6192
6193 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6194 {
6195 std::lock_guard l(osd_lock);
6196 if (is_preboot()) {
6197 _preboot(oldest, newest);
6198 }
6199 }
6200
// Continue the boot sequence given the mon's osdmap epoch range
// [oldest..newest].  Either (a) a precondition blocks booting and we log
// why, (b) our map is recent enough and we queue _send_boot on the boot
// finisher, or (c) we subscribe to newer maps and wait to be called again.
// Called with osd_lock held.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new FunctionContext(
        [this](int r) {
          std::lock_guard l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            // drop osd_lock while blocking so PG work can progress
            osd_lock.Unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(osdmap->get_epoch());
            }
            osd_lock.Lock();
          }
          // re-check: state may have changed while osd_lock was dropped
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6266
6267 void OSD::send_full_update()
6268 {
6269 if (!service.need_fullness_update())
6270 return;
6271 unsigned state = 0;
6272 if (service.is_full()) {
6273 state = CEPH_OSD_FULL;
6274 } else if (service.is_backfillfull()) {
6275 state = CEPH_OSD_BACKFILLFULL;
6276 } else if (service.is_nearfull()) {
6277 state = CEPH_OSD_NEARFULL;
6278 }
6279 set<string> s;
6280 OSDMap::calc_state_set(state, s);
6281 dout(10) << __func__ << " want state " << s << dendl;
6282 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6283 }
6284
// Enter WAITING_FOR_HEALTHY: boot is deferred until enough heartbeat
// peers respond (see _is_healthy).  Resetting last_heartbeat_resample
// forces the heartbeat peer set to be re-picked.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
6294
6295 bool OSD::_is_healthy()
6296 {
6297 if (!cct->get_heartbeat_map()->is_healthy()) {
6298 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6299 return false;
6300 }
6301
6302 if (is_waiting_for_healthy()) {
6303 utime_t now = ceph_clock_now();
6304 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6305 while (!osd_markdown_log.empty() &&
6306 osd_markdown_log.front() + grace < now)
6307 osd_markdown_log.pop_front();
6308 if (osd_markdown_log.size() <= 1) {
6309 dout(5) << __func__ << " first time marked as down,"
6310 << " try reboot unconditionally" << dendl;
6311 return true;
6312 }
6313 std::lock_guard l(heartbeat_lock);
6314 int num = 0, up = 0;
6315 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6316 p != heartbeat_peers.end();
6317 ++p) {
6318 if (p->second.is_healthy(now))
6319 ++up;
6320 ++num;
6321 }
6322 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6323 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6324 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6325 return false;
6326 }
6327 }
6328
6329 return true;
6330 }
6331
// Send MOSDBoot to the mon: finalize our client/cluster/heartbeat
// addresses (filling in any still-unknown ones from the already-bound
// messengers), make sure each server messenger's loopback connection has
// a session, collect metadata, and transition to BOOTING.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // give the loopback connection a session if it doesn't have one yet
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6394
// Populate *pm with this OSD's metadata key/value pairs for the mon:
// paths, addresses, objectstore info, system info, network interfaces,
// NUMA placement, and device identifiers.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    // Determine which NUMA node(s) our network interfaces sit on.  Only
    // publish a single "network_numa_node" when everything is known and
    // agrees on one node.
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // device list plus "dev=id" pairs for devices with a unique id
  set<string> devnames;
  store->get_devices(&devnames);
  (*pm)["devices"] = stringify(devnames);
  string devids;
  for (auto& dev : devnames) {
    string err;
    string id = get_device_id(dev, &err);
    if (id.size()) {
      if (!devids.empty()) {
        devids += ",";
      }
      devids += dev + "=" + id;
    } else {
      dout(10) << __func__ << " no unique device id for " << dev << ": "
               << err << dendl;
    }
  }
  (*pm)["device_ids"] = devids;

  dout(10) << __func__ << " " << *pm << dendl;
}
6483
// Record that we want the mon to bump our up_thru to at least 'want',
// and (if this raises the wanted value) send the request now.
// Takes map_lock (read) for the osdmap lookup and holds it across
// mon_report_lock, matching the lock order used elsewhere.
void OSD::queue_want_up_thru(epoch_t want)
{
  map_lock.get_read();
  epoch_t cur = osdmap->get_up_thru(whoami);
  std::lock_guard l(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
             << ", currently " << cur
             << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asking for an equal or newer epoch; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
             << ", currently " << cur
             << dendl;
  }
  map_lock.put_read();
}
6502
6503 void OSD::send_alive()
6504 {
6505 ceph_assert(mon_report_lock.is_locked());
6506 if (!osdmap->exists(whoami))
6507 return;
6508 epoch_t up_thru = osdmap->get_up_thru(whoami);
6509 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6510 if (up_thru_wanted > up_thru) {
6511 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6512 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6513 }
6514 }
6515
6516 void OSD::request_full_map(epoch_t first, epoch_t last)
6517 {
6518 dout(10) << __func__ << " " << first << ".." << last
6519 << ", previously requested "
6520 << requested_full_first << ".." << requested_full_last << dendl;
6521 ceph_assert(osd_lock.is_locked());
6522 ceph_assert(first > 0 && last > 0);
6523 ceph_assert(first <= last);
6524 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6525 if (requested_full_first == 0) {
6526 // first request
6527 requested_full_first = first;
6528 requested_full_last = last;
6529 } else if (last <= requested_full_last) {
6530 // dup
6531 return;
6532 } else {
6533 // additional request
6534 first = requested_full_last + 1;
6535 requested_full_last = last;
6536 }
6537 MMonGetOSDMap *req = new MMonGetOSDMap;
6538 req->request_full(first, last);
6539 monc->send_mon_message(req);
6540 }
6541
6542 void OSD::got_full_map(epoch_t e)
6543 {
6544 ceph_assert(requested_full_first <= requested_full_last);
6545 ceph_assert(osd_lock.is_locked());
6546 if (requested_full_first == 0) {
6547 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6548 return;
6549 }
6550 if (e < requested_full_first) {
6551 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6552 << ".." << requested_full_last
6553 << ", ignoring" << dendl;
6554 return;
6555 }
6556 if (e >= requested_full_last) {
6557 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6558 << ".." << requested_full_last << ", resetting" << dendl;
6559 requested_full_first = requested_full_last = 0;
6560 return;
6561 }
6562
6563 requested_full_first = e + 1;
6564
6565 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6566 << ".." << requested_full_last
6567 << ", still need more" << dendl;
6568 }
6569
6570 void OSD::requeue_failures()
6571 {
6572 std::lock_guard l(heartbeat_lock);
6573 unsigned old_queue = failure_queue.size();
6574 unsigned old_pending = failure_pending.size();
6575 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6576 failure_queue[p->first] = p->second.first;
6577 failure_pending.erase(p++);
6578 }
6579 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6580 << failure_queue.size() << dendl;
6581 }
6582
// Drain failure_queue: send an MOSDFailure to the mon for each queued
// peer (unless a report for it is already pending) and remember it in
// failure_pending so it can be cancelled or requeued later.
// Caller holds map_lock and mon_report_lock; we take heartbeat_lock.
void OSD::send_failures()
{
  ceph_assert(map_lock.is_locked());
  ceph_assert(mon_report_lock.is_locked());
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // how long the peer has been unresponsive, in whole seconds
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
        new MOSDFailure(
          monc->get_fsid(),
          osd,
          osdmap->get_addrs(osd),
          failed_for,
          osdmap->get_epoch()));
      // remember failure time and addrs so we can cancel/requeue later
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
                                       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6606
6607 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6608 {
6609 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6610 MOSDFailure::FLAG_ALIVE);
6611 monc->send_mon_message(m);
6612 }
6613
6614 void OSD::cancel_pending_failures()
6615 {
6616 std::lock_guard l(heartbeat_lock);
6617 auto it = failure_pending.begin();
6618 while (it != failure_pending.end()) {
6619 dout(10) << __func__ << " canceling in-flight failure report for osd."
6620 << it->first << dendl;
6621 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6622 failure_pending.erase(it++);
6623 }
6624 }
6625
// Send an MOSDBeacon (with our min_last_epoch_clean info) to the mon,
// provided the mon quorum supports it (luminous feature).  'now' is the
// caller's coarse monotonic timestamp, recorded as last_sent_beacon.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // build the beacon under the lock; send it after releasing
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6647
6648 void OSD::handle_command(MMonCommand *m)
6649 {
6650 if (!require_mon_peer(m)) {
6651 m->put();
6652 return;
6653 }
6654
6655 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6656 command_wq.queue(c);
6657 m->put();
6658 }
6659
// Handle an MCommand sent directly over a client connection.  Requires
// an authenticated session with allow-all caps; mons must use
// MMonCommand instead, so MCommand from a mon source is rejected too.
void OSD::handle_command(MCommand *m)
{
  ConnectionRef con = m->get_connection();
  auto priv = con->get_priv();
  auto session = static_cast<Session *>(priv.get());
  if (!session) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  OSDCap& caps = session->caps;
  priv.reset();  // drop the session ref; we only needed the caps

  // reject unless caps allow everything, and reject mon-sourced MCommand
  if (!caps.allow_all() || m->get_source().is_mon()) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
  command_wq.queue(c);

  m->put();
}
6685
// Descriptor for one OSD tell/pg command: the parse signature consumed
// by the command parser, help text, owning module, and required
// permission string.
struct OSDCommand {
  string cmdstring;    // command signature (name + typed parameters)
  string helpstring;   // help text shown to the user
  string module;       // owning module ("osd")
  string perm;         // required permissions, e.g. "r", "rw", "rwx"
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm) \
  {parsesig, helptext, module, perm},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
        "name=pgid,type=CephPgid " \
        "name=cmd,type=CephChoices,strings=query", \
        "show details of a specific pg", "osd", "r")
COMMAND("pg " \
        "name=pgid,type=CephPgid " \
        "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
        "name=mulcmd,type=CephChoices,strings=revert|delete", \
        "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
        "osd", "rw")
COMMAND("pg " \
        "name=pgid,type=CephPgid " \
        "name=cmd,type=CephChoices,strings=list_unfound " \
        "name=offset,type=CephString,req=false",
        "list unfound objects on this pg, perhaps starting at an offset given in JSON",
        "osd", "r")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
        "show details of a specific pg", "osd", "r")
COMMAND("mark_unfound_lost " \
        "name=mulcmd,type=CephChoices,strings=revert|delete", \
        "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
        "osd", "rw")
COMMAND("list_unfound " \
        "name=offset,type=CephString,req=false",
        "list unfound objects on this pg, perhaps starting at an offset given in JSON",
        "osd", "r")
COMMAND("perf histogram dump "
        "name=logger,type=CephString,req=false "
        "name=counter,type=CephString,req=false",
        "Get histogram data",
        "osd", "r")

// tell <osd.n> commands. Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
COMMAND("injectargs " \
        "name=injected_args,type=CephString,n=N",
        "inject configuration arguments into running OSD",
        "osd", "rw")
COMMAND("config set " \
        "name=key,type=CephString name=value,type=CephString",
        "Set a configuration option at runtime (not persistent)",
        "osd", "rw")
COMMAND("config get " \
        "name=key,type=CephString",
        "Get a configuration option at runtime",
        "osd", "r")
COMMAND("config unset " \
        "name=key,type=CephString",
        "Unset a configuration option at runtime (not persistent)",
        "osd", "rw")
COMMAND("cluster_log " \
        "name=level,type=CephChoices,strings=error,warning,info,debug " \
        "name=message,type=CephString,n=N",
        "log a message to the cluster log",
        "osd", "rw")
COMMAND("bench " \
        "name=count,type=CephInt,req=false " \
        "name=size,type=CephInt,req=false " \
        "name=object_size,type=CephInt,req=false " \
        "name=object_num,type=CephInt,req=false ", \
        "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
        "(default count=1G default size=4MB). Results in log.",
        "osd", "rw")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
COMMAND("heap " \
        "name=heapcmd,type=CephChoices,strings="\
        "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
        "name=value,type=CephString,req=false",
        "show heap usage info (available only if compiled with tcmalloc)",
        "osd", "rw")
COMMAND("debug dump_missing " \
        "name=filename,type=CephFilepath",
        "dump missing objects to a named file", "osd", "r")
COMMAND("debug kick_recovery_wq " \
        "name=delay,type=CephInt,range=0",
        "set osd_recovery_delay_start to <val>", "osd", "rw")
COMMAND("cpu_profiler " \
        "name=arg,type=CephChoices,strings=status|flush",
        "run cpu profiling on daemon", "osd", "rw")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
        "osd", "r")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
        "osd", "rw")
COMMAND("compact",
        "compact object store's omap. "
        "WARNING: Compaction probably slows your requests",
        "osd", "rw")
COMMAND("smart name=devid,type=CephString,req=False",
        "runs smartctl on this osd devices. ",
        "osd", "rw")
COMMAND("cache drop",
        "Drop all OSD caches",
        "osd", "rwx")
COMMAND("cache status",
        "Get OSD caches statistics",
        "osd", "r")
COMMAND("send_beacon",
        "Send OSD beacon to mon immediately",
        "osd", "r")
};
6805
// Entry point for MCommand-style 'ceph tell osd.N ...' commands.
// Parses the raw command vector into a cmdmap, executes it via
// _do_command(), and sends an MCommandReply back on @con (if any).
// @data carries any input payload; command output accumulates in odata.
void OSD::do_command(
  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  int r = 0;
  stringstream ss, ds;  // ss: status text; ds: command output (appended to odata)
  bufferlist odata;
  cmdmap_t cmdmap;
  if (cmd.empty()) {
    // NOTE(review): replies with r=0 despite the error text — confirm no
    // caller depends on a nonzero code for an empty command.
    ss << "no command given";
    goto out;
  }
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  try {
    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
  } catch (const bad_cmd_get& e) {
    // thrown by cmd_getval() on a missing/mistyped command argument
    r = -EINVAL;
    ss << e.what();
  }
  if (r == -EAGAIN) {
    // reply will be sent asynchronously (e.g. by the PG); do not reply here
    return;
  }
 out:
  string rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6845
6846 namespace {
6847 class unlock_guard {
6848 Mutex& m;
6849 public:
6850 explicit unlock_guard(Mutex& mutex)
6851 : m(mutex)
6852 {
6853 m.unlock();
6854 }
6855 unlock_guard(unlock_guard&) = delete;
6856 ~unlock_guard() {
6857 m.lock();
6858 }
6859 };
6860 }
6861
// Execute one parsed OSD command from @cmdmap.
// Returns 0 or a negative errno; -EAGAIN means the reply will be sent
// asynchronously and the caller must not reply.  Status text goes to @ss,
// command output to @ds / @odata.  Runs with osd_lock held; commands that
// mutate configuration drop it temporarily via unlock_guard.
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the osd_commands[] table as JSON for the CLI
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();  // deliberately shadows outer 'f'
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
         cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
                           secname.str(), cp->cmdstring, cp->helpstring,
                           cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section(); // command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  // optional output formatter; f stays null when no format was requested
  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // re-join the arg vector into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while applying config to avoid lock-order issues
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    // runtime-only config override (not persisted to the mon)
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0; // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    // emit an arbitrary message into the cluster log at a chosen level
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
           prefix == "query" ||
           prefix == "mark_unfound_lost" ||
           prefix == "list_unfound"
           ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      // resolve the raw pg to its primary shard and take the PG lock
      if (osdmap->get_primary_shard(pgid, &pcand) &&
          (pg = _lookup_lock_pg(pcand))) {
        if (pg->is_primary()) {
          // simulate pg <pgid> cmd= for pg->do-command
          if (prefix != "pg")
            cmd_putval(cct, cmdmap, "cmd", prefix);
          try {
            r = pg->do_command(cmdmap, ss, data, odata, con, tid);
          } catch (const bad_cmd_get& e) {
            pg->unlock();
            ss << e.what();
            return -EINVAL;
          }
          if (r == -EAGAIN) {
            pg->unlock();
            // don't reply, pg will do so async
            return -EAGAIN;
          }
        } else {
          ss << "not primary for pgid " << pgid;

          // send them the latest diff to ensure they realize the mapping
          // has changed.
          service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

          // do not reply; they will get newer maps and realize they
          // need to resend.
          pg->unlock();
          return -EAGAIN;
        }
        pg->unlock();
      } else {
        ss << "i don't have pgid " << pgid;
        r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    // synthetic objectstore write benchmark; results go to the log/output
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
         << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
        bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
        ss << "'count' values greater than " << max_count
           << " for a block size of " << byte_u_t(bsize) << ", assuming "
           << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
           << " for " << duration << " seconds,"
           << " can cause ill effects on osd. "
           << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
           << " value if you wish to use a higher 'count'.";
        r = -EINVAL;
        goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
        cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
        ss << "'count' values greater than " << max_count
           << " for a block size of " << byte_u_t(bsize) << ", assuming "
           << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
           << " for " << duration << " seconds,"
           << " can cause ill effects on osd. "
           << " Please adjust 'osd_bench_large_size_max_throughput'"
           << " with a higher value if you wish to use a higher 'count'.";
        r = -EINVAL;
        goto out;
      }
    }

    // clamp block size to the object size so offsets stay in range below
    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create a fixed pool of objects to rewrite at random offsets
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
        char nm[30];
        snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
        object_t oid(nm);
        hobject_t soid(sobject_t(oid, 0));
        ObjectStore::Transaction t;
        t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
        store->queue_transaction(service.meta_ch, std::move(t), NULL);
        cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    // wait for any pending transactions (incl. the pre-created objects)
    // to commit before starting the clock
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
        waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
        // rewrite random offsets within the pre-created object pool
        snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
        offset = rand() % (osize / bsize) * bsize;
      } else {
        // otherwise write one fresh object per block
        snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
        cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    // wait for all benchmark writes to commit, then stop the clock
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
        waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
        waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;      // bytes per second
    double iops = rate / bsize;         // block writes per second
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
         << " in blocks of " << byte_u_t(bsize) << " in "
         << elapsed << " sec at " << byte_u_t(rate) << "/sec "
         << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    // NOTE(review): dereferences f unconditionally — if no 'format' was
    // given and Formatter::create() returned null this is a null deref;
    // confirm Formatter::create's behavior for an empty format string.
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
         << "osd_recovery_delay_start to '" << delay << "': error "
         << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    // only produces output when a formatter was requested
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
        f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    // synchronous objectstore (omap) compaction; can take a long time
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
            << duration
            << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    // report objectcontext counts (summed across PGs) plus store stats
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
7341
7342 void OSD::probe_smart(const string& only_devid, ostream& ss)
7343 {
7344 set<string> devnames;
7345 store->get_devices(&devnames);
7346 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7347 "osd_smart_report_timeout");
7348
7349 // == typedef std::map<std::string, mValue> mObject;
7350 json_spirit::mObject json_map;
7351
7352 for (auto dev : devnames) {
7353 // smartctl works only on physical devices; filter out any logical device
7354 if (dev.find("dm-") == 0) {
7355 continue;
7356 }
7357
7358 string err;
7359 string devid = get_device_id(dev, &err);
7360 if (devid.size() == 0) {
7361 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7362 << err << "), skipping" << dendl;
7363 continue;
7364 }
7365 if (only_devid.size() && devid != only_devid) {
7366 continue;
7367 }
7368
7369 json_spirit::mValue smart_json;
7370 if (block_device_get_metrics(dev, smart_timeout,
7371 &smart_json)) {
7372 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7373 continue;
7374 }
7375 json_map[devid] = smart_json;
7376 }
7377 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7378 }
7379
7380 bool OSD::heartbeat_dispatch(Message *m)
7381 {
7382 dout(30) << "heartbeat_dispatch " << m << dendl;
7383 switch (m->get_type()) {
7384
7385 case CEPH_MSG_PING:
7386 dout(10) << "ping from " << m->get_source_inst() << dendl;
7387 m->put();
7388 break;
7389
7390 case MSG_OSD_PING:
7391 handle_osd_ping(static_cast<MOSDPing*>(m));
7392 break;
7393
7394 default:
7395 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7396 m->put();
7397 }
7398
7399 return true;
7400 }
7401
// Slow-path dispatcher for messages that are not fast-dispatchable.
// Always claims the message (returns true).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  // MARK_ME_DOWN ack is handled without taking osd_lock so shutdown can
  // make progress even while the lock is held elsewhere
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.Lock();
  if (is_stopping()) {
    // shutting down: drop the message
    osd_lock.Unlock();
    m->put();
    return true;
  }

  // drain previously-deferred ops first so ordering is preserved,
  // then handle this message under osd_lock
  do_waiters();
  _dispatch(m);

  osd_lock.Unlock();

  return true;
}
7427
7428 void OSD::maybe_share_map(
7429 Session *session,
7430 OpRequestRef op,
7431 OSDMapRef osdmap)
7432 {
7433 if (!op->check_send_map) {
7434 return;
7435 }
7436 epoch_t last_sent_epoch = 0;
7437
7438 session->sent_epoch_lock.lock();
7439 last_sent_epoch = session->last_sent_epoch;
7440 session->sent_epoch_lock.unlock();
7441
7442 // assume the peer has the newer of the op's sent_epoch and what
7443 // we think we sent them.
7444 epoch_t from = std::max(last_sent_epoch, op->sent_epoch);
7445
7446 const Message *m = op->get_req();
7447 service.share_map(
7448 m->get_source(),
7449 m->get_connection().get(),
7450 from,
7451 osdmap,
7452 session ? &last_sent_epoch : NULL);
7453
7454 session->sent_epoch_lock.lock();
7455 if (session->last_sent_epoch < last_sent_epoch) {
7456 session->last_sent_epoch = last_sent_epoch;
7457 }
7458 session->sent_epoch_lock.unlock();
7459
7460 op->check_send_map = false;
7461 }
7462
// Drain the session's queue of ops that were waiting for a newer osdmap,
// enqueueing every op whose min_epoch is now satisfied by @osdmap.
// Caller must hold session->session_dispatch_lock.
void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
{
  ceph_assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    // constructing the OpRequestRef from the list element takes a new ref
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // queue is ordered; everything after this also needs a newer map
      break;
    }
    session->waiting_on_map.erase(i++);
    // drop the reference the intrusive list held (taken via op->get()
    // when the op was queued in ms_fast_dispatch)
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp has no spg_t; map the raw pg through the osdmap
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// no primary shard in this map; drop the op (client will resend)
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // keep the session registered while ops are still waiting on a map
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7498
// Fast-dispatch entry point: handles messages directly on the messenger
// thread without taking osd_lock.  Peering messages are turned into
// peering events; client ops are tracked and enqueued to the op queue.
// Takes ownership of @m on all paths.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else is a client/replica op: register it with the op tracker
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref held by the session's intrusive waiting list; dropped in
      // dispatch_session_waiting() when the op is removed from the list
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7593
7594 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7595 {
7596 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7597
7598 if (is_stopping()) {
7599 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7600 return false;
7601 }
7602
7603 if (dest_type == CEPH_ENTITY_TYPE_MON)
7604 return true;
7605
7606 *authorizer = monc->build_authorizer(dest_type);
7607 return *authorizer != NULL;
7608 }
7609
7610 KeyStore *OSD::ms_get_auth1_authorizer_keystore()
7611 {
7612 return monc->rotating_secrets.get();
7613 }
7614
// Called after a connection authenticates: ensure it has a Session and
// populate the session's caps from the peer's AuthCapsInfo.
// Returns 1 when caps were parsed successfully, -EPERM when the caps
// string could not be decoded or parsed, and 0 otherwise (no caps string
// present — presumably the caller treats that as "not yet authorized";
// confirm against the messenger's handling).
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto priv = con->get_priv();
  Session *s = static_cast<Session*>(priv.get());
  if (!s) {
    // first time we see this connection: create and attach a session
    s = new Session(cct, con);
    con->set_priv(RefCountedPtr{s, false});
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all)
    s->caps.set_allow_all();

  if (caps_info.caps.length() > 0) {
    // decode the encoded caps string, then parse it into OSDCaps
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EPERM;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EPERM;
      }
    }
  }
  return ret;
}
7664
7665 void OSD::do_waiters()
7666 {
7667 ceph_assert(osd_lock.is_locked());
7668
7669 dout(10) << "do_waiters -- start" << dendl;
7670 while (!finished.empty()) {
7671 OpRequestRef next = finished.front();
7672 finished.pop_front();
7673 dispatch_op(next);
7674 }
7675 dout(10) << "do_waiters -- finish" << dendl;
7676 }
7677
7678 void OSD::dispatch_op(OpRequestRef op)
7679 {
7680 switch (op->get_req()->get_type()) {
7681
7682 case MSG_OSD_PG_CREATE:
7683 handle_pg_create(op);
7684 break;
7685 }
7686 }
7687
// Handle a slow-path message under osd_lock.  Messages are split into
// those that need no OSDMap and those that must wait until we have one.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
    // -- don't need OSDMap --

    // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // osd
  case MSG_OSD_SCRUB:
    // legacy scrub request (see handle_scrub; removed post-nautilus)
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

    // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest so it can be tracked/deferred
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map?  starting up?
      if (!osdmap) {
        // park the op until the first osdmap arrives
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7731
// remove me post-nautilus
// Legacy (pre-MOSDScrub2) scrub request from a mon or mgr: queue a
// RequestScrub peering event for the requested PGs (or all local PGs
// when none were named).  Consumes the message.
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    // ignore scrub requests from anything but a mon or mgr
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    // wrong cluster
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
	    << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs that map to a primary shard we host
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
	v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PG::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7774
7775 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7776 {
7777 dout(10) << __func__ << " " << *m << dendl;
7778 if (!require_mon_or_mgr_peer(m)) {
7779 m->put();
7780 return;
7781 }
7782 if (m->fsid != monc->get_fsid()) {
7783 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7784 << dendl;
7785 m->put();
7786 return;
7787 }
7788 for (auto pgid : m->scrub_pgs) {
7789 enqueue_peering_evt(
7790 pgid,
7791 PGPeeringEventRef(
7792 std::make_shared<PGPeeringEvent>(
7793 m->epoch,
7794 m->epoch,
7795 PG::RequestScrub(m->deep, m->repair))));
7796 }
7797 m->put();
7798 }
7799
7800 bool OSD::scrub_random_backoff()
7801 {
7802 bool coin_flip = (rand() / (double)RAND_MAX >=
7803 cct->_conf->osd_scrub_backoff_ratio);
7804 if (!coin_flip) {
7805 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7806 return true;
7807 }
7808 return false;
7809 }
7810
// Construct a scrub job for @pg whose last scrub stamp is @timestamp.
// Unless the scrub was explicitly requested (@must), push sched_time
// forward by the min interval plus a random jitter fraction of it, and
// set the deadline to last-stamp + max interval (pool-level intervals
// override the global osd_scrub_* config when > 0).
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    // sched_time = stamp + min_interval * (1 + randomize_ratio * r),
    // spreading scrubs out instead of clustering them at the interval
    sched_time += scrub_min_interval;
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (scrub_max_interval == 0) {
      // max interval disabled: no deadline
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7839
7840 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7841 if (sched_time < rhs.sched_time)
7842 return true;
7843 if (sched_time > rhs.sched_time)
7844 return false;
7845 return pgid < rhs.pgid;
7846 }
7847
// Return true if @now (local time) falls within both the configured
// scrub weekday window and the scrub hour window.  Each window supports
// wraparound: when begin >= end the allowed range wraps past the end of
// the week / day (e.g. hours 22..4).
bool OSD::scrub_time_permit(utime_t now)
{
  struct tm bdt;
  time_t tt = now.sec();
  localtime_r(&tt, &bdt);

  bool day_permit = false;
  if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
    // simple window: [begin, end)
    if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
      day_permit = true;
    }
  } else {
    // wrapped window: [begin, 7) U [0, end)
    if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
      day_permit = true;
    }
  }

  if (!day_permit) {
    dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
	     << " - " << cct->_conf->osd_scrub_end_week_day
	     << " now " << bdt.tm_wday << " = no" << dendl;
    return false;
  }

  bool time_permit = false;
  if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
    // simple window: [begin, end)
    if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
      time_permit = true;
    }
  } else {
    // wrapped window: [begin, 24) U [0, end)
    if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
      time_permit = true;
    }
  }
  if (!time_permit) {
    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
	     << " - " << cct->_conf->osd_scrub_end_hour
	     << " now " << bdt.tm_hour << " = no" << dendl;
  } else {
    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
	     << " - " << cct->_conf->osd_scrub_end_hour
	     << " now " << bdt.tm_hour << " = yes" << dendl;
  }
  return time_permit;
}
7893
7894 bool OSD::scrub_load_below_threshold()
7895 {
7896 double loadavgs[3];
7897 if (getloadavg(loadavgs, 3) != 3) {
7898 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7899 return false;
7900 }
7901
7902 // allow scrub if below configured threshold
7903 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7904 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7905 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7906 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7907 << " < max " << cct->_conf->osd_scrub_load_threshold
7908 << " = yes" << dendl;
7909 return true;
7910 }
7911
7912 // allow scrub if below daily avg and currently decreasing
7913 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7914 dout(20) << __func__ << " loadavg " << loadavgs[0]
7915 << " < daily_loadavg " << daily_loadavg
7916 << " and < 15m avg " << loadavgs[2]
7917 << " = yes" << dendl;
7918 return true;
7919 }
7920
7921 dout(20) << __func__ << " loadavg " << loadavgs[0]
7922 << " >= max " << cct->_conf->osd_scrub_load_threshold
7923 << " and ( >= daily_loadavg " << daily_loadavg
7924 << " or >= 15m avg " << loadavgs[2]
7925 << ") = no" << dendl;
7926 return false;
7927 }
7928
void OSD::sched_scrub()
{
  // Walk the scrub-job registry (ordered by sched_time) and start the first
  // eligible job.  Called periodically; relies on
  // OSDService::{first,next}_scrub_stamp() for iteration.

  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    // During recovery, config decides between: repair-only scheduling,
    // no scheduling at all, or (scrub_during_recovery) normal scheduling.
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
               << dendl;
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      // Jobs are ordered by sched_time, so once we hit one in the future
      // nothing later can be due either.
      if (scrub.sched_time > now) {
	// save ourselves some effort
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // A job whose deadline has already passed may run even outside the
      // permitted time window and under high load; otherwise both gates
      // must allow it.
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
	continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
	pg->unlock();
	dout(10) << __func__ << " skip " << scrub.pgid
		 << " because repairing is not explicitly requested on it"
		 << dendl;
	continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
	break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      // sched_scrub() returning true means the scrub was actually kicked
      // off; we only start one per invocation.
      if (pg->sched_scrub()) {
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
8007
8008 void OSD::resched_all_scrubs()
8009 {
8010 dout(10) << __func__ << ": start" << dendl;
8011 OSDService::ScrubJob scrub;
8012 if (service.first_scrub_stamp(&scrub)) {
8013 do {
8014 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
8015
8016 PGRef pg = _lookup_lock_pg(scrub.pgid);
8017 if (!pg)
8018 continue;
8019 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
8020 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
8021 pg->on_info_history_change();
8022 }
8023 pg->unlock();
8024 } while (service.next_scrub_stamp(scrub, &scrub));
8025 }
8026 dout(10) << __func__ << ": done" << dendl;
8027 }
8028
8029 MPGStats* OSD::collect_pg_stats()
8030 {
8031 // This implementation unconditionally sends every is_primary PG's
8032 // stats every time we're called. This has equivalent cost to the
8033 // previous implementation's worst case where all PGs are busy and
8034 // their stats are always enqueued for sending.
8035 RWLock::RLocker l(map_lock);
8036
8037 utime_t had_for = ceph_clock_now() - had_map_since;
8038 osd_stat_t cur_stat = service.get_osd_stat();
8039 cur_stat.os_perf_stat = store->get_cur_stats();
8040
8041 auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
8042 m->osd_stat = cur_stat;
8043
8044 std::lock_guard lec{min_last_epoch_clean_lock};
8045 min_last_epoch_clean = osdmap->get_epoch();
8046 min_last_epoch_clean_pgs.clear();
8047
8048 std::set<int64_t> pool_set;
8049 vector<PGRef> pgs;
8050 _get_pgs(&pgs);
8051 for (auto& pg : pgs) {
8052 auto pool = pg->pg_id.pgid.pool();
8053 pool_set.emplace((int64_t)pool);
8054 if (!pg->is_primary()) {
8055 continue;
8056 }
8057 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
8058 m->pg_stat[pg->pg_id.pgid] = s;
8059 min_last_epoch_clean = min(min_last_epoch_clean, lec);
8060 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
8061 });
8062 }
8063 store_statfs_t st;
8064 bool per_pool_stats = false;
8065 for (auto p : pool_set) {
8066 int r = store->pool_statfs(p, &st);
8067 if (r == -ENOTSUP) {
8068 break;
8069 } else {
8070 assert(r >= 0);
8071 m->pool_stat[p] = st;
8072 per_pool_stats = true;
8073 }
8074 }
8075
8076 // indicate whether we are reporting per-pool stats
8077 m->osd_stat.num_osds = 1;
8078 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
8079
8080 return m;
8081 }
8082
8083 vector<DaemonHealthMetric> OSD::get_health_metrics()
8084 {
8085 vector<DaemonHealthMetric> metrics;
8086 {
8087 utime_t oldest_secs;
8088 const utime_t now = ceph_clock_now();
8089 auto too_old = now;
8090 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
8091 int slow = 0;
8092 TrackedOpRef oldest_op;
8093 auto count_slow_ops = [&](TrackedOp& op) {
8094 if (op.get_initiated() < too_old) {
8095 lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
8096 << " initiated "
8097 << op.get_initiated() << dendl;
8098 slow++;
8099 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
8100 oldest_op = &op;
8101 }
8102 return true;
8103 } else {
8104 return false;
8105 }
8106 };
8107 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
8108 if (slow) {
8109 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
8110 << oldest_op->get_desc() << dendl;
8111 }
8112 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
8113 } else {
8114 // no news is not good news.
8115 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
8116 }
8117 }
8118 {
8119 std::lock_guard l(pending_creates_lock);
8120 auto n_primaries = pending_creates_from_mon;
8121 for (const auto& create : pending_creates_from_osd) {
8122 if (create.second) {
8123 n_primaries++;
8124 }
8125 }
8126 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
8127 }
8128 return metrics;
8129 }
8130
8131 // =====================================================
8132 // MAP
8133
8134 void OSD::wait_for_new_map(OpRequestRef op)
8135 {
8136 // ask?
8137 if (waiting_for_osdmap.empty()) {
8138 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8139 }
8140
8141 logger->inc(l_osd_waiting_for_map);
8142 waiting_for_osdmap.push_back(op);
8143 op->mark_delayed("wait for new map");
8144 }
8145
8146
8147 /** update_map
8148 * assimilate new OSDMap(s). scan pgs, etc.
8149 */
8150
8151 void OSD::note_down_osd(int peer)
8152 {
8153 ceph_assert(osd_lock.is_locked());
8154 cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
8155
8156 heartbeat_lock.Lock();
8157 failure_queue.erase(peer);
8158 failure_pending.erase(peer);
8159 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
8160 if (p != heartbeat_peers.end()) {
8161 p->second.con_back->mark_down();
8162 if (p->second.con_front) {
8163 p->second.con_front->mark_down();
8164 }
8165 heartbeat_peers.erase(p);
8166 }
8167 heartbeat_lock.Unlock();
8168 }
8169
void OSD::note_up_osd(int peer)
{
  // A peer came (back) up: forget any stale per-peer epoch we cached for it
  // and flag the heartbeat peer set for a refresh.
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
8175
// Completion context registered on the objectstore transaction that persists
// a batch of new osdmaps; once the maps are durable it lets the OSD advance
// to them via _committed_osd_maps().
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was committed
  MOSDMap *msg;         // message ref held until finish(); released there
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
8187
8188 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
8189 {
8190 std::lock_guard l(osdmap_subscribe_lock);
8191 if (latest_subscribed_epoch >= epoch && !force_request)
8192 return;
8193
8194 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
8195
8196 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
8197 force_request) {
8198 monc->renew_subs();
8199 }
8200 }
8201
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // Delete stored full+incremental osdmaps older than both `oldest` and the
  // lower bound still pinned by the map cache, advancing
  // superblock.oldest_map as we go.  `nreceived` is how many maps just
  // arrived; trimming at least that many per call keeps deletion in pace
  // with ingest.
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // Flush a batch once it is both big enough for a transaction and has
    // kept pace with the maps we just received.
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  // Flush whatever is left in the final partial batch.
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
8241
void OSD::handle_osd_map(MOSDMap *m)
{
  // Ingest a batch of new osdmaps: throttle against slow PGs, validate the
  // sender, persist the full/incremental maps to the meta collection,
  // record pool-deletion and pg_num history, then queue the transaction.
  // The actual switch to the new maps happens in _committed_osd_maps()
  // once the transaction commits (via C_OnMapCommit).  Called with
  // osd_lock held; consumes the message ref on every return path.

  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    if (osd_min > 0 &&
	osdmap->get_epoch() > max_lag &&
	osdmap->get_epoch() - max_lag > osd_min) {
      epoch_t need = osdmap->get_epoch() - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocked so PGs can make progress
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only mons and osds are trusted sources of osdmaps
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // t.get_num_bytes() must grow monotonically as we append writes;
    // a decrease would indicate an internal overflow.
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map: persist and cache it as-is
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply on top of e-1 (from store or this same batch)
      // and persist both the inc and the re-derived full map.
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection for crc-mismatch handling below
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// our re-derived full map doesn't match the mon's crc; fall back
	// to requesting full maps from e onward and stop ingesting here.
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;
	break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
	service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    // pools that exist in the new map but not the old one
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8539
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  // Called (from C_OnMapCommit) once the osdmaps [first,last] are durable.
  // Advances the in-memory osdmap epoch by epoch, reacting to peers going
  // up/down, then handles what the newest map says about *us*: become
  // active, restart (rebind and reboot), or shut down.  Caller still holds
  // a ref on `m`; it is released by C_OnMapCommit::finish after we return.
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with the commit
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap); // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) { // but not the new one
	if (!waited_for_reservations) {
	  // only wait once per epoch, before the first note_down_osd()
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared. it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = newmap;
    // record our up/boot epochs the first time the map shows us up at our
    // current address
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active once the map shows us up at our bound address
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
			  // everything paused
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // marked down, or one of our advertised addresses is wrong: log why
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
	    "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// prepare to rebind and reboot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	//clear all out-of-date log
	while (!osd_markdown_log.empty() &&
	       osd_markdown_log.front() + grace < now)
	  osd_markdown_log.pop_front();
	// too many markdowns within the grace window: give up and shut down
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  dout(0) << __func__ << " marked down "
		  << osd_markdown_log.size()
		  << " > osd_max_markdown_count "
		  << cct->_conf->osd_max_markdown_count
		  << " in last " << grace << " seconds, shutting down"
		  << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_meesneger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
	hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true; // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind cluster_messenger failed" << dendl;
	}

	r = hb_back_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true; // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_back_server_messenger failed" << dendl;
	}

	r = hb_front_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true; // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_front_server_messenger failed" << dendl;
	}

	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  // post-advance actions; at most one of these branches fires
  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8801
// Recompute messenger feature requirements and on-disk compat bits after an
// osdmap change: adjust required feature masks for client, mon, and osd
// messenger policies; persist the SHARDS (erasure-code) incompat flag once
// required; relax heartbeat authorizer requirements on pre-nautilus
// clusters; and record require_osd_release in the store meta when it changes.
8802 void OSD::check_osdmap_features()
8803 {
8804 // adjust required feature bits?
8805 
8806 // we have to be a bit careful here, because we are accessing the
8807 // Policy structures without taking any lock. in particular, only
8808 // modify integer values that can safely be read by a racing CPU.
8809 // since we are only accessing existing Policy structures at their
8810 // current memory location, and setting or clearing bits in integer
8811 // fields, and we are the only writer, this is not a problem.
8812 
// clients: adjust the required-features bits on the default policy
8813 {
8814 Messenger::Policy p = client_messenger->get_default_policy();
8815 uint64_t mask;
8816 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8817 if ((p.features_required & mask) != features) {
8818 dout(0) << "crush map has features " << features
8819 << ", adjusting msgr requires for clients" << dendl;
8820 p.features_required = (p.features_required & ~mask) | features;
8821 client_messenger->set_default_policy(p);
8822 }
8823 }
// mons: adjust the per-type policy on the client messenger
8824 {
8825 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8826 uint64_t mask;
8827 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8828 if ((p.features_required & mask) != features) {
8829 dout(0) << "crush map has features " << features
8830 << " was " << p.features_required
8831 << ", adjusting msgr requires for mons" << dendl;
8832 p.features_required = (p.features_required & ~mask) | features;
8833 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8834 }
8835 }
// osds: adjust the cluster messenger policy, and persist the SHARDS
// superblock compat bit (written once, via a meta-collection transaction)
8836 {
8837 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8838 uint64_t mask;
8839 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8840 
8841 if ((p.features_required & mask) != features) {
8842 dout(0) << "crush map has features " << features
8843 << ", adjusting msgr requires for osds" << dendl;
8844 p.features_required = (p.features_required & ~mask) | features;
8845 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8846 }
8847 
8848 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8849 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8850 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8851 ObjectStore::Transaction t;
8852 write_superblock(t);
8853 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8854 ceph_assert(err == 0);
8855 }
8856 }
8857 
// pre-nautilus clusters cannot supply heartbeat authorizers
8858 if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
8859 heartbeat_dispatcher.ms_set_require_authorizer(false);
8860 }
8861 
// persist require_osd_release to store meta so external tools can read it
// without decoding a full osdmap
8862 if (osdmap->require_osd_release != last_require_osd_release) {
8863 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8864 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8865 store->write_meta("require_osd_release",
8866 stringify((int)osdmap->require_osd_release));
8867 last_require_osd_release = osdmap->require_osd_release;
8868 }
8869 }
8870
8871 struct C_FinishSplits : public Context {
8872 OSD *osd;
8873 set<PGRef> pgs;
8874 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8875 : osd(osd), pgs(in) {}
8876 void finish(int r) override {
8877 osd->_finish_splits(pgs);
8878 }
8879 };
8880
8881 void OSD::_finish_splits(set<PGRef>& pgs)
8882 {
8883 dout(10) << __func__ << " " << pgs << dendl;
8884 if (is_stopping())
8885 return;
8886 PG::RecoveryCtx rctx = create_context();
8887 for (set<PGRef>::iterator i = pgs.begin();
8888 i != pgs.end();
8889 ++i) {
8890 PG *pg = i->get();
8891
8892 pg->lock();
8893 dout(10) << __func__ << " " << *pg << dendl;
8894 epoch_t e = pg->get_osdmap_epoch();
8895 pg->handle_initialize(&rctx);
8896 pg->queue_null(e, e);
8897 dispatch_context_transaction(rctx, pg);
8898 pg->unlock();
8899
8900 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8901 shards[shard_index]->register_and_wake_split_child(pg);
8902 }
8903
8904 dispatch_context(rctx, 0, service.get_osdmap());
8905 };
8906
8907 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8908 unsigned need)
8909 {
8910 std::lock_guard l(merge_lock);
8911 auto& p = merge_waiters[nextmap->get_epoch()][target];
8912 p[src->pg_id] = src;
8913 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8914 << " for " << target << ", have " << p.size() << "/" << need
8915 << dendl;
8916 return p.size() == need;
8917 }
8918
// Walk `pg` forward one map at a time from its current epoch to
// osd_epoch, handling pool pg_num changes along the way: a merge source
// detaches itself and parks on the merge_waiters list; a merge target
// absorbs its sources once they have all arrived; a split spawns child
// PGs (finished later via C_FinishSplits).  Returns true when the PG
// was fully advanced, false when it was consumed (merge) or must wait.
// Caller must hold the PG lock; on the false paths the lock has been
// released before return.
8919 bool OSD::advance_pg(
8920 epoch_t osd_epoch,
8921 PG *pg,
8922 ThreadPool::TPHandle &handle,
8923 PG::RecoveryCtx *rctx)
8924 {
8925 if (osd_epoch <= pg->get_osdmap_epoch()) {
8926 return true;
8927 }
8928 ceph_assert(pg->is_locked());
8929 OSDMapRef lastmap = pg->get_osdmap();
8930 ceph_assert(lastmap->get_epoch() < osd_epoch);
8931 set<PGRef> new_pgs; // any split children
8932 bool ret = true;
8933 
8934 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8935 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8936 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8937 next_epoch <= osd_epoch;
8938 ++next_epoch) {
// skip epochs whose full map is not available locally; the PG simply
// jumps to the next map we do have
8939 OSDMapRef nextmap = service.try_get_map(next_epoch);
8940 if (!nextmap) {
8941 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8942 continue;
8943 }
8944 
8945 unsigned new_pg_num =
8946 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8947 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8948 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8949 // check for merge
8950 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8951 spg_t parent;
8952 if (pg->pg_id.is_merge_source(
8953 old_pg_num,
8954 new_pg_num,
8955 &parent)) {
8956 // we are merge source
8957 PGRef spg = pg; // carry a ref
8958 dout(1) << __func__ << " " << pg->pg_id
8959 << " is merge source, target is " << parent
8960 << dendl;
8961 pg->write_if_dirty(rctx);
8962 dispatch_context_transaction(*rctx, pg, &handle);
8963 pg->ch->flush();
8964 // release backoffs explicitly, since the on_shutdown path
8965 // aggressively tears down backoff state.
8966 if (pg->is_primary()) {
8967 pg->release_pg_backoffs();
8968 }
8969 pg->on_shutdown();
8970 OSDShard *sdata = pg->osd_shard;
8971 {
8972 std::lock_guard l(sdata->shard_lock);
8973 if (pg->pg_slot) {
8974 sdata->_detach_pg(pg->pg_slot);
8975 // update pg count now since we might not get an osdmap
8976 // any time soon.
8977 if (pg->is_primary())
8978 logger->dec(l_osd_pg_primary);
8979 else if (pg->is_replica())
8980 logger->dec(l_osd_pg_replica);
8981 else
8982 logger->dec(l_osd_pg_stray);
8983 }
8984 }
8985 pg->unlock();
8986 
// park ourselves on the target's waiter list; the last source to
// arrive wakes the target with a null event
8987 set<spg_t> children;
8988 parent.is_split(new_pg_num, old_pg_num, &children);
8989 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8990 enqueue_peering_evt(
8991 parent,
8992 PGPeeringEventRef(
8993 std::make_shared<PGPeeringEvent>(
8994 nextmap->get_epoch(),
8995 nextmap->get_epoch(),
8996 NullEvt())));
8997 }
8998 ret = false;
8999 goto out;
9000 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
9001 // we are merge target
9002 set<spg_t> children;
9003 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
9004 dout(20) << __func__ << " " << pg->pg_id
9005 << " is merge target, sources are " << children
9006 << dendl;
9007 map<spg_t,PGRef> sources;
9008 {
// claim the sources atomically; only proceed when all have arrived
9009 std::lock_guard l(merge_lock);
9010 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
9011 unsigned need = children.size();
9012 dout(20) << __func__ << " have " << s.size() << "/"
9013 << need << dendl;
9014 if (s.size() == need) {
9015 sources.swap(s);
9016 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
9017 if (merge_waiters[nextmap->get_epoch()].empty()) {
9018 merge_waiters.erase(nextmap->get_epoch());
9019 }
9020 }
9021 }
9022 if (!sources.empty()) {
9023 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
9024 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
9025 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
9026 pg->merge_from(
9027 sources, rctx, split_bits,
9028 nextmap->get_pg_pool(
9029 pg->pg_id.pool())->last_pg_merge_meta);
9030 pg->pg_slot->waiting_for_merge_epoch = 0;
9031 } else {
9032 dout(20) << __func__ << " not ready to merge yet" << dendl;
9033 pg->write_if_dirty(rctx);
9034 pg->unlock();
9035 // kick source(s) to get them ready
9036 for (auto& i : children) {
9037 dout(20) << __func__ << " kicking source " << i << dendl;
9038 enqueue_peering_evt(
9039 i,
9040 PGPeeringEventRef(
9041 std::make_shared<PGPeeringEvent>(
9042 nextmap->get_epoch(),
9043 nextmap->get_epoch(),
9044 NullEvt())));
9045 }
9046 ret = false;
9047 goto out;
9048 }
9049 }
9050 }
9051 }
9052 
// normal advance: feed the new map (and new up/acting sets) to the PG
9053 vector<int> newup, newacting;
9054 int up_primary, acting_primary;
9055 nextmap->pg_to_up_acting_osds(
9056 pg->pg_id.pgid,
9057 &newup, &up_primary,
9058 &newacting, &acting_primary);
9059 pg->handle_advance_map(
9060 nextmap, lastmap, newup, up_primary,
9061 newacting, acting_primary, rctx);
9062 
// reschedule scrubs if the pool's scrub interval options changed
9063 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
9064 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
9065 if (oldpool != lastmap->get_pools().end()
9066 && newpool != nextmap->get_pools().end()) {
9067 dout(20) << __func__
9068 << " new pool opts " << newpool->second.opts
9069 << " old pool opts " << oldpool->second.opts
9070 << dendl;
9071 
9072 double old_min_interval = 0, new_min_interval = 0;
9073 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
9074 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
9075 
9076 double old_max_interval = 0, new_max_interval = 0;
9077 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
9078 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
9079 
9080 // Assume if an interval is changed from set to unset or vice versa the actual config
9081 // is different. Keep it simple even if it is possible to call resched_all_scrub()
9082 // unnecessarily.
9083 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
9084 pg->on_info_history_change();
9085 }
9086 }
9087 
9088 if (new_pg_num && old_pg_num != new_pg_num) {
9089 // check for split
9090 set<spg_t> children;
9091 if (pg->pg_id.is_split(
9092 old_pg_num,
9093 new_pg_num,
9094 &children)) {
9095 split_pgs(
9096 pg, children, &new_pgs, lastmap, nextmap,
9097 rctx);
9098 }
9099 }
9100 
9101 lastmap = nextmap;
9102 old_pg_num = new_pg_num;
9103 handle.reset_tp_timeout();
9104 }
9105 pg->handle_activate_map(rctx);
9106 
9107 ret = true;
9108 out:
// split children are finished asynchronously once the transaction applies
9109 if (!new_pgs.empty()) {
9110 rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
9111 }
9112 return ret;
9113 }
9114
// Publish the current osdmap to the rest of the OSD: prime pending
// splits/merges on every shard, prune stale create/merge bookkeeping,
// let each shard consume the map, refresh PG counters, wake sessions
// waiting on the map, and queue null peering events so every PG
// advances to the new epoch.  Caller must hold osd_lock.
9115 void OSD::consume_map()
9116 {
9117 ceph_assert(osd_lock.is_locked());
9118 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
9119 
9120 /** make sure the cluster is speaking in SORTBITWISE, because we don't
9121 * speak the older sorting version any more. Be careful not to force
9122 * a shutdown if we are merely processing old maps, though.
9123 */
9124 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
9125 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
9126 ceph_abort();
9127 }
9128 
9129 service.pre_publish_map(osdmap);
9130 service.await_reserved_maps();
9131 service.publish_map(osdmap);
9132 
9133 // prime splits and merges
9134 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
9135 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
9136 for (auto& shard : shards) {
9137 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
9138 }
9139 if (!newly_split.empty()) {
9140 for (auto& shard : shards) {
9141 shard->prime_splits(osdmap, &newly_split);
9142 }
// prime_splits consumes the entries that map to each shard; by the end
// every split must have been claimed by some shard
9143 ceph_assert(newly_split.empty());
9144 }
9145 
9146 // prune sent_ready_to_merge
9147 service.prune_sent_ready_to_merge(osdmap);
9148 
9149 // FIXME, maybe: We could race against an incoming peering message
9150 // that instantiates a merge PG after identify_merges() below and
9151 // never set up its peer to complete the merge. An OSD restart
9152 // would clear it up. This is a hard race to resolve,
9153 // extraordinarily rare (we only merge PGs that are stable and
9154 // clean, so it'd have to be an imported PG to an OSD with a
9155 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
9156 // replace all of this with a seastar-based code soon anyway.
9157 if (!merge_pgs.empty()) {
9158 // mark the pgs we already have, or create new and empty merge
9159 // participants for those we are missing. do this all under the
9160 // shard lock so we don't have to worry about racing pg creates
9161 // via _process.
9162 for (auto& shard : shards) {
9163 shard->prime_merges(osdmap, &merge_pgs);
9164 }
9165 ceph_assert(merge_pgs.empty());
9166 }
9167 
9168 service.prune_pg_created();
9169 
9170 unsigned pushes_to_free = 0;
9171 for (auto& shard : shards) {
9172 shard->consume_map(osdmap, &pushes_to_free);
9173 }
9174 
9175 vector<spg_t> pgids;
9176 _get_pgids(&pgids);
9177 
9178 // count (FIXME, probably during seastar rewrite)
9179 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
9180 vector<PGRef> pgs;
9181 _get_pgs(&pgs);
9182 for (auto& pg : pgs) {
9183 // FIXME (probably during seastar rewrite): this is lockless and
9184 // racy, but we don't want to take pg lock here.
9185 if (pg->is_primary())
9186 num_pg_primary++;
9187 else if (pg->is_replica())
9188 num_pg_replica++;
9189 else
9190 num_pg_stray++;
9191 }
9192 
9193 {
9194 // FIXME (as part of seastar rewrite): move to OSDShard
// drop pending creates for PGs that no longer map to this OSD
9195 std::lock_guard l(pending_creates_lock);
9196 for (auto pg = pending_creates_from_osd.begin();
9197 pg != pending_creates_from_osd.end();) {
9198 if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
9199 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
9200 << "discarding pending_create_from_osd" << dendl;
9201 pg = pending_creates_from_osd.erase(pg);
9202 } else {
9203 ++pg;
9204 }
9205 }
9206 }
9207 
9208 service.maybe_inject_dispatch_delay();
9209 
9210 dispatch_sessions_waiting_on_map();
9211 
9212 service.maybe_inject_dispatch_delay();
9213 
9214 service.release_reserved_pushes(pushes_to_free);
9215 
9216 // queue null events to push maps down to individual PGs
9217 for (auto pgid : pgids) {
9218 enqueue_peering_evt(
9219 pgid,
9220 PGPeeringEventRef(
9221 std::make_shared<PGPeeringEvent>(
9222 osdmap->get_epoch(),
9223 osdmap->get_epoch(),
9224 NullEvt())));
9225 }
9226 logger->set(l_osd_pg, pgids.size());
9227 logger->set(l_osd_pg_primary, num_pg_primary);
9228 logger->set(l_osd_pg_replica, num_pg_replica);
9229 logger->set(l_osd_pg_stray, num_pg_stray);
9230 }
9231
9232 void OSD::activate_map()
9233 {
9234 ceph_assert(osd_lock.is_locked());
9235
9236 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
9237
9238 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
9239 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
9240 osdmap_subscribe(osdmap->get_epoch() + 1, false);
9241 }
9242
9243 // norecover?
9244 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
9245 if (!service.recovery_is_paused()) {
9246 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
9247 service.pause_recovery();
9248 }
9249 } else {
9250 if (service.recovery_is_paused()) {
9251 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
9252 service.unpause_recovery();
9253 }
9254 }
9255
9256 service.activate_map();
9257
9258 // process waiters
9259 take_waiters(waiting_for_osdmap);
9260 }
9261
9262 bool OSD::require_mon_peer(const Message *m)
9263 {
9264 if (!m->get_connection()->peer_is_mon()) {
9265 dout(0) << "require_mon_peer received from non-mon "
9266 << m->get_connection()->get_peer_addr()
9267 << " " << *m << dendl;
9268 return false;
9269 }
9270 return true;
9271 }
9272
9273 bool OSD::require_mon_or_mgr_peer(const Message *m)
9274 {
9275 if (!m->get_connection()->peer_is_mon() &&
9276 !m->get_connection()->peer_is_mgr()) {
9277 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9278 << m->get_connection()->get_peer_addr()
9279 << " " << *m << dendl;
9280 return false;
9281 }
9282 return true;
9283 }
9284
9285 bool OSD::require_osd_peer(const Message *m)
9286 {
9287 if (!m->get_connection()->peer_is_osd()) {
9288 dout(0) << "require_osd_peer received from non-osd "
9289 << m->get_connection()->get_peer_addr()
9290 << " " << *m << dendl;
9291 return false;
9292 }
9293 return true;
9294 }
9295
9296 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9297 {
9298 epoch_t up_epoch = service.get_up_epoch();
9299 if (epoch < up_epoch) {
9300 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9301 return false;
9302 }
9303
9304 if (!is_active()) {
9305 dout(7) << "still in boot state, dropping message " << *m << dendl;
9306 return false;
9307 }
9308
9309 return true;
9310 }
9311
9312 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
9313 bool is_fast_dispatch)
9314 {
9315 int from = m->get_source().num();
9316
9317 if (map->is_down(from) ||
9318 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9319 dout(5) << "from dead osd." << from << ", marking down, "
9320 << " msg was " << m->get_source_inst().addr
9321 << " expected "
9322 << (map->is_up(from) ?
9323 map->get_cluster_addrs(from) : entity_addrvec_t())
9324 << dendl;
9325 ConnectionRef con = m->get_connection();
9326 con->mark_down();
9327 auto priv = con->get_priv();
9328 if (auto s = static_cast<Session*>(priv.get()); s) {
9329 if (!is_fast_dispatch)
9330 s->session_dispatch_lock.Lock();
9331 clear_session_waiting_on_map(s);
9332 con->set_priv(nullptr); // break ref <-> session cycle, if any
9333 s->con.reset();
9334 if (!is_fast_dispatch)
9335 s->session_dispatch_lock.Unlock();
9336 }
9337 return false;
9338 }
9339 return true;
9340 }
9341
9342
9343 /*
9344 * require that we have a same (or newer) map, that we are alive and
9345 * active, and that the sending OSD instance matches the current map.
9346 */
9347 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9348 bool is_fast_dispatch)
9349 {
9350 const Message *m = op->get_req();
9351 dout(15) << "require_same_or_newer_map " << epoch
9352 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9353
9354 ceph_assert(osd_lock.is_locked());
9355
9356 // do they have a newer map?
9357 if (epoch > osdmap->get_epoch()) {
9358 dout(7) << "waiting for newer map epoch " << epoch
9359 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9360 wait_for_new_map(op);
9361 return false;
9362 }
9363
9364 if (!require_self_aliveness(op->get_req(), epoch)) {
9365 return false;
9366 }
9367
9368 // ok, our map is same or newer.. do they still exist?
9369 if (m->get_connection()->get_messenger() == cluster_messenger &&
9370 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9371 return false;
9372 }
9373
9374 return true;
9375 }
9376
9377
9378
9379
9380
9381 // ----------------------------------------
9382 // pg creation
9383
9384 void OSD::split_pgs(
9385 PG *parent,
9386 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9387 OSDMapRef curmap,
9388 OSDMapRef nextmap,
9389 PG::RecoveryCtx *rctx)
9390 {
9391 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9392 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9393
9394 vector<object_stat_sum_t> updated_stats;
9395 parent->start_split_stats(childpgids, &updated_stats);
9396
9397 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9398 for (set<spg_t>::const_iterator i = childpgids.begin();
9399 i != childpgids.end();
9400 ++i, ++stat_iter) {
9401 ceph_assert(stat_iter != updated_stats.end());
9402 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9403 PG* child = _make_pg(nextmap, *i);
9404 child->lock(true);
9405 out_pgs->insert(child);
9406 child->ch = store->create_new_collection(child->coll);
9407
9408 {
9409 uint32_t shard_index = i->hash_to_shard(shards.size());
9410 assert(NULL != shards[shard_index]);
9411 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9412 }
9413
9414 unsigned split_bits = i->get_split_bits(pg_num);
9415 dout(10) << " pg_num is " << pg_num
9416 << ", m_seed " << i->ps()
9417 << ", split_bits is " << split_bits << dendl;
9418 parent->split_colls(
9419 *i,
9420 split_bits,
9421 i->ps(),
9422 &child->get_pool().info,
9423 rctx->transaction);
9424 parent->split_into(
9425 i->pgid,
9426 child,
9427 split_bits);
9428
9429 child->init_collection_pool_opts();
9430
9431 child->finish_split_stats(*stat_iter, rctx->transaction);
9432 child->unlock();
9433 }
9434 ceph_assert(stat_iter != updated_stats.end());
9435 parent->finish_split_stats(*stat_iter, rctx->transaction);
9436 }
9437
9438 /*
9439 * holding osd_lock
9440 */
// Handle a (legacy, pre-nautilus) MOSDPGCreate from the mon: for each
// requested pg, skip it if it was created by splitting, its pool is
// gone, we are not the acting primary, or the create is obsolete;
// otherwise build an initial history and queue a peering event that
// will instantiate the PG.  Caller holds osd_lock.
9441 void OSD::handle_pg_create(OpRequestRef op)
9442 {
9443 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
9444 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9445 
9446 dout(10) << "handle_pg_create " << *m << dendl;
9447 
9448 if (!require_mon_peer(op->get_req())) {
9449 return;
9450 }
9451 
9452 if (!require_same_or_newer_map(op, m->epoch, false))
9453 return;
9454 
9455 op->mark_started();
9456 
// mkpg and ctimes are parallel maps keyed by pg_t; walk them in lockstep
9457 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9458 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9459 p != m->mkpg.end();
9460 ++p, ++ci) {
9461 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9462 epoch_t created = p->second.created;
9463 if (p->second.split_bits) // Skip split pgs
9464 continue;
9465 pg_t on = p->first;
9466 
9467 if (!osdmap->have_pg_pool(on.pool())) {
9468 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9469 continue;
9470 }
9471 
9472 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9473 
9474 // is it still ours?
9475 vector<int> up, acting;
9476 int up_primary = -1;
9477 int acting_primary = -1;
9478 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9479 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
9480 
9481 if (acting_primary != whoami) {
9482 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9483 << "), my role=" << role << ", skipping" << dendl;
9484 continue;
9485 }
9486 
9487 spg_t pgid;
9488 bool mapped = osdmap->get_primary_shard(on, &pgid);
9489 ceph_assert(mapped);
9490 
9491 PastIntervals pi;
9492 pg_history_t history;
9493 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9494 
9495 // The mon won't resend unless the primary changed, so we ignore
9496 // same_interval_since. We'll pass this history with the current
9497 // epoch as the event.
9498 if (history.same_primary_since > m->epoch) {
9499 dout(10) << __func__ << ": got obsolete pg create on pgid "
9500 << pgid << " from epoch " << m->epoch
9501 << ", primary changed in " << history.same_primary_since
9502 << dendl;
9503 continue;
9504 }
// queue a null event carrying PGCreateInfo; the peering machinery will
// instantiate the PG when it processes the event
9505 enqueue_peering_evt(
9506 pgid,
9507 PGPeeringEventRef(
9508 std::make_shared<PGPeeringEvent>(
9509 osdmap->get_epoch(),
9510 osdmap->get_epoch(),
9511 NullEvt(),
9512 true,
9513 new PGCreateInfo(
9514 pgid,
9515 osdmap->get_epoch(),
9516 history,
9517 pi,
9518 true)
9519 )));
9520 }
9521 
9522 {
9523 std::lock_guard l(pending_creates_lock);
9524 if (pending_creates_from_mon == 0) {
9525 last_pg_create_epoch = m->epoch;
9526 }
9527 }
9528 
9529 maybe_update_heartbeat_peers();
9530 }
9531
9532
9533 // ----------------------------------------
9534 // peering and recovery
9535
9536 PG::RecoveryCtx OSD::create_context()
9537 {
9538 ObjectStore::Transaction *t = new ObjectStore::Transaction;
9539 map<int, map<spg_t,pg_query_t> > *query_map =
9540 new map<int, map<spg_t, pg_query_t> >;
9541 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
9542 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
9543 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
9544 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
9545 PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
9546 return rctx;
9547 }
9548
9549 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
9550 ThreadPool::TPHandle *handle)
9551 {
9552 if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
9553 int tr = store->queue_transaction(
9554 pg->ch,
9555 std::move(*ctx.transaction), TrackedOpRef(), handle);
9556 ceph_assert(tr == 0);
9557 delete (ctx.transaction);
9558 ctx.transaction = new ObjectStore::Transaction;
9559 }
9560 }
9561
9562 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
9563 ThreadPool::TPHandle *handle)
9564 {
9565 if (!service.get_osdmap()->is_up(whoami)) {
9566 dout(20) << __func__ << " not up in osdmap" << dendl;
9567 } else if (!is_active()) {
9568 dout(20) << __func__ << " not active" << dendl;
9569 } else {
9570 do_notifies(*ctx.notify_list, curmap);
9571 do_queries(*ctx.query_map, curmap);
9572 do_infos(*ctx.info_map, curmap);
9573 }
9574 if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
9575 int tr = store->queue_transaction(
9576 pg->ch,
9577 std::move(*ctx.transaction), TrackedOpRef(),
9578 handle);
9579 ceph_assert(tr == 0);
9580 }
9581 delete ctx.notify_list;
9582 delete ctx.query_map;
9583 delete ctx.info_map;
9584 delete ctx.transaction;
9585 }
9586
9587 void OSD::discard_context(PG::RecoveryCtx& ctx)
9588 {
9589 delete ctx.notify_list;
9590 delete ctx.query_map;
9591 delete ctx.info_map;
9592 delete ctx.transaction;
9593 }
9594
9595
9596 /** do_notifies
9597 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9598 * content for, and they are primary for.
9599 */
9600
9601 void OSD::do_notifies(
9602 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9603 OSDMapRef curmap)
9604 {
9605 for (map<int,
9606 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9607 notify_list.begin();
9608 it != notify_list.end();
9609 ++it) {
9610 if (!curmap->is_up(it->first)) {
9611 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9612 continue;
9613 }
9614 ConnectionRef con = service.get_con_osd_cluster(
9615 it->first, curmap->get_epoch());
9616 if (!con) {
9617 dout(20) << __func__ << " skipping osd." << it->first
9618 << " (NULL con)" << dendl;
9619 continue;
9620 }
9621 service.share_map_peer(it->first, con.get(), curmap);
9622 dout(7) << __func__ << " osd." << it->first
9623 << " on " << it->second.size() << " PGs" << dendl;
9624 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9625 it->second);
9626 con->send_message(m);
9627 }
9628 }
9629
9630
9631 /** do_queries
9632 * send out pending queries for info | summaries
9633 */
9634 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9635 OSDMapRef curmap)
9636 {
9637 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9638 pit != query_map.end();
9639 ++pit) {
9640 if (!curmap->is_up(pit->first)) {
9641 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9642 continue;
9643 }
9644 int who = pit->first;
9645 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9646 if (!con) {
9647 dout(20) << __func__ << " skipping osd." << who
9648 << " (NULL con)" << dendl;
9649 continue;
9650 }
9651 service.share_map_peer(who, con.get(), curmap);
9652 dout(7) << __func__ << " querying osd." << who
9653 << " on " << pit->second.size() << " PGs" << dendl;
9654 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9655 con->send_message(m);
9656 }
9657 }
9658
9659
9660 void OSD::do_infos(map<int,
9661 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9662 OSDMapRef curmap)
9663 {
9664 for (map<int,
9665 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9666 info_map.begin();
9667 p != info_map.end();
9668 ++p) {
9669 if (!curmap->is_up(p->first)) {
9670 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9671 continue;
9672 }
9673 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9674 i != p->second.end();
9675 ++i) {
9676 dout(20) << __func__ << " sending info " << i->first.info
9677 << " to shard " << p->first << dendl;
9678 }
9679 ConnectionRef con = service.get_con_osd_cluster(
9680 p->first, curmap->get_epoch());
9681 if (!con) {
9682 dout(20) << __func__ << " skipping osd." << p->first
9683 << " (NULL con)" << dendl;
9684 continue;
9685 }
9686 service.share_map_peer(p->first, con.get(), curmap);
9687 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9688 m->pg_list = p->second;
9689 con->send_message(m);
9690 }
9691 info_map.clear();
9692 }
9693
9694 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9695 {
9696 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9697 if (!require_mon_peer(m)) {
9698 m->put();
9699 return;
9700 }
9701 for (auto& p : m->pgs) {
9702 spg_t pgid = p.first;
9703 epoch_t created = p.second.first;
9704 utime_t created_stamp = p.second.second;
9705 dout(20) << __func__ << " " << pgid << " e" << created
9706 << "@" << created_stamp << dendl;
9707 pg_history_t h;
9708 h.epoch_created = created;
9709 h.epoch_pool_created = created;
9710 h.same_up_since = created;
9711 h.same_interval_since = created;
9712 h.same_primary_since = created;
9713 h.last_scrub_stamp = created_stamp;
9714 h.last_deep_scrub_stamp = created_stamp;
9715 h.last_clean_scrub_stamp = created_stamp;
9716
9717 enqueue_peering_evt(
9718 pgid,
9719 PGPeeringEventRef(
9720 std::make_shared<PGPeeringEvent>(
9721 m->epoch,
9722 m->epoch,
9723 NullEvt(),
9724 true,
9725 new PGCreateInfo(
9726 pgid,
9727 created,
9728 h,
9729 PastIntervals(),
9730 true)
9731 )));
9732 }
9733
9734 {
9735 std::lock_guard l(pending_creates_lock);
9736 if (pending_creates_from_mon == 0) {
9737 last_pg_create_epoch = m->epoch;
9738 }
9739 }
9740
9741 m->put();
9742 }
9743
9744 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9745 {
9746 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9747 if (!require_osd_peer(m)) {
9748 m->put();
9749 return;
9750 }
9751 int from = m->get_source().num();
9752 for (auto& p : m->pg_list) {
9753 enqueue_peering_evt(
9754 p.first,
9755 PGPeeringEventRef(
9756 std::make_shared<PGPeeringEvent>(
9757 p.second.epoch_sent, p.second.epoch_sent,
9758 MQuery(
9759 p.first,
9760 pg_shard_t(from, p.second.from),
9761 p.second,
9762 p.second.epoch_sent),
9763 false))
9764 );
9765 }
9766 m->put();
9767 }
9768
9769 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9770 {
9771 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9772 if (!require_osd_peer(m)) {
9773 m->put();
9774 return;
9775 }
9776 int from = m->get_source().num();
9777 for (auto& p : m->get_pg_list()) {
9778 spg_t pgid(p.first.info.pgid.pgid, p.first.to);
9779 enqueue_peering_evt(
9780 pgid,
9781 PGPeeringEventRef(
9782 std::make_shared<PGPeeringEvent>(
9783 p.first.epoch_sent,
9784 p.first.query_epoch,
9785 MNotifyRec(
9786 pgid, pg_shard_t(from, p.first.from),
9787 p.first,
9788 m->get_connection()->get_features(),
9789 p.second),
9790 true,
9791 new PGCreateInfo(
9792 pgid,
9793 p.first.query_epoch,
9794 p.first.info.history,
9795 p.second,
9796 false)
9797 )));
9798 }
9799 m->put();
9800 }
9801
9802 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9803 {
9804 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9805 if (!require_osd_peer(m)) {
9806 m->put();
9807 return;
9808 }
9809 int from = m->get_source().num();
9810 for (auto& p : m->pg_list) {
9811 enqueue_peering_evt(
9812 spg_t(p.first.info.pgid.pgid, p.first.to),
9813 PGPeeringEventRef(
9814 std::make_shared<PGPeeringEvent>(
9815 p.first.epoch_sent, p.first.query_epoch,
9816 MInfoRec(
9817 pg_shard_t(from, p.first.from),
9818 p.first.info,
9819 p.first.epoch_sent)))
9820 );
9821 }
9822 m->put();
9823 }
9824
9825 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9826 {
9827 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9828 if (!require_osd_peer(m)) {
9829 m->put();
9830 return;
9831 }
9832 for (auto& pgid : m->pg_list) {
9833 enqueue_peering_evt(
9834 pgid,
9835 PGPeeringEventRef(
9836 std::make_shared<PGPeeringEvent>(
9837 m->get_epoch(), m->get_epoch(),
9838 PG::DeleteStart())));
9839 }
9840 m->put();
9841 }
9842
9843 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9844 {
9845 dout(10) << __func__ << " " << *m << dendl;
9846 if (!require_mon_or_mgr_peer(m)) {
9847 m->put();
9848 return;
9849 }
9850 epoch_t epoch = get_osdmap_epoch();
9851 for (auto pgid : m->forced_pgs) {
9852 if (m->options & OFR_BACKFILL) {
9853 if (m->options & OFR_CANCEL) {
9854 enqueue_peering_evt(
9855 pgid,
9856 PGPeeringEventRef(
9857 std::make_shared<PGPeeringEvent>(
9858 epoch, epoch,
9859 PG::UnsetForceBackfill())));
9860 } else {
9861 enqueue_peering_evt(
9862 pgid,
9863 PGPeeringEventRef(
9864 std::make_shared<PGPeeringEvent>(
9865 epoch, epoch,
9866 PG::SetForceBackfill())));
9867 }
9868 } else if (m->options & OFR_RECOVERY) {
9869 if (m->options & OFR_CANCEL) {
9870 enqueue_peering_evt(
9871 pgid,
9872 PGPeeringEventRef(
9873 std::make_shared<PGPeeringEvent>(
9874 epoch, epoch,
9875 PG::UnsetForceRecovery())));
9876 } else {
9877 enqueue_peering_evt(
9878 pgid,
9879 PGPeeringEventRef(
9880 std::make_shared<PGPeeringEvent>(
9881 epoch, epoch,
9882 PG::SetForceRecovery())));
9883 }
9884 }
9885 }
9886 m->put();
9887 }
9888
// Answer a pg_query_t that arrived for a PG this OSD does not have.  If the
// pool still exists, reply with an "empty" info so the querying peer learns
// the PG does not exist here: LOG/FULLLOG queries get an empty MOSDPGLog,
// all other query types get an MOSDPGNotify carrying an empty pg_info_t.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  // pool is gone: there is nothing useful to tell the peer
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // empty log reply
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // notify with empty info and empty past intervals
      vector<pair<pg_notify_t,PastIntervals>> ls;
      ls.push_back(
	make_pair(
	  pg_notify_t(
	    q.query.from, q.query.to,
	    q.query.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), ls);
    }
    // make sure the peer is caught up on maps before it sees our reply
    service.share_map_peer(q.from.osd, con.get(), osdmap);
    con->send_message(m);
  }
}
9925
9926
9927 // =========================================================
9928 // RECOVERY
9929
// Drain awaiting_throttle while the recovery throttle allows it, queueing at
// most osd_recovery_max_single_start pushes per PG and accounting for them
// in recovery_ops_reserved.  Caller must hold recovery_lock (asserted).
void OSDService::_maybe_queue_recovery() {
  ceph_assert(recovery_lock.is_locked_by_me());
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    // cap per-PG start count at the configured single-start limit
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
	     << ", recovery_ops_reserved " << recovery_ops_reserved
	     << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // reserve the slots now; released via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9946
9947 bool OSDService::_recover_now(uint64_t *available_pushes)
9948 {
9949 if (available_pushes)
9950 *available_pushes = 0;
9951
9952 if (ceph_clock_now() < defer_recovery_until) {
9953 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9954 return false;
9955 }
9956
9957 if (recovery_paused) {
9958 dout(15) << __func__ << " paused" << dendl;
9959 return false;
9960 }
9961
9962 uint64_t max = cct->_conf->osd_recovery_max_active;
9963 if (max <= recovery_ops_active + recovery_ops_reserved) {
9964 dout(15) << __func__ << " active " << recovery_ops_active
9965 << " + reserved " << recovery_ops_reserved
9966 << " >= max " << max << dendl;
9967 return false;
9968 }
9969
9970 if (available_pushes)
9971 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9972
9973 return true;
9974 }
9975
// Run up to reserved_pushes recovery operations on pg.  If osd_recovery_sleep
// is in effect, the work is instead re-queued via a timer callback so the
// worker thread never blocks.  The reserved pushes are always released on
// exit from the work path (whether or not anything actually started).
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);  // keep the PG alive until the timer fires
      auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.
      if (service.recovery_schedule_time < ceph_clock_now()) {
        service.recovery_schedule_time = ceph_clock_now();
      }
      service.recovery_schedule_time += recovery_sleep;
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // note: reserved pushes travel with the requeued work, so they are
      // NOT released here
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // the next pass should sleep again before doing more work
      service.recovery_needs_sleep = true;
    }

    // the PG was reset after this work was queued; the request is stale
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    // recovery is blocked on missing objects: go ask peers for them
    if (do_unfound) {
      PG::RecoveryCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, &rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
10049
// Account for one recovery op starting on the given object; bumps
// recovery_ops_active (and, in debug builds, tracks the oid so duplicate
// starts are caught by assertion).
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  // starting the same object twice is a bug
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
10065
// Account for one recovery op completing; decrements recovery_ops_active
// and kicks the throttle so queued PGs waiting for capacity can proceed.
// The 'dequeue' flag is only logged here; it is not otherwise used in this
// function.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  // finishing an op that was never started is a bug
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // freed capacity may let a throttled PG start recovering
  _maybe_queue_recovery();
}
10086
10087 bool OSDService::is_recovery_active()
10088 {
10089 if (cct->_conf->osd_debug_pretend_recovery_active) {
10090 return true;
10091 }
10092 return local_reserver.has_reservation() || remote_reserver.has_reservation();
10093 }
10094
// Return previously reserved push slots to the recovery throttle and let
// any waiting PGs start.  Counterpart of the reservation made in
// _maybe_queue_recovery().
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
	   << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
	   << dendl;
  // releasing more than was reserved indicates an accounting bug
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
10105
10106 // =========================================================
10107 // OPS
10108
10109 bool OSD::op_is_discardable(const MOSDOp *op)
10110 {
10111 // drop client request if they are not connected and can't get the
10112 // reply anyway.
10113 if (!op->get_connection()->is_connected()) {
10114 return true;
10115 }
10116 return false;
10117 }
10118
// Place an op on the sharded op work queue, keyed by its PG.  Records the
// time spent between receive and enqueue (l_osd_op_before_queue_op_lat) and
// annotates the op's trace with priority/cost.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  // queue-item "owner" is the sender's entity number
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
10142
10143 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
10144 {
10145 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10146 op_shardedwq.queue(
10147 OpQueueItem(
10148 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10149 10,
10150 cct->_conf->osd_peering_op_priority,
10151 utime_t(),
10152 0,
10153 evt->get_epoch_sent()));
10154 }
10155
10156 void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
10157 {
10158 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10159 op_shardedwq.queue_front(
10160 OpQueueItem(
10161 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10162 10,
10163 cct->_conf->osd_peering_op_priority,
10164 utime_t(),
10165 0,
10166 evt->get_epoch_sent()));
10167 }
10168
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Process one dequeued op against its (already locked) PG: record dequeue
// latency, opportunistically share our osdmap with the sending session, and
// hand the request to the PG.  Ops targeting a deleting PG are dropped.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);
  // time spent from receive to dequeue
  utime_t latency = now - op->get_req()->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " " << *(op->get_req())
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // let a lagging sender catch up on osdmaps
  auto priv = op->get_req()->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get()); session) {
    maybe_share_map(session, op, pg->get_osdmap());
  }

  // PG is being deleted; drop the op on the floor
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
}
10207
10208
// Dispatch one peering event.  A pg-less event must be an MQuery (answered
// with an empty info/log); any other pg-less event is a bug.  With a PG, we
// first advance it to the shard's current map epoch, deliver the event, and
// then flush the resulting transaction and messages.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PG::RecoveryCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  epoch_t need_up_thru = 0, same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
    pg->do_peering_event(evt, &rctx);
    if (pg->is_deleted()) {
      // do not dispatch rctx; the final _delete_some already did it.
      discard_context(rctx);
      pg->unlock();
      return;
    }
    dispatch_context_transaction(rctx, pg, &handle);
    // sample these while still holding the pg lock; used below after unlock
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }
  dispatch_context(rctx, pg, curmap, &handle);

  // flush any pg_temp requests the event generated
  service.send_pg_temp();
}
10246
10247 void OSD::dequeue_delete(
10248 OSDShard *sdata,
10249 PG *pg,
10250 epoch_t e,
10251 ThreadPool::TPHandle& handle)
10252 {
10253 dequeue_peering_evt(
10254 sdata,
10255 pg,
10256 PGPeeringEventRef(
10257 std::make_shared<PGPeeringEvent>(
10258 e, e,
10259 PG::DeleteSome())),
10260 handle);
10261 }
10262
10263
10264
10265 // --------------------------------
10266
// Config options this OSD observes at runtime; handle_conf_change() applies
// each key when the config system reports a change.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL  // terminating sentinel
  };
  return KEYS;
}
10305
// React to runtime config changes for the keys listed in
// get_tracked_conf_keys(): propagate new limits to the reservers, op
// tracker, map caches, log client, throttles, and scrub scheduling.
// Holds osd_lock for the duration.
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  // backfill/recovery reservation limits
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tuning
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					   cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					     cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // the three map caches are sized identically
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // any clog destination change requires a full log-client reconfig
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // client message throttles: only raise/adjust when the new value is > 0
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  // scrub interval changes invalidate existing scrub schedules
  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
10395
// Re-parse the clog-related config options and apply them to the cluster
// log client.  If parsing fails the existing clog configuration is left
// untouched (only the parse-success branch calls update_config).
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  // NOTE(review): logged unconditionally, even if parsing failed above
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10418
10419 void OSD::check_config()
10420 {
10421 // some sanity checks
10422 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10423 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10424 << " is not > osd_pg_epoch_persisted_max_stale ("
10425 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10426 }
10427 }
10428
10429 // --------------------------------
10430
// Block the calling thread until the objecter has fetched the newest osdmap
// from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  // synchronous wait on the objecter's completion
  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10441
10442 // --------------------------------
10443
// Classify an incoming MOSDOp: walk its sub-ops and set the op's rmw flags
// (read/write/cache/pg-op/promote/ordering) that the rest of the pipeline
// keys off.  Returns 0 on success, -EINVAL if no flag could be derived, or
// a negative error from class-method lookup for CEPH_OSD_OP_CALL.
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
	 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren't
       * resent, so there's no reason to write out a log entry.
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
	op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
	// on an EC base pool, anything outside this allow-list must be
	// promoted into the cache tier before it can be executed
	if ((iter->op.op != CEPH_OSD_OP_READ) &&
	    (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
	    (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
	    (iter->op.op != CEPH_OSD_OP_STAT) &&
	    (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
	    (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
	    (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
	    (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
	    (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
	    (iter->op.op != CEPH_OSD_OP_CREATE) &&
	    (iter->op.op != CEPH_OSD_OP_DELETE) &&
	    (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
	  op->set_promote();
	}
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
	// decode class/method names from the op payload and derive flags
	// from the registered method's declared RD/WR/PROMOTE bits
	bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
	int is_write, is_read;
	string cname, mname;
	bp.copy(iter->op.cls.class_len, cname);
	bp.copy(iter->op.cls.method_len, mname);

	ClassHandler::ClassData *cls;
	int r = class_handler->open_class(cname, &cls);
	if (r) {
	  derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
	  if (r == -ENOENT)
	    r = -EOPNOTSUPP;
	  else if (r != -EPERM) // propagate permission errors
	    r = -EIO;
	  return r;
	}
	int flags = cls->get_method_flags(mname.c_str());
	if (flags < 0) {
	  if (flags == -ENOENT)
	    r = -EOPNOTSUPP;
	  else
	    r = flags;
	  return r;
	}
	is_read = flags & CLS_METHOD_RD;
	is_write = flags & CLS_METHOD_WR;
	bool is_promote = flags & CLS_METHOD_PROMOTE;

	dout(10) << "class " << cname << " method " << mname << " "
		 << "flags=" << (is_read ? "r" : "")
		 << (is_write ? "w" : "")
		 << (is_promote ? "p" : "")
		 << dendl;
	if (is_read)
	  op->set_class_read();
	if (is_write)
	  op->set_class_write();
	if (is_promote)
	  op->set_promote();
	op->add_class(std::move(cname), std::move(mname), is_read, is_write,
		      cls->whitelisted);
	break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
	op->set_promote();
	break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
	op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
	op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
	op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // client hinted the data is cold; don't pull it into the cache tier
      if (m->ops.size() == 1 &&
	  (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
	   iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
	op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // an op that classified as neither read nor write nor anything else is
  // malformed
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10629
// Install the mgr-provided dynamic perf metric queries: filter out the
// unsupported ones (those without a key descriptor), store the result under
// m_perf_queries_lock, and push the supported set down to every PG.
void OSD::set_perf_queries(
    const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    // queries without a key descriptor cannot be evaluated here
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
	    << " unsupported queries" << dendl;
  }

  {
    Mutex::Locker locker(m_perf_queries_lock);
    m_perf_queries = supported_queries;
    // limits are kept for all queries, including unsupported ones
    m_perf_limits = queries;
  }

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    pg->lock();
    pg->set_dynamic_perf_stats_queries(supported_queries);
    pg->unlock();
  }
}
10660
// Collect dynamic perf stats from every PG, merge them into a single
// accumulator, and emit per-query reports bounded by the stored limits.
void OSD::get_perf_reports(
    std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, reports);
  dout(20) << "reports for " << reports->size() << " queries" << dendl;
}
10680
10681 // =============================================================
10682
10683 #undef dout_context
10684 #define dout_context cct
10685 #undef dout_prefix
10686 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10687
// Bind a PG to its shard slot: wire the back-pointers, bump the OSD's PG
// count, and index the slot by the PG's current map epoch.
// NOTE(review): no locking here — caller presumably holds shard_lock, as
// other accessors of pg_slots_by_epoch do; confirm at call sites.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10699
// Undo _attach_pg(): clear the slot/PG back-pointers, drop the OSD's PG
// count, and remove the slot from the by-epoch index.  Wakes any thread
// blocked in wait_min_pg_epoch(), since the minimum epoch may have changed.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  // iterator_to locates the intrusive-set node for this slot
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10714
// Move a slot to a new epoch in the by-epoch index (erase + re-insert so
// the intrusive set stays ordered) and wake waiters, since the shard's
// minimum PG epoch may have advanced.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10730
10731 epoch_t OSDShard::get_min_pg_epoch()
10732 {
10733 std::lock_guard l(shard_lock);
10734 auto p = pg_slots_by_epoch.begin();
10735 if (p == pg_slots_by_epoch.end()) {
10736 return 0;
10737 }
10738 return p->epoch;
10739 }
10740
// Block until every PG on this shard has advanced to at least epoch 'need'
// (or the shard has no PGs).  _detach_pg()/update_pg_epoch() signal the
// condition as the minimum changes.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  // counter (not bool) so nested/concurrent waiters keep signaling enabled
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10758
10759 epoch_t OSDShard::get_max_waiting_epoch()
10760 {
10761 std::lock_guard l(shard_lock);
10762 epoch_t r = 0;
10763 for (auto& i : pg_slots) {
10764 if (!i.second->waiting_peering.empty()) {
10765 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10766 }
10767 }
10768 return r;
10769 }
10770
// Swap in a newer osdmap for this shard and re-evaluate every PG slot:
// requeue peering work that the new map unblocks, drop waiting ops that are
// now stale or misdirected (crediting their reserved pushes back via
// *pushes_to_free), and prune slots that have become completely empty.
void OSDShard::consume_map(
  OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock guards shard_osdmap readers that don't take shard_lock
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
	   << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    // a slot waiting for a split or merge is left untouched
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      // the new map satisfies the earliest waiting epoch: requeue everything
      if (first <= new_osdmap->get_epoch()) {
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // PG no longer maps here: drop waiting items whose map epoch the new
      // map covers, returning their reserved recovery pushes
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    // nothing queued, nothing running, no PG attached: the slot is garbage
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a worker to pick up the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10856
// Requeue everything parked on a pg slot back onto the front of the shard's
// pqueue: the in-flight to_process items, the generic waiting list, and all
// waiting_peering buckets.  Each list is walked in *reverse* and pushed via
// _enqueue_front, which preserves the original relative order at the head of
// the queue.  Finally requeue_seq is bumped so that any _process() thread
// that sampled the slot before we ran will notice the race and restart.
//
// Caller must hold shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  // reverse iteration + enqueue_front keeps items in their original order
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      _enqueue_front(std::move(*j), osd->op_prio_cutoff);
    }
  }
  slot->waiting_peering.clear();
  // invalidate any requeue_seq snapshots taken by racing _process() calls
  ++slot->requeue_seq;
}
10890
10891 void OSDShard::identify_splits_and_merges(
10892 const OSDMapRef& as_of_osdmap,
10893 set<pair<spg_t,epoch_t>> *split_pgs,
10894 set<pair<spg_t,epoch_t>> *merge_pgs)
10895 {
10896 std::lock_guard l(shard_lock);
10897 if (shard_osdmap) {
10898 for (auto& i : pg_slots) {
10899 const spg_t& pgid = i.first;
10900 auto *slot = i.second.get();
10901 if (slot->pg) {
10902 osd->service.identify_splits_and_merges(
10903 shard_osdmap, as_of_osdmap, pgid,
10904 split_pgs, merge_pgs);
10905 } else if (!slot->waiting_for_split.empty()) {
10906 osd->service.identify_splits_and_merges(
10907 shard_osdmap, as_of_osdmap, pgid,
10908 split_pgs, nullptr);
10909 } else {
10910 dout(20) << __func__ << " slot " << pgid
10911 << " has no pg and waiting_for_split "
10912 << slot->waiting_for_split << dendl;
10913 }
10914 }
10915 }
10916 }
10917
10918 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10919 set<pair<spg_t,epoch_t>> *pgids)
10920 {
10921 std::lock_guard l(shard_lock);
10922 _prime_splits(pgids);
10923 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10924 set<pair<spg_t,epoch_t>> newer_children;
10925 for (auto i : *pgids) {
10926 osd->service.identify_splits_and_merges(
10927 as_of_osdmap, shard_osdmap, i.first,
10928 &newer_children, nullptr);
10929 }
10930 newer_children.insert(pgids->begin(), pgids->end());
10931 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10932 << shard_osdmap->get_epoch() << ", new children " << newer_children
10933 << dendl;
10934 _prime_splits(&newer_children);
10935 // note: we don't care what is left over here for other shards.
10936 // if this shard is ahead of us and one isn't, e.g., one thread is
10937 // calling into prime_splits via _process (due to a newly created
10938 // pg) and this shard has a newer map due to a racing consume_map,
10939 // then any grandchildren left here will be identified (or were
10940 // identified) when the slower shard's osdmap is advanced.
10941 // _prime_splits() will tolerate the case where the pgid is
10942 // already primed.
10943 }
10944 }
10945
10946 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10947 {
10948 dout(10) << *pgids << dendl;
10949 auto p = pgids->begin();
10950 while (p != pgids->end()) {
10951 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10952 if (shard_index == shard_id) {
10953 auto r = pg_slots.emplace(p->first, nullptr);
10954 if (r.second) {
10955 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10956 r.first->second = make_unique<OSDShardPGSlot>();
10957 r.first->second->waiting_for_split.insert(p->second);
10958 } else {
10959 auto q = r.first;
10960 ceph_assert(q != pg_slots.end());
10961 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10962 << dendl;
10963 q->second->waiting_for_split.insert(p->second);
10964 }
10965 p = pgids->erase(p);
10966 } else {
10967 ++p;
10968 }
10969 }
10970 }
10971
// Claim the merge-participant pgs in *merge_pgs that hash to this shard:
// ensure each has a slot, instantiate an empty placeholder PG when the
// participant does not exist locally (so the merge has something to fold),
// and record the merge epoch in waiting_for_merge_epoch.  Claimed entries
// are erased from *merge_pgs so the caller can pass the rest to other
// shards.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // some other shard's responsibility; skip
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split that predates the merge epoch is still pending; the split
      // will produce the participant, so don't create a placeholder here.
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      // epoch - 1: the participant must exist in the epoch just before the
      // merge takes effect.
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      // handle_pg_create_info returns the pg locked; release it now
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
11019
// Called when a newly-split child PG is ready: attach it to its primed slot,
// clear the matching waiting_for_split epoch, requeue any parked work, and
// kick both the child (via a null peering event) and the shard's workers so
// the child advances to the latest osdmap.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // the slot must have been primed for this split and not yet attached
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      // no further splits pending; release everything parked on the slot
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // wake a worker thread on this shard (taken after shard_lock is dropped
  // to respect the sdata_wait_lock -> no-shard_lock ordering used elsewhere)
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
11058
11059 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
11060 {
11061 std::lock_guard l(shard_lock);
11062 vector<spg_t> to_delete;
11063 for (auto& i : pg_slots) {
11064 if (i.first != parent &&
11065 i.first.get_ancestor(old_pg_num) == parent) {
11066 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
11067 << dendl;
11068 _wake_pg_slot(i.first, i.second.get());
11069 to_delete.push_back(i.first);
11070 }
11071 }
11072 for (auto pgid : to_delete) {
11073 pg_slots.erase(pgid);
11074 }
11075 }
11076
11077
11078 // =============================================================
11079
11080 #undef dout_context
11081 #define dout_context osd->cct
11082 #undef dout_prefix
11083 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
11084
11085 void OSD::ShardedOpWQ::_add_slot_waiter(
11086 spg_t pgid,
11087 OSDShardPGSlot *slot,
11088 OpQueueItem&& qi)
11089 {
11090 if (qi.is_peering()) {
11091 dout(20) << __func__ << " " << pgid
11092 << " peering, item epoch is "
11093 << qi.get_map_epoch()
11094 << ", will wait on " << qi << dendl;
11095 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
11096 } else {
11097 dout(20) << __func__ << " " << pgid
11098 << " item epoch is "
11099 << qi.get_map_epoch()
11100 << ", will wait on " << qi << dendl;
11101 slot->waiting.push_back(std::move(qi));
11102 }
11103 }
11104
11105 #undef dout_prefix
11106 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11107
// Worker-thread entry point: dequeue one OpQueueItem from this thread's
// shard, resolve it against the pg slot table (running, parking, creating a
// pg for, or dropping it as the shard's osdmap dictates), and execute it.
// Also drains the shard's context_queue of oncommit callbacks -- but only on
// the lowest-indexed thread per shard, to keep commit callbacks ordered.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->pqueue->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do: sleep on the shard condvar.  note the lock dance --
    // sdata_wait_lock is taken before shard_lock is dropped so a wakeup
    // between the two cannot be lost.
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // spurious or stale wakeup: if there is still nothing for us, return
      // and let the thread pool call us again
      if (sdata->pqueue->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	sdata->shard_lock.unlock();
	return;
      }
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	osd->cct->_conf->threadpool_default_timeout, 0);
    } else {
      // shard is draining (stop_waiting set); bail out immediately
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // grab any pending oncommit contexts (lowest-indexed thread only)
  list<Context *> oncommits;
  if (is_smallest_thread_index && !sdata->context_queue.empty()) {
    sdata->context_queue.swap(oncommits);
  }

  if (sdata->pqueue->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpQueueItem item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find (or create) the pg slot for this item's ordering token (pgid)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...  we must drop shard_lock to take the pg
    // lock (lock ordering), so anything observed through `slot` has to be
    // re-validated after we reacquire shard_lock below.
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // slot contents were requeued while we were taking the pg lock; our
      // item is back in the pqueue and will be picked up again
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached to the slot: decide whether to wait, create, run
  // pg-less, or drop, based on the shard's osdmap.  (while (!pg) rather
  // than if: the create path below can attach a pg and `break` out.)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // pg is in mid-split; park the item until the split completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future; park until our map catches up
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // pg should exist here but hasn't been instantiated yet; wait for it
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      // misdirected: this osd is not in the up/acting set for the pg
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	auto priv = (*_op)->get_req()->get_connection()->get_priv();
	if (auto session = static_cast<Session *>(priv.get()); session) {
	  osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
	}
      }
      // give back any recovery-push reservations held by the dropped item
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event from a newer epoch than our shard map; park it
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // hand any split children we primed above to their owning shards
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the pg lock held (if the item has a pg)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
11402
11403 void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
11404 uint32_t shard_index =
11405 item.get_ordering_token().hash_to_shard(osd->shards.size());
11406
11407 OSDShard* sdata = osd->shards[shard_index];
11408 assert (NULL != sdata);
11409 unsigned priority = item.get_priority();
11410 unsigned cost = item.get_cost();
11411 sdata->shard_lock.lock();
11412
11413 dout(20) << __func__ << " " << item << dendl;
11414 if (priority >= osd->op_prio_cutoff)
11415 sdata->pqueue->enqueue_strict(
11416 item.get_owner(), priority, std::move(item));
11417 else
11418 sdata->pqueue->enqueue(
11419 item.get_owner(), priority, cost, std::move(item));
11420 sdata->shard_lock.unlock();
11421
11422 std::lock_guard l{sdata->sdata_wait_lock};
11423 sdata->sdata_cond.notify_one();
11424 }
11425
// Requeue an item at the *front* of its shard's pqueue, used for items that
// must keep their position ahead of newer work.  If a racing _process() has
// already moved a newer item onto the slot's to_process list, swap with it
// so the older (requeued) item still runs first.
void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // (push our older item on the front of to_process, then pull the
    // newest item off the back and put *that* at the front of the
    // pqueue instead.)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
  sdata->shard_lock.unlock();
  // wake a worker after dropping shard_lock
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11453
11454 namespace ceph {
11455 namespace osd_cmds {
11456
11457 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11458 std::ostream& os)
11459 {
11460 if (!ceph_using_tcmalloc()) {
11461 os << "could not issue heap profiler command -- not using tcmalloc!";
11462 return -EOPNOTSUPP;
11463 }
11464
11465 string cmd;
11466 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11467 os << "unable to get value for command \"" << cmd << "\"";
11468 return -EINVAL;
11469 }
11470
11471 std::vector<std::string> cmd_vec;
11472 get_str_vec(cmd, cmd_vec);
11473
11474 string val;
11475 if (cmd_getval(&cct, cmdmap, "value", val)) {
11476 cmd_vec.push_back(val);
11477 }
11478
11479 ceph_heap_profiler_handle_command(cmd_vec, os);
11480
11481 return 0;
11482 }
11483
11484 }} // namespace ceph::osd_cmds
11485
11486
11487 std::ostream& operator<<(std::ostream& out, const io_queue& q) {
11488 switch(q) {
11489 case io_queue::prioritized:
11490 out << "prioritized";
11491 break;
11492 case io_queue::weightedpriority:
11493 out << "weightedpriority";
11494 break;
11495 case io_queue::mclock_opclass:
11496 out << "mclock_opclass";
11497 break;
11498 case io_queue::mclock_client:
11499 out << "mclock_client";
11500 break;
11501 }
11502 return out;
11503 }