]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
import ceph 14.2.5
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_time.h"
52 #include "common/version.h"
53 #include "common/pick_address.h"
54 #include "common/blkdev.h"
55 #include "common/numa.h"
56
57 #include "os/ObjectStore.h"
58 #ifdef HAVE_LIBFUSE
59 #include "os/FuseStore.h"
60 #endif
61
62 #include "PrimaryLogPG.h"
63
64 #include "msg/Messenger.h"
65 #include "msg/Message.h"
66
67 #include "mon/MonClient.h"
68
69 #include "messages/MLog.h"
70
71 #include "messages/MGenericMessage.h"
72 #include "messages/MOSDPing.h"
73 #include "messages/MOSDFailure.h"
74 #include "messages/MOSDMarkMeDown.h"
75 #include "messages/MOSDFull.h"
76 #include "messages/MOSDOp.h"
77 #include "messages/MOSDOpReply.h"
78 #include "messages/MOSDBackoff.h"
79 #include "messages/MOSDBeacon.h"
80 #include "messages/MOSDRepOp.h"
81 #include "messages/MOSDRepOpReply.h"
82 #include "messages/MOSDBoot.h"
83 #include "messages/MOSDPGTemp.h"
84 #include "messages/MOSDPGReadyToMerge.h"
85
86 #include "messages/MOSDMap.h"
87 #include "messages/MMonGetOSDMap.h"
88 #include "messages/MOSDPGNotify.h"
89 #include "messages/MOSDPGQuery.h"
90 #include "messages/MOSDPGLog.h"
91 #include "messages/MOSDPGRemove.h"
92 #include "messages/MOSDPGInfo.h"
93 #include "messages/MOSDPGCreate.h"
94 #include "messages/MOSDPGCreate2.h"
95 #include "messages/MOSDPGTrim.h"
96 #include "messages/MOSDPGScan.h"
97 #include "messages/MBackfillReserve.h"
98 #include "messages/MRecoveryReserve.h"
99 #include "messages/MOSDForceRecovery.h"
100 #include "messages/MOSDECSubOpWrite.h"
101 #include "messages/MOSDECSubOpWriteReply.h"
102 #include "messages/MOSDECSubOpRead.h"
103 #include "messages/MOSDECSubOpReadReply.h"
104 #include "messages/MOSDPGCreated.h"
105 #include "messages/MOSDPGUpdateLogMissing.h"
106 #include "messages/MOSDPGUpdateLogMissingReply.h"
107
108 #include "messages/MOSDPeeringOp.h"
109
110 #include "messages/MOSDAlive.h"
111
112 #include "messages/MOSDScrub.h"
113 #include "messages/MOSDScrub2.h"
114 #include "messages/MOSDRepScrub.h"
115
116 #include "messages/MMonCommand.h"
117 #include "messages/MCommand.h"
118 #include "messages/MCommandReply.h"
119
120 #include "messages/MPGStats.h"
121 #include "messages/MPGStatsAck.h"
122
123 #include "messages/MWatchNotify.h"
124 #include "messages/MOSDPGPush.h"
125 #include "messages/MOSDPGPushReply.h"
126 #include "messages/MOSDPGPull.h"
127
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
135
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
138
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
142
143 #include "osd/OpRequest.h"
144
145 #include "auth/AuthAuthorizeHandler.h"
146 #include "auth/RotatingKeyRing.h"
147
148 #include "objclass/objclass.h"
149
150 #include "common/cmdparse.h"
151 #include "include/str_list.h"
152 #include "include/util.h"
153
154 #include "include/ceph_assert.h"
155 #include "common/config.h"
156 #include "common/EventTrace.h"
157
158 #include "json_spirit/json_spirit_reader.h"
159 #include "json_spirit/json_spirit_writer.h"
160
161 #ifdef WITH_LTTNG
162 #define TRACEPOINT_DEFINE
163 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
164 #include "tracing/osd.h"
165 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
166 #undef TRACEPOINT_DEFINE
167 #else
168 #define tracepoint(...)
169 #endif
170
171 #define dout_context cct
172 #define dout_subsys ceph_subsys_osd
173 #undef dout_prefix
174 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
175
176
// Build the dout prefix for every log line from this file:
// "osd.<id> <epoch> " (epoch is the OSD's current map epoch).
static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}
180
181 //Initial features in new superblock.
182 //Features here are also automatically upgraded
183 CompatSet OSD::get_osd_initial_compat_set() {
184 CompatSet::FeatureSet ceph_osd_feature_compat;
185 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
186 CompatSet::FeatureSet ceph_osd_feature_incompat;
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
202 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
203 ceph_osd_feature_incompat);
204 }
205
206 //Features are added here that this OSD supports.
207 CompatSet OSD::get_osd_compat_set() {
208 CompatSet compat = get_osd_initial_compat_set();
209 //Any features here can be set in code, but not in initial superblock
210 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
211 return compat;
212 }
213
// OSDService wraps the state the OSD shares with its PGs and worker
// threads.  Everything here is borrowed from (or sized by) the owning OSD
// and its configuration; member-initializer order must match the
// declaration order in OSD.h.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  class_handler(osd->class_handler),
  // tracked config values; updated automatically on config change
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"),
  scrubs_local(0),
  scrubs_remote(0),
  // cache-tiering agent state
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_lock("OSDService::sleep_lock"),
  sleep_timer(cct, sleep_lock, false),
  // backfill/recovery/snap-trim reservations share one finisher
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches (full maps, full-map buffers, incremental buffers)
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();

  // one finisher thread per configured objecter finisher shard
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    Finisher *fin = new Finisher(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(fin);
  }
}
292
293 OSDService::~OSDService()
294 {
295 delete objecter;
296
297 for (auto f : objecter_finishers) {
298 delete f;
299 f = NULL;
300 }
301 }
302
303
304
305 #ifdef PG_DEBUG_REFS
306 void OSDService::add_pgid(spg_t pgid, PG *pg){
307 std::lock_guard l(pgid_lock);
308 if (!pgid_tracker.count(pgid)) {
309 live_pgs[pgid] = pg;
310 }
311 pgid_tracker[pgid]++;
312 }
313 void OSDService::remove_pgid(spg_t pgid, PG *pg)
314 {
315 std::lock_guard l(pgid_lock);
316 ceph_assert(pgid_tracker.count(pgid));
317 ceph_assert(pgid_tracker[pgid] > 0);
318 pgid_tracker[pgid]--;
319 if (pgid_tracker[pgid] == 0) {
320 pgid_tracker.erase(pgid);
321 live_pgs.erase(pgid);
322 }
323 }
324 void OSDService::dump_live_pgids()
325 {
326 std::lock_guard l(pgid_lock);
327 derr << "live pgids:" << dendl;
328 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
329 i != pgid_tracker.cend();
330 ++i) {
331 derr << "\t" << *i << dendl;
332 live_pgs[i->first]->dump_live_ids();
333 }
334 }
335 #endif
336
337
338
// Replay the pool's recorded pg_num history between old_map and new_map
// and collect, for the lineage of 'pgid', every split child (into
// *split_children) and, when merge_pgs is non-null, every PG participating
// in a merge, each paired with the epoch the pg_num change took effect.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // pool unknown as of old_map: nothing to trace
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  // per-epoch pg_num values recorded for this pool
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // BFS over related PGs; 'did' prevents re-queueing a pg already scanned
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // walk pg_num changes in epoch order within (old_map, new_map]
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// pg_num increased: split?
	if (cur.ps() < pgnum) {
	  // cur existed before this change; does it fan out children?
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// pg_num decreased: merge? (only tracked when caller asked)
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in this change: it is a merge source
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the shrink: it may be the merge target
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
445
// Forward to the owning OSD: ask it to recompute its heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
450
// First phase of shutdown: stop the timers that can schedule new work
// (tiering agent, recovery-sleep, recovery-request).  Each SafeTimer is
// shut down under its own lock.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
468
// Drain all queued reservation callbacks, then stop the finisher thread.
// Order matters: wait_for_empty() before stop().
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
474
// Final phase of shutdown: stop the watch timer and the objecter (with its
// finishers), then drop our OSDMap references so the maps can be freed.
void OSDService::shutdown()
{
  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto f : objecter_finishers) {
    // drain queued completions before stopping each finisher thread
    f->wait_for_empty();
    f->stop();
  }

  // release our map references (publish an empty ref, clear next map)
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
491
// Start the service's helper threads and timers.  Called once during OSD
// startup, before the OSD goes active.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured grace period after boot
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
511
// Last init step: start the objecter with the current OSDMap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
516
517 void OSDService::activate_map()
518 {
519 // wake/unwake the tiering agent
520 agent_lock.Lock();
521 agent_active =
522 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
523 osd->is_active();
524 agent_cond.Signal();
525 agent_lock.Unlock();
526 }
527
// Ask the OSD to subscribe to OSDMaps starting at epoch e (non-forced).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
532
533 class AgentTimeoutCB : public Context {
534 PGRef pg;
535 public:
536 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
537 void finish(int) override {
538 pg->agent_choose_mode_restart();
539 }
540 };
541
// Tiering agent thread body.  Repeatedly picks the highest-priority tier
// level with queued PGs and lets one PG do flush/evict work, throttled by
// osd_agent_max_ops (or osd_agent_max_low_ops when no pool is in
// high-flush mode).  agent_lock is held except around the actual PG work;
// the explicit Lock()/Unlock() pairs mark those hand-off points.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // highest key in the map = highest-priority tier level
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; the flush quota drops to the "low" limit when
    // no pool is currently in high-flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // round-robin across the PGs of the top level; the iterator is
    // invalidated (agent_valid_iterator cleared) when the set changes
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the PG does (possibly slow) agent work
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
599
// Stop the tiering agent thread.  Must be called after all agent ops are
// cancelled and all PGs are dequeued; asserts both, then signals the
// thread to exit and joins it (join happens outside the lock).
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
619
620 // -------------------------------------
621
// Periodically recompute promote_probability_millis (the per-mille chance
// of promoting an object into the cache tier) so the observed promote rate
// converges on the configured object/sec and bytes/sec targets, and set
// hard per-interval caps to prevent promotion stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // sample (and decay) counters since the last recalibration
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // per-mille probabilities that would hit the object and byte targets
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;   // no target configured: always promote
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed value, clamped to [min_prob, 1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
692
693 // -------------------------------------
694
695 float OSDService::get_failsafe_full_ratio()
696 {
697 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
698 if (full_ratio > 1.0) full_ratio /= 100.0;
699 return full_ratio;
700 }
701
// Classify the OSD's fullness (NONE..FAILSAFE) from the given usage ratio
// and physical ratio, using thresholds from the current OSDMap.  On an
// injected-full test the injected state is returned and 'inject' is set.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // enforce nearfull <= backfillfull <= full <= failsafe ordering
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // injected state (testing) wins; otherwise compare from most to least severe.
  // note: FAILSAFE uses the physical ratio, the rest use the adjusted ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
748
// Update the cached fullness state from fresh usage ratios, logging and
// raising a cluster-log error when the FAILSAFE state is entered or left.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
780
781 bool OSDService::need_fullness_update()
782 {
783 OSDMapRef osdmap = get_osdmap();
784 s_names cur = NONE;
785 if (osdmap->exists(whoami)) {
786 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
787 cur = FULL;
788 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
789 cur = BACKFILLFULL;
790 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
791 cur = NEARFULL;
792 }
793 }
794 s_names want = NONE;
795 if (is_full())
796 want = FULL;
797 else if (is_backfillfull())
798 want = BACKFILLFULL;
799 else if (is_nearfull())
800 want = NEARFULL;
801 return want != cur;
802 }
803
// Testing hook: report full if an injected-full state at least as severe
// as 'type' is active.  Caller must hold full_status_lock.
// NOTE(review): this const method decrements injectfull (a countdown of
// how many times to report full; -1 means forever) — presumably the member
// is declared mutable in the header; confirm there.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
818
819 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
820 {
821 std::lock_guard l(full_status_lock);
822
823 if (_check_inject_full(dpp, type))
824 return true;
825
826 if (cur_state >= type)
827 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
828 << " physical " << physical_ratio << dendl;
829
830 return cur_state >= type;
831 }
832
// Like _check_full, but evaluates a hypothetical: would we be at least
// 'type' full if 'adjust_used' extra bytes were consumed on top of the
// given (already adjusted) stats?  Used e.g. before accepting a backfill.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // injected-full test state still wins; only hold the lock for that check
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  // recompute ratios with the extra usage applied
  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
854
// True when usage is at or beyond the failsafe threshold (updates dropped).
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
859
// True when usage is at or beyond the full threshold.
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
864
// Would accepting 'adjust_used' more bytes push us past backfillfull?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
869
// True when usage is at or beyond the backfillfull threshold.
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
874
// True when usage is at or beyond the nearfull threshold.
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
879
// Cached-state accessor: exactly FAILSAFE (no injection check, no logging).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
885
// Cached-state accessor: FULL or worse.
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
891
// Cached-state accessor: BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
897
// Cached-state accessor: NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
903
// Testing hook: inject fullness state 'type' for 'count' checks
// (count == -1 means until cleared).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
910
// Record fresh store statfs results (and store alerts) into osd_stat and
// the perf counters.  When fake_statfs_for_testing is set, total/available
// are synthesized from per-PG byte counts so co-located test OSDs report
// plausible, independent usage.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  // alerts are keyed by our own OSD id; swap to take ownership cheaply
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
953
954 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
955 int num_pgs)
956 {
957 utime_t now = ceph_clock_now();
958 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
959 std::lock_guard l(stat_lock);
960 osd_stat.hb_peers.swap(hb_peers);
961 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
962 osd_stat.num_pgs = num_pgs;
963 // Clean entries that aren't updated
964 // This is called often enough that we can just remove 1 at a time
965 for (auto i: osd_stat.hb_pingtime) {
966 if (i.second.last_update == 0)
967 continue;
968 if (stale_time && now.sec() - i.second.last_update > stale_time) {
969 dout(20) << __func__ << " time out heartbeat for osd " << i.first
970 << " last_update " << i.second.last_update << dendl;
971 osd_stat.hb_pingtime.erase(i.first);
972 break;
973 }
974 }
975 return osd_stat;
976 }
977
978 void OSDService::inc_osd_stat_repaired()
979 {
980 std::lock_guard l(stat_lock);
981 osd_stat.num_shards_repaired++;
982 return;
983 }
984
// Compute the usage ratio after applying two adjustments to a copy of the
// stats: an explicit extra 'adjust_used' bytes, and each PG's pending
// backfill data.  *pratio receives the unadjusted (physical) ratio.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  // physical ratio, before any adjustment
  *pratio =
    ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    // consume adjust_used from 'available', clamping at zero
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
1012
1013 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
1014 {
1015 OSDMapRef osdmap = get_osdmap();
1016 for (auto shard : missing_on) {
1017 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
1018 return true;
1019 }
1020 return false;
1021 }
1022
// Send a cluster message to a peer OSD, but only if the peer is still up
// and has been continuously up since from_epoch; otherwise the message is
// dropped.  Consumes one reference to m on every path (send or put).
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  // reserve the "next" map so the liveness check and the address lookup
  // are made against a single consistent epoch
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer went down (or restarted) since from_epoch: drop the message
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con = osd->cluster_messenger->connect_to_osd(
    next_map->get_cluster_addrs(peer));
  // make sure the peer is not behind on maps before it sees this message
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1041
1042 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1043 {
1044 OSDMapRef next_map = get_nextmap_reserved();
1045 // service map is always newer/newest
1046 ceph_assert(from_epoch <= next_map->get_epoch());
1047
1048 if (next_map->is_down(peer) ||
1049 next_map->get_info(peer).up_from > from_epoch) {
1050 release_map(next_map);
1051 return NULL;
1052 }
1053 ConnectionRef con = osd->cluster_messenger->connect_to_osd(
1054 next_map->get_cluster_addrs(peer));
1055 release_map(next_map);
1056 return con;
1057 }
1058
// Return (back, front) heartbeat connections to a peer OSD, or a pair of
// null refs if the peer is down or restarted after from_epoch.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1078
// Identity this OSD uses on the cluster-internal messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1083
1084 void OSDService::queue_want_pg_temp(pg_t pgid,
1085 const vector<int>& want,
1086 bool forced)
1087 {
1088 std::lock_guard l(pg_temp_lock);
1089 auto p = pg_temp_pending.find(pgid);
1090 if (p == pg_temp_pending.end() ||
1091 p->second.acting != want ||
1092 forced) {
1093 pg_temp_wanted[pgid] = {want, forced};
1094 }
1095 }
1096
// Drop any pg_temp request for this pgid, both queued (wanted) and
// already sent but unacked (pending).
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1103
// Move everything in pg_temp_wanted into pg_temp_pending (sent to the mon,
// now awaiting ack).  Caller holds pg_temp_lock (see send_pg_temp /
// requeue_pg_temp).
// NOTE: both merge() and insert() leave an existing pending entry for the
// same pgid untouched; the colliding wanted entry is dropped by clear().
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes over without reallocating (C++17 map::merge)
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1114
// Move all pending (sent-but-unacked) pg_temp requests back into the
// wanted set so they will be resent, e.g. after a mon session reset.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // merge wanted into pending, then swap: everything ends up in wanted
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1127
1128 std::ostream& operator<<(std::ostream& out,
1129 const OSDService::pg_temp_t& pg_temp)
1130 {
1131 out << pg_temp.acting;
1132 if (pg_temp.forced) {
1133 out << " (forced)";
1134 }
1135 return out;
1136 }
1137
// Send all wanted pg_temp mappings to the mon.  Forced and non-forced
// requests are batched into (up to) two separate MOSDPGTemp messages,
// since 'forced' is a per-message flag.
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // ms[0]: normal requests, ms[1]: forced requests
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  // move what we just sent into the pending (awaiting-ack) set
  _sent_pg_temp();
}
1160
1161 void OSDService::send_pg_created(pg_t pgid)
1162 {
1163 std::lock_guard l(pg_created_lock);
1164 dout(20) << __func__ << dendl;
1165 auto o = get_osdmap();
1166 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1167 pg_created.insert(pgid);
1168 monc->send_mon_message(new MOSDPGCreated(pgid));
1169 }
1170 }
1171
1172 void OSDService::send_pg_created()
1173 {
1174 std::lock_guard l(pg_created_lock);
1175 dout(20) << __func__ << dendl;
1176 auto o = get_osdmap();
1177 if (o->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1178 for (auto pgid : pg_created) {
1179 monc->send_mon_message(new MOSDPGCreated(pgid));
1180 }
1181 }
1182 }
1183
// Drop tracked pgids whose pool is gone or no longer in the CREATING
// state; notifications only need to be resent while the mon still has the
// pool flagged as creating.
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1201
1202
1203 // --------------------------------------
1204 // dispatch
1205
1206 epoch_t OSDService::get_peer_epoch(int peer)
1207 {
1208 std::lock_guard l(peer_map_epoch_lock);
1209 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1210 if (p == peer_map_epoch.end())
1211 return 0;
1212 return p->second;
1213 }
1214
1215 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1216 {
1217 std::lock_guard l(peer_map_epoch_lock);
1218 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1219 if (p != peer_map_epoch.end()) {
1220 if (p->second < e) {
1221 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1222 p->second = e;
1223 } else {
1224 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1225 }
1226 return p->second;
1227 } else {
1228 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1229 peer_map_epoch[peer] = e;
1230 return e;
1231 }
1232 }
1233
// Forget what we know about a peer's map epoch, but only if our record is
// not newer than as_of -- a newer record means we have heard from the peer
// again since and should keep that knowledge.
void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  std::lock_guard l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
	       << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
	       << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}
1249
// Decide whether we should push our (newer) osdmap to the sender of a
// message.  Returns true when the peer appears to be behind our epoch.
// For clients, sent_epoch_p (the session's last_sent_epoch) suppresses
// re-sending a map we already shared on this session.
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  // Peer OSD on the cluster messenger (but never the loopback connection,
  // i.e. ourselves)?  Verify the connection's address matches the map's
  // notion of the peer before trusting name.num() as the osd id.
  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
       osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
    // remember
    epoch_t has = std::max(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}
1290
// Share our osdmap with a peer (client or OSD) that sent us a message at
// an older epoch.  No-op if the OSD is not active or the peer is already
// up to date (see should_share_map).  For clients, *sent_epoch_p is
// updated so the same map is not sent twice on one session.
void OSDService::share_map(
    entity_name_t name,
    Connection *con,
    epoch_t epoch,
    OSDMapRef& osdmap,
    epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
	   << name << " " << con->get_peer_addr()
	   << " " << epoch << dendl;

  if (!osd->is_active()) {
    /*It is safe not to proceed as OSD is not in healthy state*/
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared){
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
          << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
	*sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
        osdmap->is_up(name.num()) &&
        (osdmap->get_cluster_addrs(name.num()) == con->get_peer_addrs() ||
         osdmap->get_hb_back_addrs(name.num()) == con->get_peer_addrs())) {
      dout(10) << name << " " << con->get_peer_addrs()
	       << " has old map " << epoch << " < "
	       << osdmap->get_epoch() << dendl;
      // remember the peer now has our epoch so we don't resend later
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}
1331
// Push newer maps to a peer OSD over an existing connection, based on our
// cached notion of the peer's epoch.  If we have no cached epoch for the
// peer, do nothing.
void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}
1352
1353 bool OSDService::can_inc_scrubs()
1354 {
1355 bool can_inc = false;
1356 std::lock_guard l(sched_scrub_lock);
1357
1358 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1359 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1360 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1361 can_inc = true;
1362 } else {
1363 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1364 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1365 }
1366
1367 return can_inc;
1368 }
1369
1370 bool OSDService::inc_scrubs_local()
1371 {
1372 bool result = false;
1373 std::lock_guard l{sched_scrub_lock};
1374 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1375 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1376 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1377 result = true;
1378 ++scrubs_local;
1379 } else {
1380 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1381 }
1382 return result;
1383 }
1384
// Release one local scrub reservation taken via inc_scrubs_local().
void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  // must never release more reservations than were taken
  ceph_assert(scrubs_local >= 0);
}
1393
1394 bool OSDService::inc_scrubs_remote()
1395 {
1396 bool result = false;
1397 std::lock_guard l{sched_scrub_lock};
1398 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1399 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1400 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1401 result = true;
1402 ++scrubs_remote;
1403 } else {
1404 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1405 }
1406 return result;
1407 }
1408
// Release one remote scrub reservation taken via inc_scrubs_remote().
void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  // must never release more reservations than were taken
  ceph_assert(scrubs_remote >= 0);
}
1417
// Dump the current scrub reservation counters (for the admin socket).
void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1425
// Read boot/up/bind epochs atomically under epoch_lock.  Each out-param
// may be nullptr if the caller does not need that value.
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
1437
// Update boot/up/bind epochs atomically.  Each in-param may be nullptr to
// leave that epoch unchanged.  Values must move forward monotonically; a
// value of 0 is allowed as an explicit reset.
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1455
// Begin an orderly shutdown: NOT_STOPPING -> PREPARING_TO_STOP ->
// STOPPING.  If we are marked up in the osdmap, tell the mon we are going
// down and wait (up to osd_mon_shutdown_timeout seconds) for its ack so
// peers learn about our exit promptly.  Returns false if a stop is
// already under way.
bool OSDService::prepare_to_stop()
{
  std::lock_guard l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true  // request ack
	));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    // wait for got_stop_ack() to flip us to STOPPING, or the timeout;
    // the cond releases is_stopping_lock while waiting
    while ((ceph_clock_now() < timeout) &&
	   (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  // proceed with shutdown whether or not the ack arrived
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1486
1487 void OSDService::got_stop_ack()
1488 {
1489 std::lock_guard l(is_stopping_lock);
1490 if (get_state() == PREPARING_TO_STOP) {
1491 dout(0) << __func__ << " starting shutdown" << dendl;
1492 set_state(STOPPING);
1493 is_stopping_cond.Signal();
1494 } else {
1495 dout(10) << __func__ << " ignoring msg" << dendl;
1496 }
1497 }
1498
// Build an MOSDMap carrying maps in the range (since, to].  Starts with a
// full map if 'since' predates the oldest map we still have on disk.
// Output is capped by osd_map_message_max (count) and
// osd_map_message_max_bytes; the recipient re-requests from where we
// stopped.  If a map is missing ('panic'), send whatever was collected so
// far, or at minimum the newest map; abort only if even that fails.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental missing; fall back to the full map for this epoch
      derr << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    // limits are checked after adding, so at least one map always goes out
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      // nothing at all to send -- the store is unusable
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1566
// Queue an MOSDMap on the given connection (ownership of m passes to the
// messenger).
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1571
// Send the peer the maps it is missing, from 'since' up to our current
// epoch.  If 'since' predates our oldest stored map, just send the latest
// full map instead; the range is also clamped to
// osd_map_share_max_epochs.
void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      // NOTE: this inner 'm' deliberately shadows the outer one -- this
      // branch sends and returns without reaching the loop condition
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
			       osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << "  " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
	       << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}
1603
// Fetch the encoded full map for epoch e, from the LRU cache or, on a
// miss, from the meta collection on disk (populating the cache).
// NOTE(review): unlike get_inc_map_bl() this does not take map_cache_lock
// itself; the leading underscore suggests callers hold it -- confirm at
// call sites.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1622
// Fetch the encoded incremental map for epoch e, from the LRU cache or,
// on a miss, from the meta collection on disk (populating the cache).
bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}
1642
// Cache the encoded full map for epoch e.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the cached buffer to the osd_mapbl mempool
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1653
// Cache the encoded incremental map for epoch e.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the cached buffer to the osd_mapbl mempool
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1664
1665 int OSDService::get_deleted_pool_pg_num(int64_t pool)
1666 {
1667 std::lock_guard l(map_cache_lock);
1668 auto p = deleted_pool_pg_nums.find(pool);
1669 if (p != deleted_pool_pg_nums.end()) {
1670 return p->second;
1671 }
1672 dout(20) << __func__ << " " << pool << " loading" << dendl;
1673 ghobject_t oid = OSD::make_final_pool_info_oid(pool);
1674 bufferlist bl;
1675 int r = store->read(meta_ch, oid, 0, 0, bl);
1676 ceph_assert(r >= 0);
1677 auto blp = bl.cbegin();
1678 pg_pool_t pi;
1679 ::decode(pi, blp);
1680 deleted_pool_pg_nums[pool] = pi.get_pg_num();
1681 dout(20) << __func__ << " " << pool << " got " << pi.get_pg_num() << dendl;
1682 return pi.get_pg_num();
1683 }
1684
// Insert a decoded OSDMap into the map cache, deduplicating shared
// substructure against a nearby cached epoch to save memory.  Takes
// ownership of o: if an entry for this epoch already existed, o is
// deleted and the previously cached map is returned instead.
// NOTE(review): presumably requires map_cache_lock held by the caller
// (see try_get_map) -- confirm at other call sites.
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}
1703
// Return the OSDMap for the given epoch, from the cache or by loading and
// decoding it from disk.  Returns a null ref if the map is not available.
// Epoch 0 returns a fresh, empty initial map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    // track how far below the cached window the miss was
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  // _add_map takes ownership of map
  return _add_map(map);
}
1740
1741 // ops
1742
1743
// Convenience overload: error reply with zero version information.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}
1748
1749 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1750 version_t uv)
1751 {
1752 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1753 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1754 int flags;
1755 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1756
1757 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags, true);
1758 reply->set_reply_versions(v, uv);
1759 m->get_connection()->send_message(reply);
1760 }
1761
// Debugging aid, active only with osd_debug_misdirected_ops: log a
// cluster warning when an op arrives at a PG that is not its primary,
// unless the misdirection is an expected EC shard-remap race (see the
// comment below), in which case it is silently dropped.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    // if the primary shard differed at the client's epoch, this is the
    // expected race: drop without warning
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1816
// Queue a work item at the back of the sharded op workqueue.
void OSDService::enqueue_back(OpQueueItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
1821
// Queue a work item at the front of the sharded op workqueue (used for
// requeues that must retain ordering).
void OSDService::enqueue_front(OpQueueItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1826
// Schedule a recovery continuation for the given PG on the op workqueue,
// tagged with the current map epoch so stale items can be discarded.
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1842
// Schedule snap trimming work for the given PG on the op workqueue.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1856
// Schedule scrub work for the given PG.  With with_high_priority, the
// queue priority is raised to at least osd_client_op_priority (used for
// operator-requested scrubs).
void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
{
  unsigned scrub_queue_priority = pg->scrubber.priority;
  if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
    scrub_queue_priority = cct->_conf->osd_client_op_priority;
  }
  const auto epoch = pg->get_osdmap_epoch();
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
      cct->_conf->osd_scrub_cost,
      scrub_queue_priority,
      ceph_clock_now(),
      0,
      epoch));
}
1873
// Schedule (continued) deletion work for a PG on the op workqueue.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1887
// Forward to OSD: attempt to finalize a PG deletion (remove it from the
// OSD's pg map).  Returns the OSD's verdict.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1892
1893 // ---
1894
1895 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1896 {
1897 std::lock_guard l(merge_lock);
1898 dout(10) << __func__ << " " << pg->pg_id << dendl;
1899 ready_to_merge_source[pg->pg_id.pgid] = version;
1900 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1901 _send_ready_to_merge();
1902 }
1903
1904 void OSDService::set_ready_to_merge_target(PG *pg,
1905 eversion_t version,
1906 epoch_t last_epoch_started,
1907 epoch_t last_epoch_clean)
1908 {
1909 std::lock_guard l(merge_lock);
1910 dout(10) << __func__ << " " << pg->pg_id << dendl;
1911 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1912 make_tuple(version,
1913 last_epoch_started,
1914 last_epoch_clean)));
1915 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1916 _send_ready_to_merge();
1917 }
1918
1919 void OSDService::set_not_ready_to_merge_source(pg_t source)
1920 {
1921 std::lock_guard l(merge_lock);
1922 dout(10) << __func__ << " " << source << dendl;
1923 not_ready_to_merge_source.insert(source);
1924 assert(ready_to_merge_source.count(source) == 0);
1925 _send_ready_to_merge();
1926 }
1927
1928 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1929 {
1930 std::lock_guard l(merge_lock);
1931 dout(10) << __func__ << " " << target << " source " << source << dendl;
1932 not_ready_to_merge_target[target] = source;
1933 assert(ready_to_merge_target.count(target) == 0);
1934 _send_ready_to_merge();
1935 }
1936
// Public locking wrapper around _send_ready_to_merge().
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1942
// Push merge readiness to the mon.  Caller holds merge_lock.
// Not-ready notifications (ready=false) go out first for both sources and
// targets; a ready notification is sent only once both the source and its
// merge target (source.get_parent()) are ready.  sent_ready_to_merge_source
// deduplicates so each source pgid is reported at most once.
void OSDService::_send_ready_to_merge()
{
  dout(20) << __func__
	   << " ready_to_merge_source " << ready_to_merge_source
	   << " not_ready_to_merge_source " << not_ready_to_merge_source
	   << " ready_to_merge_target " << ready_to_merge_target
	   << " not_ready_to_merge_target " << not_ready_to_merge_target
	   << " sent_ready_to_merge_source " << sent_ready_to_merge_source
	   << dendl;
  for (auto src : not_ready_to_merge_source) {
    if (sent_ready_to_merge_source.count(src) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       src,
			       {}, {}, 0, 0,
			       false,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(src);
    }
  }
  for (auto p : not_ready_to_merge_target) {
    // p.second is the source pgid recorded for this not-ready target
    if (sent_ready_to_merge_source.count(p.second) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       p.second,
			       {}, {}, 0, 0,
			       false,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(p.second);
    }
  }
  for (auto src : ready_to_merge_source) {
    // skip if either side has since been declared not ready
    if (not_ready_to_merge_source.count(src.first) ||
	not_ready_to_merge_target.count(src.first.get_parent())) {
      continue;
    }
    auto p = ready_to_merge_target.find(src.first.get_parent());
    if (p != ready_to_merge_target.end() &&
	sent_ready_to_merge_source.count(src.first) == 0) {
      monc->send_mon_message(new MOSDPGReadyToMerge(
			       src.first,           // source pgid
			       src.second,          // src version
			       std::get<0>(p->second),	 // target version
			       std::get<1>(p->second),	 // PG's last_epoch_started
			       std::get<2>(p->second),	 // PG's last_epoch_clean
			       true,
			       osdmap->get_epoch()));
      sent_ready_to_merge_source.insert(src.first);
    }
  }
}
1992
1993 void OSDService::clear_ready_to_merge(PG *pg)
1994 {
1995 std::lock_guard l(merge_lock);
1996 dout(10) << __func__ << " " << pg->pg_id << dendl;
1997 ready_to_merge_source.erase(pg->pg_id.pgid);
1998 ready_to_merge_target.erase(pg->pg_id.pgid);
1999 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2000 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2001 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2002 }
2003
// Forget which ready-to-merge notifications were already sent, so they
// will all be resent (e.g. after a mon session reset).
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
2009
2010 void OSDService::prune_sent_ready_to_merge(OSDMapRef& osdmap)
2011 {
2012 std::lock_guard l(merge_lock);
2013 auto i = sent_ready_to_merge_source.begin();
2014 while (i != sent_ready_to_merge_source.end()) {
2015 if (!osdmap->pg_exists(*i)) {
2016 dout(10) << __func__ << " " << *i << dendl;
2017 i = sent_ready_to_merge_source.erase(i);
2018 } else {
2019 ++i;
2020 }
2021 }
2022 }
2023
2024 // ---
2025
// Schedule recovery for a PG with a number of reserved push slots.
// Caller holds recovery_lock (enforced by the assert).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(recovery_lock.is_locked_by_me());
  enqueue_back(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      p.first));
}
2042
2043 // ====================================================================
2044 // OSD
2045
2046 #undef dout_prefix
2047 #define dout_prefix *_dout
2048
// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

// Handler for the "heap" asok command (called from OSD::asok_command).
// Forward declaration only; the definition is not in this section.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds
2056
2057 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
2058 {
2059 int ret;
2060
2061 OSDSuperblock sb;
2062 bufferlist sbbl;
2063 ObjectStore::CollectionHandle ch;
2064
2065 // if we are fed a uuid for this osd, use it.
2066 store->set_fsid(cct->_conf->osd_uuid);
2067
2068 ret = store->mkfs();
2069 if (ret) {
2070 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2071 << cpp_strerror(ret) << dendl;
2072 goto free_store;
2073 }
2074
2075 store->set_cache_shards(1); // doesn't matter for mkfs!
2076
2077 ret = store->mount();
2078 if (ret) {
2079 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2080 << cpp_strerror(ret) << dendl;
2081 goto free_store;
2082 }
2083
2084 ch = store->open_collection(coll_t::meta());
2085 if (ch) {
2086 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2087 if (ret < 0) {
2088 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2089 goto free_store;
2090 }
2091 /* if we already have superblock, check content of superblock */
2092 dout(0) << " have superblock" << dendl;
2093 auto p = sbbl.cbegin();
2094 decode(sb, p);
2095 if (whoami != sb.whoami) {
2096 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2097 << dendl;
2098 ret = -EINVAL;
2099 goto umount_store;
2100 }
2101 if (fsid != sb.cluster_fsid) {
2102 derr << "provided cluster fsid " << fsid
2103 << " != superblock's " << sb.cluster_fsid << dendl;
2104 ret = -EINVAL;
2105 goto umount_store;
2106 }
2107 } else {
2108 // create superblock
2109 sb.cluster_fsid = fsid;
2110 sb.osd_fsid = store->get_fsid();
2111 sb.whoami = whoami;
2112 sb.compat_features = get_osd_initial_compat_set();
2113
2114 bufferlist bl;
2115 encode(sb, bl);
2116
2117 ObjectStore::CollectionHandle ch = store->create_new_collection(
2118 coll_t::meta());
2119 ObjectStore::Transaction t;
2120 t.create_collection(coll_t::meta(), 0);
2121 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2122 ret = store->queue_transaction(ch, std::move(t));
2123 if (ret) {
2124 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2125 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2126 goto umount_store;
2127 }
2128 }
2129
2130 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
2131 if (ret) {
2132 derr << "OSD::mkfs: failed to write fsid file: error "
2133 << cpp_strerror(ret) << dendl;
2134 goto umount_store;
2135 }
2136
2137 umount_store:
2138 if (ch) {
2139 ch.reset();
2140 }
2141 store->umount();
2142 free_store:
2143 delete store;
2144 return ret;
2145 }
2146
2147 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
2148 {
2149 char val[80];
2150 int r;
2151
2152 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2153 r = store->write_meta("magic", val);
2154 if (r < 0)
2155 return r;
2156
2157 snprintf(val, sizeof(val), "%d", whoami);
2158 r = store->write_meta("whoami", val);
2159 if (r < 0)
2160 return r;
2161
2162 cluster_fsid.print(val);
2163 r = store->write_meta("ceph_fsid", val);
2164 if (r < 0)
2165 return r;
2166
2167 string key = cct->_conf.get_val<string>("key");
2168 if (key.size()) {
2169 r = store->write_meta("osd_key", key);
2170 if (r < 0)
2171 return r;
2172 } else {
2173 string keyfile = cct->_conf.get_val<string>("keyfile");
2174 if (!keyfile.empty()) {
2175 bufferlist keybl;
2176 string err;
2177 r = keybl.read_file(keyfile.c_str(), &err);
2178 if (r < 0) {
2179 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2180 << err << ": " << cpp_strerror(r) << dendl;
2181 return r;
2182 }
2183 r = store->write_meta("osd_key", keybl.to_str());
2184 if (r < 0)
2185 return r;
2186 }
2187 }
2188
2189 r = store->write_meta("ready", "ready");
2190 if (r < 0)
2191 return r;
2192
2193 return 0;
2194 }
2195
2196 int OSD::peek_meta(ObjectStore *store,
2197 std::string *magic,
2198 uuid_d *cluster_fsid,
2199 uuid_d *osd_fsid,
2200 int *whoami,
2201 int *require_osd_release)
2202 {
2203 string val;
2204
2205 int r = store->read_meta("magic", &val);
2206 if (r < 0)
2207 return r;
2208 *magic = val;
2209
2210 r = store->read_meta("whoami", &val);
2211 if (r < 0)
2212 return r;
2213 *whoami = atoi(val.c_str());
2214
2215 r = store->read_meta("ceph_fsid", &val);
2216 if (r < 0)
2217 return r;
2218 r = cluster_fsid->parse(val.c_str());
2219 if (!r)
2220 return -EINVAL;
2221
2222 r = store->read_meta("fsid", &val);
2223 if (r < 0) {
2224 *osd_fsid = uuid_d();
2225 } else {
2226 r = osd_fsid->parse(val.c_str());
2227 if (!r)
2228 return -EINVAL;
2229 }
2230
2231 r = store->read_meta("require_osd_release", &val);
2232 if (r >= 0) {
2233 *require_osd_release = atoi(val.c_str());
2234 }
2235
2236 return 0;
2237 }
2238
2239
2240 #undef dout_prefix
2241 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2242
2243 // cons/des
2244
// OSD constructor: wires together the messengers, mon/mgr/log clients,
// timers, thread pools and the sharded op work queue, then sizes and
// allocates the op shards.  Heavy startup work (mounting the store,
// booting) happens later in pre_init()/init(), not here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  map_lock("OSD::map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  service(this)
{

  // Export the client keytab path for GSSAPI before any auth happens.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Op-tracker thresholds and history sizing come straight from config.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					 cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					   cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this,
      cct->_conf->osd_op_pq_max_tokens_per_priority,
      cct->_conf->osd_op_pq_min_cost,
      op_queue);
    shards.push_back(one_shard);
  }
}
2362
OSD::~OSD()
{
  // Tear-down order matters: free the op shards first, then unregister
  // both perf counters from the collection before deleting them, and
  // delete the ObjectStore last (the OSD owns it).
  while (!shards.empty()) {
    delete shards.back();
    shards.pop_back();
  }
  delete class_handler;
  cct->get_perfcounters_collection()->remove(recoverystate_perf);
  cct->get_perfcounters_collection()->remove(logger);
  delete recoverystate_perf;
  delete logger;
  delete store;
}
2376
2377 double OSD::get_tick_interval() const
2378 {
2379 // vary +/- 5% to avoid scrub scheduling livelocks
2380 constexpr auto delta = 0.05;
2381 return (OSD_TICK_INTERVAL *
2382 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2383 }
2384
// Forward declaration (implemented elsewhere); hands the OSD's
// ClassHandler to the objclass subsystem.
void cls_initialize(ClassHandler *ch);
2386
2387 void OSD::handle_signal(int signum)
2388 {
2389 ceph_assert(signum == SIGINT || signum == SIGTERM);
2390 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2391 shutdown();
2392 }
2393
2394 int OSD::pre_init()
2395 {
2396 std::lock_guard lock(osd_lock);
2397 if (is_stopping())
2398 return 0;
2399
2400 if (store->test_mount_in_use()) {
2401 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2402 << "currently in use. (Is ceph-osd already running?)" << dendl;
2403 return -EBUSY;
2404 }
2405
2406 cct->_conf.add_observer(this);
2407 return 0;
2408 }
2409
// Best-effort NUMA pinning: if the objectstore and both network
// interfaces resolve to the same NUMA node (and osd_numa_auto_affinity
// is enabled), or osd_numa_node is configured explicitly, restrict this
// process's CPU affinity to that node's CPUs.  All failures are logged
// and degrade to "no affinity"; the function always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    // the cluster interface is only probed when the public one resolved
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    }
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // NOTE: this block-local r shadows the one above.
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = sched_setaffinity(getpid(), numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2478
2479 // asok
2480
2481 class OSDSocketHook : public AdminSocketHook {
2482 OSD *osd;
2483 public:
2484 explicit OSDSocketHook(OSD *o) : osd(o) {}
2485 bool call(std::string_view admin_command, const cmdmap_t& cmdmap,
2486 std::string_view format, bufferlist& out) override {
2487 stringstream ss;
2488 bool r = true;
2489 try {
2490 r = osd->asok_command(admin_command, cmdmap, format, ss);
2491 } catch (const bad_cmd_get& e) {
2492 ss << e.what();
2493 r = true;
2494 }
2495 out.append(ss);
2496 return r;
2497 }
2498 };
2499
2500 std::set<int64_t> OSD::get_mapped_pools()
2501 {
2502 std::set<int64_t> pools;
2503 std::vector<spg_t> pgids;
2504 _get_pgids(&pgids);
2505 for (const auto &pgid : pgids) {
2506 pools.insert(pgid.pool());
2507 }
2508 return pools;
2509 }
2510
2511 bool OSD::asok_command(std::string_view admin_command, const cmdmap_t& cmdmap,
2512 std::string_view format, ostream& ss)
2513 {
2514 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2515 if (admin_command == "status") {
2516 f->open_object_section("status");
2517 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2518 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2519 f->dump_unsigned("whoami", superblock.whoami);
2520 f->dump_string("state", get_state_name(get_state()));
2521 f->dump_unsigned("oldest_map", superblock.oldest_map);
2522 f->dump_unsigned("newest_map", superblock.newest_map);
2523 f->dump_unsigned("num_pgs", num_pgs);
2524 f->close_section();
2525 } else if (admin_command == "flush_journal") {
2526 store->flush_journal();
2527 } else if (admin_command == "dump_ops_in_flight" ||
2528 admin_command == "ops" ||
2529 admin_command == "dump_blocked_ops" ||
2530 admin_command == "dump_historic_ops" ||
2531 admin_command == "dump_historic_ops_by_duration" ||
2532 admin_command == "dump_historic_slow_ops") {
2533
2534 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2535 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2536 will start to track new ops received afterwards.";
2537
2538 set<string> filters;
2539 vector<string> filter_str;
2540 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2541 copy(filter_str.begin(), filter_str.end(),
2542 inserter(filters, filters.end()));
2543 }
2544
2545 if (admin_command == "dump_ops_in_flight" ||
2546 admin_command == "ops") {
2547 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2548 ss << error_str;
2549 }
2550 }
2551 if (admin_command == "dump_blocked_ops") {
2552 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2553 ss << error_str;
2554 }
2555 }
2556 if (admin_command == "dump_historic_ops") {
2557 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2558 ss << error_str;
2559 }
2560 }
2561 if (admin_command == "dump_historic_ops_by_duration") {
2562 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2563 ss << error_str;
2564 }
2565 }
2566 if (admin_command == "dump_historic_slow_ops") {
2567 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2568 ss << error_str;
2569 }
2570 }
2571 } else if (admin_command == "dump_op_pq_state") {
2572 f->open_object_section("pq");
2573 op_shardedwq.dump(f);
2574 f->close_section();
2575 } else if (admin_command == "dump_blacklist") {
2576 list<pair<entity_addr_t,utime_t> > bl;
2577 OSDMapRef curmap = service.get_osdmap();
2578
2579 f->open_array_section("blacklist");
2580 curmap->get_blacklist(&bl);
2581 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2582 it != bl.end(); ++it) {
2583 f->open_object_section("entry");
2584 f->open_object_section("entity_addr_t");
2585 it->first.dump(f);
2586 f->close_section(); //entity_addr_t
2587 it->second.localtime(f->dump_stream("expire_time"));
2588 f->close_section(); //entry
2589 }
2590 f->close_section(); //blacklist
2591 } else if (admin_command == "dump_watchers") {
2592 list<obj_watch_item_t> watchers;
2593 // scan pg's
2594 vector<PGRef> pgs;
2595 _get_pgs(&pgs);
2596 for (auto& pg : pgs) {
2597 list<obj_watch_item_t> pg_watchers;
2598 pg->get_watchers(&pg_watchers);
2599 watchers.splice(watchers.end(), pg_watchers);
2600 }
2601
2602 f->open_array_section("watchers");
2603 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2604 it != watchers.end(); ++it) {
2605
2606 f->open_object_section("watch");
2607
2608 f->dump_string("namespace", it->obj.nspace);
2609 f->dump_string("object", it->obj.oid.name);
2610
2611 f->open_object_section("entity_name");
2612 it->wi.name.dump(f);
2613 f->close_section(); //entity_name_t
2614
2615 f->dump_unsigned("cookie", it->wi.cookie);
2616 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2617
2618 f->open_object_section("entity_addr_t");
2619 it->wi.addr.dump(f);
2620 f->close_section(); //entity_addr_t
2621
2622 f->close_section(); //watch
2623 }
2624
2625 f->close_section(); //watchers
2626 } else if (admin_command == "dump_recovery_reservations") {
2627 f->open_object_section("reservations");
2628 f->open_object_section("local_reservations");
2629 service.local_reserver.dump(f);
2630 f->close_section();
2631 f->open_object_section("remote_reservations");
2632 service.remote_reserver.dump(f);
2633 f->close_section();
2634 f->close_section();
2635 } else if (admin_command == "dump_scrub_reservations") {
2636 f->open_object_section("scrub_reservations");
2637 service.dump_scrub_reservations(f);
2638 f->close_section();
2639 } else if (admin_command == "get_latest_osdmap") {
2640 get_latest_osdmap();
2641 } else if (admin_command == "heap") {
2642 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2643
2644 // Note: Failed heap profile commands won't necessarily trigger an error:
2645 f->open_object_section("result");
2646 f->dump_string("error", cpp_strerror(result));
2647 f->dump_bool("success", result >= 0);
2648 f->close_section();
2649 } else if (admin_command == "set_heap_property") {
2650 string property;
2651 int64_t value = 0;
2652 string error;
2653 bool success = false;
2654 if (!cmd_getval(cct, cmdmap, "property", property)) {
2655 error = "unable to get property";
2656 success = false;
2657 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2658 error = "unable to get value";
2659 success = false;
2660 } else if (value < 0) {
2661 error = "negative value not allowed";
2662 success = false;
2663 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2664 error = "invalid property";
2665 success = false;
2666 } else {
2667 success = true;
2668 }
2669 f->open_object_section("result");
2670 f->dump_string("error", error);
2671 f->dump_bool("success", success);
2672 f->close_section();
2673 } else if (admin_command == "get_heap_property") {
2674 string property;
2675 size_t value = 0;
2676 string error;
2677 bool success = false;
2678 if (!cmd_getval(cct, cmdmap, "property", property)) {
2679 error = "unable to get property";
2680 success = false;
2681 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2682 error = "invalid property";
2683 success = false;
2684 } else {
2685 success = true;
2686 }
2687 f->open_object_section("result");
2688 f->dump_string("error", error);
2689 f->dump_bool("success", success);
2690 f->dump_int("value", value);
2691 f->close_section();
2692 } else if (admin_command == "dump_objectstore_kv_stats") {
2693 store->get_db_statistics(f);
2694 } else if (admin_command == "dump_scrubs") {
2695 service.dumps_scrub(f);
2696 } else if (admin_command == "calc_objectstore_db_histogram") {
2697 store->generate_db_histogram(f);
2698 } else if (admin_command == "flush_store_cache") {
2699 store->flush_cache(&ss);
2700 } else if (admin_command == "dump_pgstate_history") {
2701 f->open_object_section("pgstate_history");
2702 vector<PGRef> pgs;
2703 _get_pgs(&pgs);
2704 for (auto& pg : pgs) {
2705 f->dump_stream("pg") << pg->pg_id;
2706 pg->dump_pgstate_history(f);
2707 }
2708 f->close_section();
2709 } else if (admin_command == "compact") {
2710 dout(1) << "triggering manual compaction" << dendl;
2711 auto start = ceph::coarse_mono_clock::now();
2712 store->compact();
2713 auto end = ceph::coarse_mono_clock::now();
2714 double duration = std::chrono::duration<double>(end-start).count();
2715 dout(1) << "finished manual compaction in "
2716 << duration
2717 << " seconds" << dendl;
2718 f->open_object_section("compact_result");
2719 f->dump_float("elapsed_time", duration);
2720 f->close_section();
2721 } else if (admin_command == "get_mapped_pools") {
2722 f->open_array_section("mapped_pools");
2723 set<int64_t> poollist = get_mapped_pools();
2724 for (auto pool : poollist) {
2725 f->dump_int("pool_id", pool);
2726 }
2727 f->close_section();
2728 } else if (admin_command == "smart") {
2729 string devid;
2730 cmd_getval(cct, cmdmap, "devid", devid);
2731 probe_smart(devid, ss);
2732 } else if (admin_command == "list_devices") {
2733 set<string> devnames;
2734 store->get_devices(&devnames);
2735 f->open_object_section("list_devices");
2736 for (auto dev : devnames) {
2737 if (dev.find("dm-") == 0) {
2738 continue;
2739 }
2740 f->dump_string("device", "/dev/" + dev);
2741 }
2742 f->close_section();
2743 } else if (admin_command == "send_beacon") {
2744 if (is_active()) {
2745 send_beacon(ceph::coarse_mono_clock::now());
2746 }
2747 } else if (admin_command == "dump_osd_network") {
2748 int64_t value = 0;
2749 if (!(cmd_getval(cct, cmdmap, "value", value))) {
2750 // Convert milliseconds to microseconds
2751 value = static_cast<int64_t>(g_conf().get_val<double>("mon_warn_on_slow_ping_time")) * 1000;
2752 if (value == 0) {
2753 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2754 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2755 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2756 }
2757 } else {
2758 // Convert user input to microseconds
2759 value *= 1000;
2760 }
2761 if (value < 0) value = 0;
2762
2763 struct osd_ping_time_t {
2764 uint32_t pingtime;
2765 int to;
2766 bool back;
2767 std::array<uint32_t,3> times;
2768 std::array<uint32_t,3> min;
2769 std::array<uint32_t,3> max;
2770 uint32_t last;
2771 uint32_t last_update;
2772
2773 bool operator<(const osd_ping_time_t& rhs) const {
2774 if (pingtime < rhs.pingtime)
2775 return true;
2776 if (pingtime > rhs.pingtime)
2777 return false;
2778 if (to < rhs.to)
2779 return true;
2780 if (to > rhs.to)
2781 return false;
2782 return back;
2783 }
2784 };
2785
2786 set<osd_ping_time_t> sorted;
2787 // Get pingtimes under lock and not on the stack
2788 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
2789 service.get_hb_pingtime(pingtimes);
2790 for (auto j : *pingtimes) {
2791 if (j.second.last_update == 0)
2792 continue;
2793 osd_ping_time_t item;
2794 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
2795 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
2796 if (item.pingtime >= value) {
2797 item.to = j.first;
2798 item.times[0] = j.second.back_pingtime[0];
2799 item.times[1] = j.second.back_pingtime[1];
2800 item.times[2] = j.second.back_pingtime[2];
2801 item.min[0] = j.second.back_min[0];
2802 item.min[1] = j.second.back_min[1];
2803 item.min[2] = j.second.back_min[2];
2804 item.max[0] = j.second.back_max[0];
2805 item.max[1] = j.second.back_max[1];
2806 item.max[2] = j.second.back_max[2];
2807 item.last = j.second.back_last;
2808 item.back = true;
2809 item.last_update = j.second.last_update;
2810 sorted.emplace(item);
2811 }
2812 if (j.second.front_last == 0)
2813 continue;
2814 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
2815 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
2816 if (item.pingtime >= value) {
2817 item.to = j.first;
2818 item.times[0] = j.second.front_pingtime[0];
2819 item.times[1] = j.second.front_pingtime[1];
2820 item.times[2] = j.second.front_pingtime[2];
2821 item.min[0] = j.second.front_min[0];
2822 item.min[1] = j.second.front_min[1];
2823 item.min[2] = j.second.front_min[2];
2824 item.max[0] = j.second.front_max[0];
2825 item.max[1] = j.second.front_max[1];
2826 item.max[2] = j.second.front_max[2];
2827 item.last = j.second.front_last;
2828 item.last_update = j.second.last_update;
2829 item.back = false;
2830 sorted.emplace(item);
2831 }
2832 }
2833 delete pingtimes;
2834 //
2835 // Network ping times (1min 5min 15min)
2836 f->open_object_section("network_ping_times");
2837 f->dump_int("threshold", value / 1000);
2838 f->open_array_section("entries");
2839 for (auto &sitem : boost::adaptors::reverse(sorted)) {
2840 ceph_assert(sitem.pingtime >= value);
2841 f->open_object_section("entry");
2842
2843 const time_t lu(sitem.last_update);
2844 char buffer[26];
2845 string lustr(ctime_r(&lu, buffer));
2846 lustr.pop_back(); // Remove trailing \n
2847 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
2848 f->dump_string("last update", lustr);
2849 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
2850 f->dump_int("from osd", whoami);
2851 f->dump_int("to osd", sitem.to);
2852 f->dump_string("interface", (sitem.back ? "back" : "front"));
2853 f->open_object_section("average");
2854 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
2855 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
2856 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
2857 f->close_section(); // average
2858 f->open_object_section("min");
2859 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2860 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2861 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2862 f->close_section(); // min
2863 f->open_object_section("max");
2864 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
2865 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
2866 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
2867 f->close_section(); // max
2868 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
2869 f->close_section(); // entry
2870 }
2871 f->close_section(); // entries
2872 f->close_section(); // network_ping_times
2873 } else {
2874 ceph_abort_msg("broken asok registration");
2875 }
2876 f->flush(ss);
2877 delete f;
2878 return true;
2879 }
2880
2881 class TestOpsSocketHook : public AdminSocketHook {
2882 OSDService *service;
2883 ObjectStore *store;
2884 public:
2885 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2886 bool call(std::string_view command, const cmdmap_t& cmdmap,
2887 std::string_view format, bufferlist& out) override {
2888 stringstream ss;
2889 try {
2890 test_ops(service, store, command, cmdmap, ss);
2891 } catch (const bad_cmd_get& e) {
2892 ss << e.what();
2893 }
2894 out.append(ss);
2895 return true;
2896 }
2897 void test_ops(OSDService *service, ObjectStore *store,
2898 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
2899
2900 };
2901
// Timer callback that drives OSD::tick() (scheduled via tick_timer,
// which was constructed against osd_lock).
class OSD::C_Tick : public Context {
  OSD *osd;
public:
  explicit C_Tick(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick();
  }
};
2910
// Timer callback that drives OSD::tick_without_osd_lock() (scheduled
// via tick_timer_without_osd_lock, which uses its own tick_timer_lock).
class OSD::C_Tick_WithoutOSDLock : public Context {
  OSD *osd;
public:
  explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
  void finish(int r) override {
    osd->tick_without_osd_lock();
  }
};
2919
// Mount or unmount the FuseStore view of the objectstore at
// $osd_data/fuse, driven by the osd_objectstore_fuse config option;
// 'stop' forces teardown regardless of config.  Compiled to a no-op
// (returning 0) when built without libfuse.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    // teardown: stop and delete the FuseStore, then remove the mountpoint
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    // a pre-existing mount directory is fine
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif  // HAVE_LIBFUSE
  return 0;
}
2961
2962 int OSD::get_num_op_shards()
2963 {
2964 if (cct->_conf->osd_op_num_shards)
2965 return cct->_conf->osd_op_num_shards;
2966 if (store_is_rotational)
2967 return cct->_conf->osd_op_num_shards_hdd;
2968 else
2969 return cct->_conf->osd_op_num_shards_ssd;
2970 }
2971
2972 int OSD::get_num_op_threads()
2973 {
2974 if (cct->_conf->osd_op_num_threads_per_shard)
2975 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2976 if (store_is_rotational)
2977 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2978 else
2979 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2980 }
2981
2982 float OSD::get_osd_recovery_sleep()
2983 {
2984 if (cct->_conf->osd_recovery_sleep)
2985 return cct->_conf->osd_recovery_sleep;
2986 if (!store_is_rotational && !journal_is_rotational)
2987 return cct->_conf->osd_recovery_sleep_ssd;
2988 else if (store_is_rotational && !journal_is_rotational)
2989 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
2990 else
2991 return cct->_conf->osd_recovery_sleep_hdd;
2992 }
2993
2994 float OSD::get_osd_delete_sleep()
2995 {
2996 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
2997 if (osd_delete_sleep > 0)
2998 return osd_delete_sleep;
2999 if (!store_is_rotational && !journal_is_rotational)
3000 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3001 if (store_is_rotational && !journal_is_rotational)
3002 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3003 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3004 }
3005
3006 float OSD::get_osd_snap_trim_sleep()
3007 {
3008 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3009 if (osd_snap_trim_sleep > 0)
3010 return osd_snap_trim_sleep;
3011 if (!store_is_rotational && !journal_is_rotational)
3012 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3013 if (store_is_rotational && !journal_is_rotational)
3014 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3015 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3016 }
3017
// Bring the OSD daemon up: mount the object store, validate the
// superblock and compat features, load PGs, wire up messengers and
// the mon/mgr clients, authenticate, and kick off the boot process.
//
// Called once at startup with no locks held; takes osd_lock for the
// duration except around the blocking authentication phase (see the
// explicit Unlock()/Lock() below).  Returns 0 on success or when the
// OSD is already stopping; on failure returns a negative errno after
// unmounting and deleting the store (via the `out:` label).
int OSD::init()
{
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  // timers and the boot finisher must exist before anything can
  // schedule work on them
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = atoi(val.c_str());
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // only known after mount()
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling: build a maximally-long
  // hobject and let the backend veto it up front rather than failing
  // on a client write later
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against a store written with features this binary
  // does not understand
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // sanity: the store must belong to this osd id
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // rados class (cls_*) plugin infrastructure
  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  // we need mon, osd, and mgr keys to operate
  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // wire the mgr client callbacks before starting it
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
      set_perf_queries(queries);
    },
    [this](std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
      get_perf_reports(reports);
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime each shard with any pending PG splits/merges implied by the
  // gap between each PG's map epoch and the current osdmap
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      pg->lock();
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// every child must have been claimed by some shard
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	// every merge participant must have been claimed by some shard
	assert(merge_pgs.empty());
      }
      pg->unlock();
    }
  }

  osd_op_tp.start();
  command_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock across the blocking auth/crush setup phase so we do
  // not stall message dispatch; reacquired below
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.Lock();
  // we may have been asked to stop while the lock was dropped
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error path: tear down everything mounted/opened above
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3403
3404 void OSD::final_init()
3405 {
3406 AdminSocket *admin_socket = cct->get_admin_socket();
3407 asok_hook = new OSDSocketHook(this);
3408 int r = admin_socket->register_command("status", "status", asok_hook,
3409 "high-level status of OSD");
3410 ceph_assert(r == 0);
3411 r = admin_socket->register_command("flush_journal", "flush_journal",
3412 asok_hook,
3413 "flush the journal to permanent store");
3414 ceph_assert(r == 0);
3415 r = admin_socket->register_command("dump_ops_in_flight",
3416 "dump_ops_in_flight " \
3417 "name=filterstr,type=CephString,n=N,req=false",
3418 asok_hook,
3419 "show the ops currently in flight");
3420 ceph_assert(r == 0);
3421 r = admin_socket->register_command("ops",
3422 "ops " \
3423 "name=filterstr,type=CephString,n=N,req=false",
3424 asok_hook,
3425 "show the ops currently in flight");
3426 ceph_assert(r == 0);
3427 r = admin_socket->register_command("dump_blocked_ops",
3428 "dump_blocked_ops " \
3429 "name=filterstr,type=CephString,n=N,req=false",
3430 asok_hook,
3431 "show the blocked ops currently in flight");
3432 ceph_assert(r == 0);
3433 r = admin_socket->register_command("dump_historic_ops",
3434 "dump_historic_ops " \
3435 "name=filterstr,type=CephString,n=N,req=false",
3436 asok_hook,
3437 "show recent ops");
3438 ceph_assert(r == 0);
3439 r = admin_socket->register_command("dump_historic_slow_ops",
3440 "dump_historic_slow_ops " \
3441 "name=filterstr,type=CephString,n=N,req=false",
3442 asok_hook,
3443 "show slowest recent ops");
3444 ceph_assert(r == 0);
3445 r = admin_socket->register_command("dump_historic_ops_by_duration",
3446 "dump_historic_ops_by_duration " \
3447 "name=filterstr,type=CephString,n=N,req=false",
3448 asok_hook,
3449 "show slowest recent ops, sorted by duration");
3450 ceph_assert(r == 0);
3451 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
3452 asok_hook,
3453 "dump op priority queue state");
3454 ceph_assert(r == 0);
3455 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
3456 asok_hook,
3457 "dump blacklisted clients and times");
3458 ceph_assert(r == 0);
3459 r = admin_socket->register_command("dump_watchers", "dump_watchers",
3460 asok_hook,
3461 "show clients which have active watches,"
3462 " and on which objects");
3463 ceph_assert(r == 0);
3464 r = admin_socket->register_command("dump_recovery_reservations", "dump_recovery_reservations",
3465 asok_hook,
3466 "show recovery reservations");
3467 ceph_assert(r == 0);
3468 r = admin_socket->register_command("dump_scrub_reservations", "dump_scrub_reservations",
3469 asok_hook,
3470 "show scrub reservations");
3471 ceph_assert(r == 0);
3472 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
3473 asok_hook,
3474 "force osd to update the latest map from "
3475 "the mon");
3476 ceph_assert(r == 0);
3477
3478 r = admin_socket->register_command( "heap",
3479 "heap " \
3480 "name=heapcmd,type=CephString " \
3481 "name=value,type=CephString,req=false",
3482 asok_hook,
3483 "show heap usage info (available only if "
3484 "compiled with tcmalloc)");
3485 ceph_assert(r == 0);
3486
3487 r = admin_socket->register_command("set_heap_property",
3488 "set_heap_property " \
3489 "name=property,type=CephString " \
3490 "name=value,type=CephInt",
3491 asok_hook,
3492 "update malloc extension heap property");
3493 ceph_assert(r == 0);
3494
3495 r = admin_socket->register_command("get_heap_property",
3496 "get_heap_property " \
3497 "name=property,type=CephString",
3498 asok_hook,
3499 "get malloc extension heap property");
3500 ceph_assert(r == 0);
3501
3502 r = admin_socket->register_command("dump_objectstore_kv_stats",
3503 "dump_objectstore_kv_stats",
3504 asok_hook,
3505 "print statistics of kvdb which used by bluestore");
3506 ceph_assert(r == 0);
3507
3508 r = admin_socket->register_command("dump_scrubs",
3509 "dump_scrubs",
3510 asok_hook,
3511 "print scheduled scrubs");
3512 ceph_assert(r == 0);
3513
3514 r = admin_socket->register_command("calc_objectstore_db_histogram",
3515 "calc_objectstore_db_histogram",
3516 asok_hook,
3517 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3518 ceph_assert(r == 0);
3519
3520 r = admin_socket->register_command("flush_store_cache",
3521 "flush_store_cache",
3522 asok_hook,
3523 "Flush bluestore internal cache");
3524 ceph_assert(r == 0);
3525 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
3526 asok_hook,
3527 "show recent state history");
3528 ceph_assert(r == 0);
3529
3530 r = admin_socket->register_command("compact", "compact",
3531 asok_hook,
3532 "Commpact object store's omap."
3533 " WARNING: Compaction probably slows your requests");
3534 ceph_assert(r == 0);
3535
3536 r = admin_socket->register_command("get_mapped_pools", "get_mapped_pools",
3537 asok_hook,
3538 "dump pools whose PG(s) are mapped to this OSD.");
3539
3540 ceph_assert(r == 0);
3541
3542 r = admin_socket->register_command("smart", "smart name=devid,type=CephString,req=False",
3543 asok_hook,
3544 "probe OSD devices for SMART data.");
3545
3546 ceph_assert(r == 0);
3547
3548 r = admin_socket->register_command("list_devices", "list_devices",
3549 asok_hook,
3550 "list OSD devices.");
3551 r = admin_socket->register_command("send_beacon", "send_beacon",
3552 asok_hook,
3553 "send OSD beacon to mon immediately");
3554
3555 r = admin_socket->register_command("dump_osd_network", "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3556 "Dump osd heartbeat network ping times");
3557 ceph_assert(r == 0);
3558
3559 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3560 // Note: pools are CephString instead of CephPoolname because
3561 // these commands traditionally support both pool names and numbers
3562 r = admin_socket->register_command(
3563 "setomapval",
3564 "setomapval " \
3565 "name=pool,type=CephString " \
3566 "name=objname,type=CephObjectname " \
3567 "name=key,type=CephString "\
3568 "name=val,type=CephString",
3569 test_ops_hook,
3570 "set omap key");
3571 ceph_assert(r == 0);
3572 r = admin_socket->register_command(
3573 "rmomapkey",
3574 "rmomapkey " \
3575 "name=pool,type=CephString " \
3576 "name=objname,type=CephObjectname " \
3577 "name=key,type=CephString",
3578 test_ops_hook,
3579 "remove omap key");
3580 ceph_assert(r == 0);
3581 r = admin_socket->register_command(
3582 "setomapheader",
3583 "setomapheader " \
3584 "name=pool,type=CephString " \
3585 "name=objname,type=CephObjectname " \
3586 "name=header,type=CephString",
3587 test_ops_hook,
3588 "set omap header");
3589 ceph_assert(r == 0);
3590
3591 r = admin_socket->register_command(
3592 "getomap",
3593 "getomap " \
3594 "name=pool,type=CephString " \
3595 "name=objname,type=CephObjectname",
3596 test_ops_hook,
3597 "output entire object map");
3598 ceph_assert(r == 0);
3599
3600 r = admin_socket->register_command(
3601 "truncobj",
3602 "truncobj " \
3603 "name=pool,type=CephString " \
3604 "name=objname,type=CephObjectname " \
3605 "name=len,type=CephInt",
3606 test_ops_hook,
3607 "truncate object to length");
3608 ceph_assert(r == 0);
3609
3610 r = admin_socket->register_command(
3611 "injectdataerr",
3612 "injectdataerr " \
3613 "name=pool,type=CephString " \
3614 "name=objname,type=CephObjectname " \
3615 "name=shardid,type=CephInt,req=false,range=0|255",
3616 test_ops_hook,
3617 "inject data error to an object");
3618 ceph_assert(r == 0);
3619
3620 r = admin_socket->register_command(
3621 "injectmdataerr",
3622 "injectmdataerr " \
3623 "name=pool,type=CephString " \
3624 "name=objname,type=CephObjectname " \
3625 "name=shardid,type=CephInt,req=false,range=0|255",
3626 test_ops_hook,
3627 "inject metadata error to an object");
3628 ceph_assert(r == 0);
3629 r = admin_socket->register_command(
3630 "set_recovery_delay",
3631 "set_recovery_delay " \
3632 "name=utime,type=CephInt,req=false",
3633 test_ops_hook,
3634 "Delay osd recovery by specified seconds");
3635 ceph_assert(r == 0);
3636 r = admin_socket->register_command(
3637 "trigger_scrub",
3638 "trigger_scrub " \
3639 "name=pgid,type=CephString " \
3640 "name=time,type=CephInt,req=false",
3641 test_ops_hook,
3642 "Trigger a scheduled scrub ");
3643 ceph_assert(r == 0);
3644 r = admin_socket->register_command(
3645 "trigger_deep_scrub",
3646 "trigger_deep_scrub " \
3647 "name=pgid,type=CephString " \
3648 "name=time,type=CephInt,req=false",
3649 test_ops_hook,
3650 "Trigger a scheduled deep scrub ");
3651 ceph_assert(r == 0);
3652 r = admin_socket->register_command(
3653 "injectfull",
3654 "injectfull " \
3655 "name=type,type=CephString,req=false " \
3656 "name=count,type=CephInt,req=false ",
3657 test_ops_hook,
3658 "Inject a full disk (optional count times)");
3659 ceph_assert(r == 0);
3660 }
3661
// Build and register the "osd" PerfCounters instance (stored in
// `logger`).  Counters are declared in priority order: the basic
// client-op counters first at PRIO_USEFUL (or higher where noted),
// then the more obscure internals at PRIO_DEBUGONLY.  Called once
// from init(); the counters are published via the cct perf
// counters collection at the end.
void OSD::create_logger()
{
  dout(10) << "create_logger" << dendl;

  PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);

  // Latency axis configuration for op histograms, values are in nanoseconds
  PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
    "Latency (usec)",
    PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
    0,                               ///< Start at 0
    100000,                          ///< Quantization unit is 100usec
    32,                              ///< Enough to cover much longer than slow requests
  };

  // Op size axis configuration for op histograms, values are in bytes
  PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
    "Request size (bytes)",
    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
    0,                               ///< Start at 0
    512,                             ///< Quantization unit is 512 bytes
    32,                              ///< Enough to cover requests larger than GB
  };


  // All the basic OSD operation stats are to be considered useful
  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);

  // ---- aggregate client op counters and latencies ----
  osd_plb.add_u64(
    l_osd_op_wip, "op_wip",
    "Replication operations currently being processed (primary)");
  osd_plb.add_u64_counter(
    l_osd_op, "op",
    "Client operations",
    "ops", PerfCountersBuilder::PRIO_CRITICAL);
  osd_plb.add_u64_counter(
    l_osd_op_inb, "op_in_bytes",
    "Client operations total write size",
    "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_op_outb, "op_out_bytes",
    "Client operations total read size",
    "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_lat, "op_latency",
    "Latency of client operations (including queue time)",
    "l", 9);
  osd_plb.add_time_avg(
    l_osd_op_process_lat, "op_process_latency",
    "Latency of client operations (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_prepare_lat, "op_prepare_latency",
    "Latency of client operations (excluding queue time and wait for finished)");

  // ---- per-op-type (read / write / read-modify-write) breakdowns ----
  osd_plb.add_u64_counter(
    l_osd_op_r, "op_r", "Client read operations");
  osd_plb.add_u64_counter(
    l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_r_lat, "op_r_latency",
    "Latency of read operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_r_process_lat, "op_r_process_latency",
    "Latency of read operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_r_prepare_lat, "op_r_prepare_latency",
    "Latency of read operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_w, "op_w", "Client write operations");
  osd_plb.add_u64_counter(
    l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
  osd_plb.add_time_avg(
    l_osd_op_w_lat, "op_w_latency",
    "Latency of write operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data written");
  osd_plb.add_time_avg(
    l_osd_op_w_process_lat, "op_w_process_latency",
    "Latency of write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_w_prepare_lat, "op_w_prepare_latency",
    "Latency of write operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_rw, "op_rw",
    "Client read-modify-write operations");
  osd_plb.add_u64_counter(
    l_osd_op_rw_inb, "op_rw_in_bytes",
    "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64_counter(
    l_osd_op_rw_outb,"op_rw_out_bytes",
    "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_op_rw_lat, "op_rw_latency",
    "Latency of read-modify-write operation (including queue time)");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data written");
  osd_plb.add_u64_counter_histogram(
    l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_rw_process_lat, "op_rw_process_latency",
    "Latency of read-modify-write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
    "Latency of read-modify-write operations (excluding queue time and wait for finished)");

  // Now we move on to some more obscure stats, revert to assuming things
  // are low priority unless otherwise specified.
  osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);

  osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
                       "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
  osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
                       "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency

  // ---- replication suboperations ----
  osd_plb.add_u64_counter(
    l_osd_sop, "subop", "Suboperations");
  osd_plb.add_u64_counter(
    l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");

  osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
  osd_plb.add_u64_counter(
    l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
  osd_plb.add_u64_counter(
    l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
  osd_plb.add_time_avg(
    l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
  osd_plb.add_u64_counter(
    l_osd_sop_push, "subop_push", "Suboperations push messages");
  osd_plb.add_u64_counter(
    l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(UNIT_BYTES));
  osd_plb.add_time_avg(
    l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");

  // ---- recovery traffic ----
  osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
  osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
  osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(UNIT_BYTES));

  osd_plb.add_u64_counter(
    l_osd_rop, "recovery_ops",
    "Started recovery operations",
    "rop", PerfCountersBuilder::PRIO_INTERESTING);

  osd_plb.add_u64_counter(
    l_osd_rbytes, "recovery_bytes",
    "recovery bytes",
    "rbt", PerfCountersBuilder::PRIO_INTERESTING);

  // ---- host load and crc cache ----
  osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
  osd_plb.add_u64(
    l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
  osd_plb.add_u64(
    l_osd_cached_crc_adjusted, "cached_crc_adjusted",
    "Total number getting crc from crc_cache with adjusting");
  osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
                  "Total number of crc cache misses");

  // ---- PG population gauges ----
  osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
                  "pgs", PerfCountersBuilder::PRIO_USEFUL);
  osd_plb.add_u64(
    l_osd_pg_primary, "numpg_primary",
    "Placement groups for which this osd is primary");
  osd_plb.add_u64(
    l_osd_pg_replica, "numpg_replica",
    "Placement groups for which this osd is replica");
  osd_plb.add_u64(
    l_osd_pg_stray, "numpg_stray",
    "Placement groups ready to be deleted from this osd");
  osd_plb.add_u64(
    l_osd_pg_removing, "numpg_removing",
    "Placement groups queued for local deletion", "pgsr",
    PerfCountersBuilder::PRIO_USEFUL);
  osd_plb.add_u64(
    l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
  osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
  osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
  osd_plb.add_u64_counter(
    l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
  osd_plb.add_u64_counter(
    l_osd_waiting_for_map, "messages_delayed_for_map",
    "Operations waiting for OSD map");

  // ---- osdmap cache effectiveness ----
  osd_plb.add_u64_counter(
    l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
    "osdmap cache miss below cache lower bound");
  osd_plb.add_u64_avg(
    l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
    "osdmap cache miss, avg distance below cache lower bound");
  osd_plb.add_u64_counter(
    l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
    "OSDMap buffer cache hits");
  osd_plb.add_u64_counter(
    l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
    "OSDMap buffer cache misses");

  // ---- storage capacity gauges ----
  osd_plb.add_u64(
    l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64(
    l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(UNIT_BYTES));

  osd_plb.add_u64_counter(
    l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");

  // ---- cache tiering ----
  osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
  osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush_fail, "tier_try_flush_fail",
    "Failed tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_evict, "tier_evict", "Tier evictions");
  osd_plb.add_u64_counter(
    l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
  osd_plb.add_u64_counter(
    l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
  osd_plb.add_u64_counter(
    l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
  osd_plb.add_u64_counter(
    l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");

  // ---- tiering agent activity ----
  osd_plb.add_u64_counter(
    l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
  osd_plb.add_u64_counter(
    l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
  osd_plb.add_u64_counter(
    l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
  osd_plb.add_u64_counter(
    l_osd_agent_evict, "agent_evict", "Tiering agent evictions");

  // ---- object context cache ----
  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");

  osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
  osd_plb.add_time_avg(
    l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
  osd_plb.add_time_avg(
    l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
  osd_plb.add_time_avg(
    l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");

  // ---- PG info persistence paths ----
  osd_plb.add_u64_counter(
    l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
  osd_plb.add_u64_counter(
    l_osd_pg_fastinfo, "osd_pg_fastinfo",
    "PG updated its info using fastinfo attr");
  osd_plb.add_u64_counter(
    l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");

  // build the counters and publish them to the cct collection
  logger = osd_plb.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
3941
3942 void OSD::create_recoverystate_perf()
3943 {
3944 dout(10) << "create_recoverystate_perf" << dendl;
3945
3946 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3947
3948 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3949 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3950 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3951 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3952 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3953 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3954 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3955 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3956 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3957 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3958 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3959 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3960 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3961 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3962 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3963 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3964 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3965 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3966 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3967 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3968 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3969 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3970 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3971 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3972 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3973 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3974 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3975 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3976 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3977 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3978 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3979
3980 recoverystate_perf = rs_perf.create_perf_counters();
3981 cct->get_perfcounters_collection()->add(recoverystate_perf);
3982 }
3983
// Orderly shutdown of the entire OSD: stop accepting new work, drain
// queues, shut down PGs, persist the superblock, then tear down threads,
// timers, messengers and the object store.  Returns the result of the
// superblock write (0 on success).  Safe to call concurrently; only the
// first caller proceeds past the STATE_STOPPING check.
int OSD::shutdown()
{
  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.Lock();
  if (is_stopping()) {
    osd_lock.Unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.Unlock();

  // stop the heartbeat thread (osd_lock must be dropped: the thread may
  // be blocked acquiring locks of its own)
  heartbeat_lock.Lock();
  heartbeat_stop = true;
  heartbeat_cond.Signal();
  heartbeat_lock.Unlock();
  heartbeat_thread.join();

  // stop worker thread pools once no more work can be queued
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  command_tp.drain();
  command_tp.stop();
  dout(10) << "command tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.Lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = osdmap->get_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // detach every PG from its shard slot; loop because kicking a PG can
  // release references and expose more work
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// someone still holds a reference; that is a leak at this point
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // NOTE(review): osd_lock is dropped around remove_observer(), presumably
  // to avoid lock-ordering issues with the config observer lock -- confirm.
  osd_lock.Unlock();
  cct->_conf.remove_observer(this);
  osd_lock.Lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.Unlock();

  // drop our reference to the current osdmap
  map_lock.get_write();
  osdmap = OSDMapRef();
  map_lock.put_write();

  // drop each shard's cached osdmap reference as well
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // re-take osd_lock (scoped this time) for the final store teardown
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  class_handler->shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // r is the superblock write result from above
  return r;
}
4169
4170 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4171 {
4172 bool created = false;
4173 while (true) {
4174 dout(10) << __func__ << " cmd: " << cmd << dendl;
4175 vector<string> vcmd{cmd};
4176 bufferlist inbl;
4177 C_SaferCond w;
4178 string outs;
4179 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4180 int r = w.wait();
4181 if (r < 0) {
4182 if (r == -ENOENT && !created) {
4183 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4184 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4185 vector<string> vnewcmd{newcmd};
4186 bufferlist inbl;
4187 C_SaferCond w;
4188 string outs;
4189 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4190 int r = w.wait();
4191 if (r < 0) {
4192 derr << __func__ << " fail: osd does not exist and created failed: "
4193 << cpp_strerror(r) << dendl;
4194 return r;
4195 }
4196 created = true;
4197 continue;
4198 }
4199 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4200 return r;
4201 }
4202 break;
4203 }
4204
4205 return 0;
4206 }
4207
4208 int OSD::update_crush_location()
4209 {
4210 if (!cct->_conf->osd_crush_update_on_start) {
4211 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4212 return 0;
4213 }
4214
4215 char weight[32];
4216 if (cct->_conf->osd_crush_initial_weight >= 0) {
4217 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4218 } else {
4219 struct store_statfs_t st;
4220 osd_alert_list_t alerts;
4221 int r = store->statfs(&st, &alerts);
4222 if (r < 0) {
4223 derr << "statfs: " << cpp_strerror(r) << dendl;
4224 return r;
4225 }
4226 snprintf(weight, sizeof(weight), "%.4lf",
4227 std::max(.00001,
4228 double(st.total) /
4229 double(1ull << 40 /* TB */)));
4230 }
4231
4232 std::multimap<string,string> loc = cct->crush_location.get_location();
4233 dout(10) << __func__ << " crush location is " << loc << dendl;
4234
4235 string cmd =
4236 string("{\"prefix\": \"osd crush create-or-move\", ") +
4237 string("\"id\": ") + stringify(whoami) + string(", ") +
4238 string("\"weight\":") + weight + string(", ") +
4239 string("\"args\": [");
4240 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
4241 if (p != loc.begin())
4242 cmd += ", ";
4243 cmd += "\"" + p->first + "=" + p->second + "\"";
4244 }
4245 cmd += "]}";
4246
4247 return mon_cmd_maybe_osd_create(cmd);
4248 }
4249
4250 int OSD::update_crush_device_class()
4251 {
4252 if (!cct->_conf->osd_class_update_on_start) {
4253 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4254 return 0;
4255 }
4256
4257 string device_class;
4258 int r = store->read_meta("crush_device_class", &device_class);
4259 if (r < 0 || device_class.empty()) {
4260 device_class = store->get_default_device_class();
4261 }
4262
4263 if (device_class.empty()) {
4264 dout(20) << __func__ << " no device class stored locally" << dendl;
4265 return 0;
4266 }
4267
4268 string cmd =
4269 string("{\"prefix\": \"osd crush set-device-class\", ") +
4270 string("\"class\": \"") + device_class + string("\", ") +
4271 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4272
4273 r = mon_cmd_maybe_osd_create(cmd);
4274 if (r == -EBUSY) {
4275 // good, already bound to a device-class
4276 return 0;
4277 } else {
4278 return r;
4279 }
4280 }
4281
// Serialize the in-memory superblock and stage the write into transaction
// t (the caller queues the transaction).  Also ensures the baseline
// incompat feature bit is always present before encoding.
void OSD::write_superblock(ObjectStore::Transaction& t)
{
  dout(10) << "write_superblock " << superblock << dendl;

  //hack: at minimum it's using the baseline feature set
  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);

  bufferlist bl;
  encode(superblock, bl);
  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
}
4294
// Read and decode the OSD superblock object from the meta collection into
// the |superblock| member.  Returns 0 on success or the negative errno
// from the store read.
int OSD::read_superblock()
{
  bufferlist bl;
  // len==0 means "read the whole object"
  int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
  if (r < 0)
    return r;

  auto p = bl.cbegin();
  decode(superblock, p);

  dout(10) << "read_superblock " << superblock << dendl;

  return 0;
}
4309
// Remove leftover temp objects from every PG collection.  Temp objects
// sort before regular objects in collection order, so listing from the
// start and stopping at the first non-temp object finds them all.
// Removals are batched into transactions bounded by
// osd_target_transaction_size.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      auto ch = store->open_collection(*p);
      ceph_assert(ch);
      store->collection_list(ch, next, ghobject_t::get_max(),
			     store->get_ideal_list_max(),
			     &objects, &next);
      if (objects.empty())
	break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
	// Hammer set pool for temps to -1, so check for clean-up
	if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
	  temps.push_back(*q);
	} else {
	  // temps sort first; the first non-temp object ends the scan
	  break;
	}
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
	break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
	dout(20) << "  removing " << *p << " object " << *q << dendl;
	t.remove(*p, *q);
	// flush the transaction once it grows past the configured batch size
	if (++removed > cct->_conf->osd_target_transaction_size) {
	  store->queue_transaction(service.meta_ch, std::move(t));
	  t = ObjectStore::Transaction();
	  removed = 0;
	}
      }
      if (removed) {
	store->queue_transaction(service.meta_ch, std::move(t));
      }
    }
  }
}
4366
// Delete every object in collection |tmp| (also clearing each object's
// snap-mapper entries) and finally remove the collection itself.  Work is
// batched into transactions of at most osd_target_transaction_size
// objects, and the function blocks until the last transaction commits.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // list the next batch of up to |max| objects, resuming at |next|
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      // drop any snap-mapper record for the object; -ENOENT just means
      // it had none
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // the collection is now empty; remove it
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the final transaction to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4411
4412
4413 // ======================================================
4414 // PG's
4415
// Construct (but do not initialize/register) a PG object for |pgid| using
// pool metadata from |createmap|.  If the pool has since been deleted, the
// final pg_pool_t is recovered from the on-disk tombstone object.  Returns
// nullptr when the pool info cannot be obtained; aborts on an unknown pool
// type.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
	   << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
	   << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(cct, createmap, pgid.pool(), pi, name);
  PG *pg;
  // both replicated and erasure pools are backed by PrimaryLogPG
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4460
4461 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4462 {
4463 v->clear();
4464 v->reserve(get_num_pgs());
4465 for (auto& s : shards) {
4466 std::lock_guard l(s->shard_lock);
4467 for (auto& j : s->pg_slots) {
4468 if (j.second->pg &&
4469 !j.second->pg->is_deleted()) {
4470 v->push_back(j.second->pg);
4471 if (clear_too) {
4472 s->_detach_pg(j.second.get());
4473 }
4474 }
4475 }
4476 }
4477 }
4478
4479 void OSD::_get_pgids(vector<spg_t> *v)
4480 {
4481 v->clear();
4482 v->reserve(get_num_pgs());
4483 for (auto& s : shards) {
4484 std::lock_guard l(s->shard_lock);
4485 for (auto& j : s->pg_slots) {
4486 if (j.second->pg &&
4487 !j.second->pg->is_deleted()) {
4488 v->push_back(j.first);
4489 }
4490 }
4491 }
4492 }
4493
4494 void OSD::register_pg(PGRef pg)
4495 {
4496 spg_t pgid = pg->get_pgid();
4497 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4498 auto sdata = shards[shard_index];
4499 std::lock_guard l(sdata->shard_lock);
4500 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4501 ceph_assert(r.second);
4502 auto *slot = r.first->second.get();
4503 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4504 sdata->_attach_pg(slot, pg.get());
4505 }
4506
// Final step of PG deletion: detach the PG from its shard slot, drop any
// primed split children, and adjust the primary/replica/stray PG gauges.
// Returns false (deletion must be retried later) if the slot is already
// gone or the slot is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      // a pending merge still needs this slot; don't finish the delete yet
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any children primed for a split of this (now deleted) pg
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_replica())
    service.logger->dec(l_osd_pg_replica);
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4541
4542 PGRef OSD::_lookup_pg(spg_t pgid)
4543 {
4544 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4545 auto sdata = shards[shard_index];
4546 std::lock_guard l(sdata->shard_lock);
4547 auto p = sdata->pg_slots.find(pgid);
4548 if (p == sdata->pg_slots.end()) {
4549 return nullptr;
4550 }
4551 return p->second->pg;
4552 }
4553
4554 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4555 {
4556 PGRef pg = _lookup_pg(pgid);
4557 if (!pg) {
4558 return nullptr;
4559 }
4560 pg->lock();
4561 if (!pg->is_deleted()) {
4562 return pg;
4563 }
4564 pg->unlock();
4565 return nullptr;
4566 }
4567
// Public wrapper around _lookup_lock_pg(): returns the PG locked, or
// nullptr if it does not exist or is deleted.  Caller must unlock.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4572
// Startup path: scan the object store's collections, clean up removed or
// legacy PG collections, instantiate a PG object for each surviving PG
// collection, load its state from disk, and register it with its shard.
// Requires osd_lock to be held.
void OSD::load_pgs()
{
  ceph_assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;

  // load the persisted pg_num history used for split/merge tracking
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pgs flagged for removal are deleted outright
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // build the PG against the osdmap it was last written with, falling
    // back to the current map when no epoch was recorded
    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617.  "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map.  Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(osdmap, pgid);
    }
    if (!pg) {
      // pool info could not be recovered (see _make_pg); drop the pg
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne())  {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4680
4681
4682 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4683 const PGCreateInfo *info)
4684 {
4685 spg_t pgid = info->pgid;
4686
4687 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4688 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4689 return nullptr;
4690 }
4691
4692 PG::RecoveryCtx rctx = create_context();
4693
4694 OSDMapRef startmap = get_map(info->epoch);
4695
4696 if (info->by_mon) {
4697 int64_t pool_id = pgid.pgid.pool();
4698 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4699 if (!pool) {
4700 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4701 return nullptr;
4702 }
4703 if (osdmap->require_osd_release >= CEPH_RELEASE_NAUTILUS &&
4704 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4705 // this ensures we do not process old creating messages after the
4706 // pool's initial pgs have been created (and pg are subsequently
4707 // allowed to split or merge).
4708 dout(20) << __func__ << " dropping " << pgid
4709 << "create, pool does not have CREATING flag set" << dendl;
4710 return nullptr;
4711 }
4712 }
4713
4714 int up_primary, acting_primary;
4715 vector<int> up, acting;
4716 startmap->pg_to_up_acting_osds(
4717 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4718
4719 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4720 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4721 store->get_type() != "bluestore") {
4722 clog->warn() << "pg " << pgid
4723 << " is at risk of silent data corruption: "
4724 << "the pool allows ec overwrites but is not stored in "
4725 << "bluestore, so deep scrubbing will not detect bitrot";
4726 }
4727 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4728 PG::_init(*rctx.transaction, pgid, pp);
4729
4730 int role = startmap->calc_pg_role(whoami, acting, acting.size());
4731 if (!pp->is_replicated() && role != pgid.shard) {
4732 role = -1;
4733 }
4734
4735 PGRef pg = _make_pg(startmap, pgid);
4736 pg->ch = store->create_new_collection(pg->coll);
4737
4738 {
4739 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4740 assert(NULL != shards[shard_index]);
4741 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4742 }
4743
4744 pg->lock(true);
4745
4746 // we are holding the shard lock
4747 ceph_assert(!pg->is_deleted());
4748
4749 pg->init(
4750 role,
4751 up,
4752 up_primary,
4753 acting,
4754 acting_primary,
4755 info->history,
4756 info->past_intervals,
4757 false,
4758 rctx.transaction);
4759
4760 if (pg->is_primary()) {
4761 Mutex::Locker locker(m_perf_queries_lock);
4762 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4763 }
4764
4765 pg->handle_initialize(&rctx);
4766 pg->handle_activate_map(&rctx);
4767
4768 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4769
4770 dout(10) << __func__ << " new pg " << *pg << dendl;
4771 return pg;
4772 }
4773
4774 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4775 spg_t pgid,
4776 bool is_mon_create)
4777 {
4778 const auto max_pgs_per_osd =
4779 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4780 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4781
4782 if (num_pgs < max_pgs_per_osd) {
4783 return false;
4784 }
4785
4786 std::lock_guard l(pending_creates_lock);
4787 if (is_mon_create) {
4788 pending_creates_from_mon++;
4789 } else {
4790 bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
4791 pending_creates_from_osd.emplace(pgid.pgid, is_primary);
4792 }
4793 dout(1) << __func__ << " withhold creation of pg " << pgid
4794 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4795 return true;
4796 }
4797
4798 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4799 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4800 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4801 static vector<int32_t> twiddle(const vector<int>& acting) {
4802 if (acting.size() > 1) {
4803 return {acting[0]};
4804 } else {
4805 vector<int32_t> twiddled(acting.begin(), acting.end());
4806 twiddled.push_back(-1);
4807 return twiddled;
4808 }
4809 }
4810
// Resume PG creations that were withheld by maybe_wait_for_max_pg().  Any
// spare capacity under the hard PG limit is first given to mon-initiated
// creates (by re-soliciting pg-create messages), then to osd-initiated
// creates (by twiddling pg_temp to force a re-peer).  Monitor
// subscriptions are renewed as needed so we keep receiving the maps and
// create messages required to make progress.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      // mon-driven creates consume the spare budget first
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // spend any remaining budget on osd-driven creates by forcing the
    // affected pgs to re-peer (see twiddle() above)
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = osdmap->get_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
4882
// Build the initial pg_history_t and PastIntervals for a PG created at
// epoch `created`: seed all "same_*_since" markers and scrub stamps from
// the creation point, then replay every osdmap from created+1 up to the
// current epoch, recording each new interval.
//
// @param pgid          PG being created
// @param created       epoch the PG (and pool) was created in
// @param created_stamp creation time; used to seed the scrub stamps
// @param h             [out] history to populate
// @param pi            [out] past intervals to populate
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  h->epoch_created = created;
  h->epoch_pool_created = created;
  h->same_interval_since = created;
  h->same_up_since = created;
  h->same_primary_since = created;
  h->last_scrub_stamp = created_stamp;
  h->last_deep_scrub_stamp = created_stamp;
  h->last_clean_scrub_stamp = created_stamp;

  // mapping as of the creation epoch; updated as we walk forward
  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
    // NOTE: this local deliberately shadows the OSD::osdmap member for the
    // rest of the loop body; it is the map at epoch e, not the current map.
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap,
      lastmap,
      pgid.pgid,
      &min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      // record a split if pg_num changed across this interval boundary
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
4964
4965 void OSD::_add_heartbeat_peer(int p)
4966 {
4967 if (p == whoami)
4968 return;
4969 HeartbeatInfo *hi;
4970
4971 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4972 if (i == heartbeat_peers.end()) {
4973 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4974 if (!cons.first)
4975 return;
4976 hi = &heartbeat_peers[p];
4977 hi->peer = p;
4978 RefCountedPtr s{new HeartbeatSession{p}, false};
4979 hi->hb_interval_start = ceph_clock_now();
4980 hi->con_back = cons.first.get();
4981 hi->con_back->set_priv(s);
4982 if (cons.second) {
4983 hi->con_front = cons.second.get();
4984 hi->con_front->set_priv(s);
4985 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4986 << " " << hi->con_back->get_peer_addr()
4987 << " " << hi->con_front->get_peer_addr()
4988 << dendl;
4989 } else {
4990 hi->con_front.reset(NULL);
4991 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4992 << " " << hi->con_back->get_peer_addr()
4993 << dendl;
4994 }
4995 } else {
4996 hi = &i->second;
4997 }
4998 hi->epoch = osdmap->get_epoch();
4999 }
5000
5001 void OSD::_remove_heartbeat_peer(int n)
5002 {
5003 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5004 ceph_assert(q != heartbeat_peers.end());
5005 dout(20) << " removing heartbeat peer osd." << n
5006 << " " << q->second.con_back->get_peer_addr()
5007 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5008 << dendl;
5009 q->second.con_back->mark_down();
5010 if (q->second.con_front) {
5011 q->second.con_front->mark_down();
5012 }
5013 heartbeat_peers.erase(q);
5014 }
5015
5016 void OSD::need_heartbeat_peer_update()
5017 {
5018 if (is_stopping())
5019 return;
5020 dout(20) << "need_heartbeat_peer_update" << dendl;
5021 heartbeat_set_peers_need_update();
5022 }
5023
// Recompute the heartbeat peer set if it has been flagged as needing an
// update (or force a resample once per heartbeat-grace period).  Builds
// the set from PG peers plus ring neighbors, tops it up to
// osd_heartbeat_min_peers, and trims down/extra peers.
// Caller must hold osd_lock; heartbeat_lock is taken internally for the
// actual peer-map mutation.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(osd_lock.is_locked());

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: force an initial peer computation
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	// (reset_heartbeat_peers takes heartbeat_lock itself; it is not
	// held yet at this point)
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	  if (osdmap->is_up(peer)) {
	    _add_heartbeat_peer(peer);
	  }
	});
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = osdmap->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = osdmap->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  osdmap->get_random_up_osds_by_subtree(
    whoami, subtree, min_down, want, &want);

  // the "want" peers are mandatory; also remember them in extras so the
  // trimming passes below can see them
  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!osdmap->is_up(p->first)) {
      int o = p->first;
      ++p;   // advance before erasing to keep the iterator valid
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < osdmap->get_epoch()) {
      // not refreshed this round; candidate for removal
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  // walk the ring of up osds starting at `next` until we hit the minimum
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = osdmap->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
}
5130
5131 void OSD::reset_heartbeat_peers(bool all)
5132 {
5133 ceph_assert(osd_lock.is_locked());
5134 dout(10) << "reset_heartbeat_peers" << dendl;
5135 utime_t stale = ceph_clock_now();
5136 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5137 std::lock_guard l(heartbeat_lock);
5138 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5139 HeartbeatInfo& hi = it->second;
5140 if (all || hi.is_stale(stale)) {
5141 hi.con_back->mark_down();
5142 if (hi.con_front) {
5143 hi.con_front->mark_down();
5144 }
5145 // stop sending failure_report to mon too
5146 failure_queue.erase(it->first);
5147 heartbeat_peers.erase(it++);
5148 } else {
5149 it++;
5150 }
5151 }
5152 }
5153
// Handle a heartbeat message from a peer OSD: answer PINGs, account
// PING_REPLYs against our outstanding ping history (updating per-peer
// latency stats), and react to YOU_DIED by fetching a newer osdmap.
// Consumes (puts) the message on every path.  heartbeat_lock is taken
// manually here, so every early return must Unlock first.
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    // not from our cluster; ignore
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop a run of incoming pings from
      // this peer to exercise failure detection
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
	           ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
			     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // if our own internal threads are wedged, don't ack; let peers
      // report us as failed
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
	break;
      }

      // reply, echoing the sender's stamp so it can match reply to ping
      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY, m->stamp,
				cct->_conf->osd_heartbeat_min_size);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->stamp,
				  cct->_conf->osd_heartbeat_min_size);
	m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	// look up the outstanding ping this reply corresponds to, keyed
	// by the stamp we sent (and the peer echoed back)
	auto acked = i->second.ping_history.find(m->stamp);
	if (acked != i->second.ping_history.end()) {
	  utime_t now = ceph_clock_now();
	  // replies still expected for this ping (one per connection)
	  int &unacknowledged = acked->second.second;
	  if (m->get_connection() == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (m->get_connection() == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

	    // fold this round-trip into the per-peer min/max/avg counters
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    ceph_assert(i->second.hb_interval_start != utime_t());
	    // NOTE(review): the branch below is unreachable after the
	    // assert above (same condition, negated); it looks like a
	    // leftover fallback for non-assert builds
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    // interval over which averages are accumulated; shortened
	    // only in testing builds
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // interval elapsed: snapshot avg/min/max, then reset the
	      // accumulators for the next interval
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interace ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first interval: fill the whole ring with this sample
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// ring buffer overwrite (hb_vector_size is presumably a
		// power of two, given the mask here — TODO confirm)
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish 1/5/15-interval rollups into osd_stat for mon/mgr
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		// walk backwards from the newest ring entry
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  // same rollup for the front network
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval not elapsed: just refresh the "last" samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this ping and everything older is now fully acknowledged
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer believes we are marked down; catch up on maps
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
5460
5461 void OSD::heartbeat_entry()
5462 {
5463 std::lock_guard l(heartbeat_lock);
5464 if (is_stopping())
5465 return;
5466 while (!heartbeat_stop) {
5467 heartbeat();
5468
5469 double wait;
5470 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5471 wait = (float)cct->_conf->osd_heartbeat_interval;
5472 } else {
5473 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5474 }
5475 utime_t w;
5476 w.set_from_double(wait);
5477 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5478 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5479 if (is_stopping())
5480 return;
5481 dout(30) << "heartbeat_entry woke up" << dendl;
5482 }
5483 }
5484
5485 void OSD::heartbeat_check()
5486 {
5487 ceph_assert(heartbeat_lock.is_locked());
5488 utime_t now = ceph_clock_now();
5489
5490 // check for incoming heartbeats (move me elsewhere?)
5491 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5492 p != heartbeat_peers.end();
5493 ++p) {
5494
5495 if (p->second.first_tx == utime_t()) {
5496 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5497 << " yet, skipping" << dendl;
5498 continue;
5499 }
5500
5501 dout(25) << "heartbeat_check osd." << p->first
5502 << " first_tx " << p->second.first_tx
5503 << " last_tx " << p->second.last_tx
5504 << " last_rx_back " << p->second.last_rx_back
5505 << " last_rx_front " << p->second.last_rx_front
5506 << dendl;
5507 if (p->second.is_unhealthy(now)) {
5508 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5509 if (p->second.last_rx_back == utime_t() ||
5510 p->second.last_rx_front == utime_t()) {
5511 derr << "heartbeat_check: no reply from "
5512 << p->second.con_front->get_peer_addr().get_sockaddr()
5513 << " osd." << p->first
5514 << " ever on either front or back, first ping sent "
5515 << p->second.first_tx
5516 << " (oldest deadline " << oldest_deadline << ")"
5517 << dendl;
5518 // fail
5519 failure_queue[p->first] = p->second.first_tx;
5520 } else {
5521 derr << "heartbeat_check: no reply from "
5522 << p->second.con_front->get_peer_addr().get_sockaddr()
5523 << " osd." << p->first << " since back " << p->second.last_rx_back
5524 << " front " << p->second.last_rx_front
5525 << " (oldest deadline " << oldest_deadline << ")"
5526 << dendl;
5527 // fail
5528 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5529 }
5530 }
5531 }
5532 }
5533
5534 void OSD::heartbeat()
5535 {
5536 ceph_assert(heartbeat_lock.is_locked_by_me());
5537 dout(30) << "heartbeat" << dendl;
5538
5539 // get CPU load avg
5540 double loadavgs[1];
5541 int hb_interval = cct->_conf->osd_heartbeat_interval;
5542 int n_samples = 86400;
5543 if (hb_interval > 1) {
5544 n_samples /= hb_interval;
5545 if (n_samples < 1)
5546 n_samples = 1;
5547 }
5548
5549 if (getloadavg(loadavgs, 1) == 1) {
5550 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5551 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5552 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5553 }
5554
5555 dout(30) << "heartbeat checking stats" << dendl;
5556
5557 // refresh peer list and osd stats
5558 vector<int> hb_peers;
5559 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5560 p != heartbeat_peers.end();
5561 ++p)
5562 hb_peers.push_back(p->first);
5563
5564 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5565 dout(5) << __func__ << " " << new_stat << dendl;
5566 ceph_assert(new_stat.statfs.total);
5567
5568 float pratio;
5569 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5570
5571 service.check_full_status(ratio, pratio);
5572
5573 utime_t now = ceph_clock_now();
5574 utime_t deadline = now;
5575 deadline += cct->_conf->osd_heartbeat_grace;
5576
5577 // send heartbeats
5578 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5579 i != heartbeat_peers.end();
5580 ++i) {
5581 int peer = i->first;
5582 i->second.last_tx = now;
5583 if (i->second.first_tx == utime_t())
5584 i->second.first_tx = now;
5585 i->second.ping_history[now] = make_pair(deadline,
5586 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5587 if (i->second.hb_interval_start == utime_t())
5588 i->second.hb_interval_start = now;
5589 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5590 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5591 service.get_osdmap_epoch(),
5592 MOSDPing::PING, now,
5593 cct->_conf->osd_heartbeat_min_size));
5594
5595 if (i->second.con_front)
5596 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5597 service.get_osdmap_epoch(),
5598 MOSDPing::PING, now,
5599 cct->_conf->osd_heartbeat_min_size));
5600 }
5601
5602 logger->set(l_osd_hb_to, heartbeat_peers.size());
5603
5604 // hmm.. am i all alone?
5605 dout(30) << "heartbeat lonely?" << dendl;
5606 if (heartbeat_peers.empty()) {
5607 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5608 last_mon_heartbeat = now;
5609 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5610 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5611 }
5612 }
5613
5614 dout(30) << "heartbeat done" << dendl;
5615 }
5616
// Messenger callback: a heartbeat connection was reset.  If it belongs to
// a tracked peer, tear down both of that peer's connections and try to
// reopen them; if no address is available anymore (osdmap raced ahead),
// drop the peer entirely.  Always returns true (event handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  // detach the session from the dead connection; `s` keeps it alive
  auto s = con->get_priv();
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto heartbeat_session = static_cast<HeartbeatSession*>(s.get());
    auto p = heartbeat_peers.find(heartbeat_session->peer);
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      // drop BOTH connections (the sibling is marked down explicitly;
      // `con` itself is already reset)
      if (con != p->second.con_back) {
	p->second.con_back->mark_down();
      }
      p->second.con_back.reset(NULL);
      if (p->second.con_front && con != p->second.con_front) {
	p->second.con_front->mark_down();
      }
      p->second.con_front.reset(NULL);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	// re-attach the same session to the replacement connections
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings were sent on the dead connections; forget them
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      // stale connection from a previously-removed or re-peered osd
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5661
5662
5663
5664 // =========================================
5665
// Periodic tick run with osd_lock held: refresh heartbeat peers, retry
// boot while waiting to become healthy, poke the mon for maps when
// booting, drain waiting ops, and reschedule itself.
// NOTE: the state predicates are deliberately re-evaluated between steps
// — start_boot() may change the OSD state before the next check.
void OSD::tick()
{
  ceph_assert(osd_lock.is_locked());
  dout(10) << "tick" << dendl;

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    start_boot();
  }

  if (is_waiting_for_healthy() || is_booting()) {
    std::lock_guard l(heartbeat_lock);
    utime_t now = ceph_clock_now();
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
      last_mon_heartbeat = now;
      dout(1) << __func__ << " checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  do_waiters();

  // reschedule the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
5693
// Periodic tick that runs WITHOUT osd_lock (only tick_timer_lock is
// held): refresh perf counters and store statfs, run heartbeat checks and
// mon reporting, schedule scrub/recovery housekeeping, send the beacon,
// and reschedule itself.
void OSD::tick_without_osd_lock()
{
  ceph_assert(tick_timer_lock.is_locked());
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    heartbeat_lock.Lock();
    heartbeat_check();
    heartbeat_lock.Unlock();

    map_lock.get_read();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.put_read();

    // if any shard is waiting on a map newer than what we have, ask for it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // sent outside min_last_epoch_clean_lock to avoid holding it while
      // messaging the mon
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // reschedule the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5769
5770 // Usage:
5771 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5772 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5773 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5774 // getomap <pool> [namespace/]<obj-name>
5775 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5776 // injectmdataerr [namespace/]<obj-name> [shardid]
5777 // injectdataerr [namespace/]<obj-name> [shardid]
5778 //
5779 // set_recovery_delay [utime]
5780 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5781 std::string_view command,
5782 const cmdmap_t& cmdmap, ostream &ss)
5783 {
5784 //Test support
5785 //Support changing the omap on a single osd by using the Admin Socket to
5786 //directly request the osd make a change.
5787 if (command == "setomapval" || command == "rmomapkey" ||
5788 command == "setomapheader" || command == "getomap" ||
5789 command == "truncobj" || command == "injectmdataerr" ||
5790 command == "injectdataerr"
5791 ) {
5792 pg_t rawpg;
5793 int64_t pool;
5794 OSDMapRef curmap = service->get_osdmap();
5795 int r = -1;
5796
5797 string poolstr;
5798
5799 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5800 pool = curmap->lookup_pg_pool_name(poolstr);
5801 //If we can't find it by name then maybe id specified
5802 if (pool < 0 && isdigit(poolstr[0]))
5803 pool = atoll(poolstr.c_str());
5804 if (pool < 0) {
5805 ss << "Invalid pool '" << poolstr << "''";
5806 return;
5807 }
5808
5809 string objname, nspace;
5810 cmd_getval(service->cct, cmdmap, "objname", objname);
5811 std::size_t found = objname.find_first_of('/');
5812 if (found != string::npos) {
5813 nspace = objname.substr(0, found);
5814 objname = objname.substr(found+1);
5815 }
5816 object_locator_t oloc(pool, nspace);
5817 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5818
5819 if (r < 0) {
5820 ss << "Invalid namespace/objname";
5821 return;
5822 }
5823
5824 int64_t shardid;
5825 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5826 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5827 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5828 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5829 if (curmap->pg_is_ec(rawpg)) {
5830 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5831 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5832 return;
5833 }
5834 }
5835
5836 ObjectStore::Transaction t;
5837
5838 if (command == "setomapval") {
5839 map<string, bufferlist> newattrs;
5840 bufferlist val;
5841 string key, valstr;
5842 cmd_getval(service->cct, cmdmap, "key", key);
5843 cmd_getval(service->cct, cmdmap, "val", valstr);
5844
5845 val.append(valstr);
5846 newattrs[key] = val;
5847 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5848 r = store->queue_transaction(service->meta_ch, std::move(t));
5849 if (r < 0)
5850 ss << "error=" << r;
5851 else
5852 ss << "ok";
5853 } else if (command == "rmomapkey") {
5854 string key;
5855 set<string> keys;
5856 cmd_getval(service->cct, cmdmap, "key", key);
5857
5858 keys.insert(key);
5859 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5860 r = store->queue_transaction(service->meta_ch, std::move(t));
5861 if (r < 0)
5862 ss << "error=" << r;
5863 else
5864 ss << "ok";
5865 } else if (command == "setomapheader") {
5866 bufferlist newheader;
5867 string headerstr;
5868
5869 cmd_getval(service->cct, cmdmap, "header", headerstr);
5870 newheader.append(headerstr);
5871 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5872 r = store->queue_transaction(service->meta_ch, std::move(t));
5873 if (r < 0)
5874 ss << "error=" << r;
5875 else
5876 ss << "ok";
5877 } else if (command == "getomap") {
5878 //Debug: Output entire omap
5879 bufferlist hdrbl;
5880 map<string, bufferlist> keyvals;
5881 auto ch = store->open_collection(coll_t(pgid));
5882 if (!ch) {
5883 ss << "unable to open collection for " << pgid;
5884 r = -ENOENT;
5885 } else {
5886 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
5887 if (r >= 0) {
5888 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5889 for (map<string, bufferlist>::iterator it = keyvals.begin();
5890 it != keyvals.end(); ++it)
5891 ss << " key=" << (*it).first << " val="
5892 << string((*it).second.c_str(), (*it).second.length());
5893 } else {
5894 ss << "error=" << r;
5895 }
5896 }
5897 } else if (command == "truncobj") {
5898 int64_t trunclen;
5899 cmd_getval(service->cct, cmdmap, "len", trunclen);
5900 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5901 r = store->queue_transaction(service->meta_ch, std::move(t));
5902 if (r < 0)
5903 ss << "error=" << r;
5904 else
5905 ss << "ok";
5906 } else if (command == "injectdataerr") {
5907 store->inject_data_error(gobj);
5908 ss << "ok";
5909 } else if (command == "injectmdataerr") {
5910 store->inject_mdata_error(gobj);
5911 ss << "ok";
5912 }
5913 return;
5914 }
5915 if (command == "set_recovery_delay") {
5916 int64_t delay;
5917 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5918 ostringstream oss;
5919 oss << delay;
5920 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
5921 oss.str().c_str());
5922 if (r != 0) {
5923 ss << "set_recovery_delay: error setting "
5924 << "osd_recovery_delay_start to '" << delay << "': error "
5925 << r;
5926 return;
5927 }
5928 service->cct->_conf.apply_changes(nullptr);
5929 ss << "set_recovery_delay: set osd_recovery_delay_start "
5930 << "to " << service->cct->_conf->osd_recovery_delay_start;
5931 return;
5932 }
5933 if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
5934 spg_t pgid;
5935 bool deep = (command == "trigger_deep_scrub");
5936 OSDMapRef curmap = service->get_osdmap();
5937
5938 string pgidstr;
5939
5940 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5941 if (!pgid.parse(pgidstr.c_str())) {
5942 ss << "Invalid pgid specified";
5943 return;
5944 }
5945
5946 int64_t time;
5947 cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);
5948
5949 PGRef pg = service->osd->_lookup_lock_pg(pgid);
5950 if (pg == nullptr) {
5951 ss << "Can't find pg " << pgid;
5952 return;
5953 }
5954
5955 if (pg->is_primary()) {
5956 pg->unreg_next_scrub();
5957 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5958 double pool_scrub_max_interval = 0;
5959 double scrub_max_interval;
5960 if (deep) {
5961 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
5962 scrub_max_interval = pool_scrub_max_interval > 0 ?
5963 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
5964 } else {
5965 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5966 scrub_max_interval = pool_scrub_max_interval > 0 ?
5967 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
5968 }
5969 // Instead of marking must_scrub force a schedule scrub
5970 utime_t stamp = ceph_clock_now();
5971 if (time == 0)
5972 stamp -= scrub_max_interval;
5973 else
5974 stamp -= (float)time;
5975 stamp -= 100.0; // push back last scrub more for good measure
5976 if (deep) {
5977 pg->set_last_deep_scrub_stamp(stamp);
5978 } else {
5979 pg->set_last_scrub_stamp(stamp);
5980 }
5981 pg->reg_next_scrub();
5982 pg->publish_stats_to_osd();
5983 ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
5984 } else {
5985 ss << "Not primary";
5986 }
5987 pg->unlock();
5988 return;
5989 }
5990 if (command == "injectfull") {
5991 int64_t count;
5992 string type;
5993 OSDService::s_names state;
5994 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5995 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5996 if (type == "none" || count == 0) {
5997 type = "none";
5998 count = 0;
5999 }
6000 state = service->get_full_state(type);
6001 if (state == OSDService::s_names::INVALID) {
6002 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6003 return;
6004 }
6005 service->set_injectfull(state, count);
6006 return;
6007 }
6008 ss << "Internal error - command=" << command;
6009 }
6010
6011 // =========================================
6012
// Messenger callback: an outgoing connection has been established.
// We only act on the mon connection: a new mon session means the mon
// has forgotten everything we previously reported, so depending on our
// boot state we either (re)start booting or resend all of our
// mon-visible state (fullness, pg_temp, failures, beacon, ...).
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // active (or later): resend all periodic/mon-tracked state under
      // map_lock (read) + mon_report_lock
      map_lock.get_read();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.put_read();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6056
6057 void OSD::ms_handle_fast_connect(Connection *con)
6058 {
6059 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6060 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6061 auto priv = con->get_priv();
6062 auto s = static_cast<Session*>(priv.get());
6063 if (!s) {
6064 s = new Session{cct, con};
6065 con->set_priv(RefCountedPtr{s, false});
6066 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6067 << " addr=" << s->con->get_peer_addr() << dendl;
6068 // we don't connect to clients
6069 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6070 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6071 }
6072 }
6073 }
6074
// Fast-path messenger callback for incoming connections.  Mirrors
// ms_handle_fast_connect(): for non-mon/mgr peers, make sure a Session
// is attached to the Connection.
void OSD::ms_handle_fast_accept(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    auto priv = con->get_priv();
    auto s = static_cast<Session*>(priv.get());
    if (!s) {
      s = new Session{cct, con};
      con->set_priv(RefCountedPtr{s, false});
      dout(10) << "new session (incoming)" << s << " con=" << con
          << " addr=" << con->get_peer_addr()
          << " must have raced with connect" << dendl;
      // only OSD peers are expected to reach this path
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6092
// Messenger callback: the remote end reset the connection.  Tear down
// the attached Session (watch state, backoffs) so it does not linger
// on a dead connection.  Returns true if a session was cleaned up.
bool OSD::ms_handle_reset(Connection *con)
{
  auto s = con->get_priv();
  auto session = static_cast<Session*>(s.get());
  dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below.  this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(SessionRef{session});
  return true;
}
6109
// Messenger callback: a connection attempt was actively refused.  When
// osd_fast_fail_on_connection_refused is enabled and the peer is an
// up OSD in our map, report it to the mon as failed immediately rather
// than waiting out the full heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto priv = con->get_priv();
  auto session = static_cast<Session*>(priv.get());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(
	  new MOSDFailure(
	    monc->get_fsid(),
	    id,
	    osdmap->get_addrs(id),
	    cct->_conf->osd_heartbeat_grace + 1,
	    osdmap->get_epoch(),
	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
	    ));
      }
    }
  }
  return true;
}
6143
6144 struct C_OSD_GetVersion : public Context {
6145 OSD *osd;
6146 uint64_t oldest, newest;
6147 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6148 void finish(int r) override {
6149 if (r >= 0)
6150 osd->_got_mon_epochs(oldest, newest);
6151 }
6152 };
6153
// Begin the boot sequence.  If we look healthy, enter PREBOOT and ask
// the mon for its osdmap epoch range; the C_OSD_GetVersion completion
// then drives _got_mon_epochs() -> _preboot().  Otherwise wait for
// health and retry later.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // completion fills in c->oldest / c->newest before finish() runs
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6172
6173 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6174 {
6175 std::lock_guard l(osd_lock);
6176 if (is_preboot()) {
6177 _preboot(oldest, newest);
6178 }
6179 }
6180
// Second stage of boot.  Given the mon's osdmap epoch range
// [oldest, newest], decide whether we can mark ourselves up: either
// queue the boot message (after PG work drains, via boot_finisher) or
// subscribe for newer maps and wait to be called again.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new FunctionContext(
	[this](int r) {
	  std::lock_guard l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while blocking on the shards
	    osd_lock.Unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(osdmap->get_epoch());
	    }
	    osd_lock.Lock();
	  }
	  // re-check: state may have changed while the lock was dropped
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6246
// Report our current fullness state to the mon if it is out of date.
// The most severe applicable state wins: full > backfillfull >
// nearfull > none.
void OSD::send_full_update()
{
  if (!service.need_fullness_update())
    return;
  unsigned state = 0;
  if (service.is_full()) {
    state = CEPH_OSD_FULL;
  } else if (service.is_backfillfull()) {
    state = CEPH_OSD_BACKFILLFULL;
  } else if (service.is_nearfull()) {
    state = CEPH_OSD_NEARFULL;
  }
  set<string> s;
  OSDMap::calc_state_set(state, s);
  dout(10) << __func__ << " want state " << s << dendl;
  monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
}
6264
// Enter the WAITING_FOR_HEALTHY state: reset the heartbeat resample
// timestamp and keep following osdmap updates while we wait.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
6274
// Decide whether we are healthy enough to boot.  Requires the internal
// heartbeat map to be healthy; additionally, if we were previously
// marked down more than once within osd_max_markdown_period, require
// at least osd_heartbeat_min_healthy_ratio of our heartbeat peers to
// be reachable.
bool OSD::_is_healthy()
{
  if (!cct->get_heartbeat_map()->is_healthy()) {
    dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
    return false;
  }

  if (is_waiting_for_healthy()) {
    utime_t now = ceph_clock_now();
    // drop markdown events older than the grace window
    utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
    while (!osd_markdown_log.empty() &&
           osd_markdown_log.front() + grace < now)
      osd_markdown_log.pop_front();
    if (osd_markdown_log.size() <= 1) {
      dout(5) << __func__ << " first time marked as down,"
              << " try reboot unconditionally" << dendl;
      return true;
    }
    std::lock_guard l(heartbeat_lock);
    // count reachable heartbeat peers
    int num = 0, up = 0;
    for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
	 p != heartbeat_peers.end();
	 ++p) {
      if (p->second.is_healthy(now))
	++up;
      ++num;
    }
    if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
	      << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
      return false;
    }
  }

  return true;
}
6311
// Send the MOSDBoot message to the mon.  First resolve any
// still-unknown bind addresses (cluster from client, heartbeat
// back/front from cluster/client respectively), attach sessions to the
// loopback connections, refresh NUMA affinity metadata, then build and
// send the boot message and transition to BOOTING.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  // fill in unknown cluster addrs from the client addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // make sure the loopback connection has a session attached
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6374
// Populate *pm with the OSD's metadata key/value pairs: paths,
// addresses, objectstore details, system info, network interfaces,
// NUMA topology, and backing device ids.  Sent to the mon with the
// boot message.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // network interfaces backing the front/back addresses
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single numa node when all ifaces resolve to one node
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  // backing devices and their unique ids (dev=id,dev=id,...)
  set<string> devnames;
  store->get_devices(&devnames);
  (*pm)["devices"] = stringify(devnames);
  string devids;
  for (auto& dev : devnames) {
    string err;
    string id = get_device_id(dev, &err);
    if (id.size()) {
      if (!devids.empty()) {
	devids += ",";
      }
      devids += dev + "=" + id;
    } else {
      dout(10) << __func__ << " no unique device id for " << dev << ": "
	       << err << dendl;
    }
  }
  (*pm)["device_ids"] = devids;

  dout(10) << __func__ << " " << *pm << dendl;
}
6463
// Record that we want the mon to bump our up_thru to at least `want`;
// if this raises the wanted value, trigger send_alive() immediately.
void OSD::queue_want_up_thru(epoch_t want)
{
  map_lock.get_read();
  epoch_t cur = osdmap->get_up_thru(whoami);
  std::lock_guard l(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
	     << ", currently " << cur
	     << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
	     << ", currently " << cur
	     << dendl;
  }
  map_lock.put_read();
}
6482
6483 void OSD::send_alive()
6484 {
6485 ceph_assert(mon_report_lock.is_locked());
6486 if (!osdmap->exists(whoami))
6487 return;
6488 epoch_t up_thru = osdmap->get_up_thru(whoami);
6489 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6490 if (up_thru_wanted > up_thru) {
6491 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6492 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6493 }
6494 }
6495
// Ask the mon for full (non-incremental) osdmaps in [first, last],
// merging with any outstanding request: duplicates are dropped and
// overlapping requests only fetch the missing tail.
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(osd_lock.is_locked());
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6521
// Note receipt of full map epoch e and advance the outstanding
// full-map request window [requested_full_first, requested_full_last];
// the window resets to 0..0 once fully satisfied.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(osd_lock.is_locked());
  if (requested_full_first == 0) {
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale map from before our request window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // still waiting for (e, requested_full_last]
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
6549
6550 void OSD::requeue_failures()
6551 {
6552 std::lock_guard l(heartbeat_lock);
6553 unsigned old_queue = failure_queue.size();
6554 unsigned old_pending = failure_pending.size();
6555 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6556 failure_queue[p->first] = p->second.first;
6557 failure_pending.erase(p++);
6558 }
6559 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6560 << failure_queue.size() << dendl;
6561 }
6562
// Drain failure_queue, reporting each failed peer OSD to the mon once
// and recording it in failure_pending so it can later be requeued or
// canceled (send_still_alive).  Caller must hold map_lock and
// mon_report_lock.
void OSD::send_failures()
{
  ceph_assert(map_lock.is_locked());
  ceph_assert(mon_report_lock.is_locked());
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // seconds since we first noticed the failure
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      // remember failure time and addrs for requeue/cancel
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6586
6587 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6588 {
6589 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6590 MOSDFailure::FLAG_ALIVE);
6591 monc->send_mon_message(m);
6592 }
6593
6594 void OSD::cancel_pending_failures()
6595 {
6596 std::lock_guard l(heartbeat_lock);
6597 auto it = failure_pending.begin();
6598 while (it != failure_pending.end()) {
6599 dout(10) << __func__ << " canceling in-flight failure report for osd."
6600 << it->first << dendl;
6601 send_still_alive(osdmap->get_epoch(), it->first, it->second.second);
6602 failure_pending.erase(it++);
6603 }
6604 }
6605
// Send an MOSDBeacon (osdmap epoch, min_last_epoch_clean and its pgs)
// to the mon, provided the monmap is initialized and the mons support
// the LUMINOUS feature.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot the clean-epoch data under its lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6627
6628 void OSD::handle_command(MMonCommand *m)
6629 {
6630 if (!require_mon_peer(m)) {
6631 m->put();
6632 return;
6633 }
6634
6635 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6636 command_wq.queue(c);
6637 m->put();
6638 }
6639
// Handle a direct MCommand from a client connection.  Requires an
// attached Session whose caps allow everything; commands from mons are
// rejected here (mons use MMonCommand instead).  Accepted commands are
// queued on the command work queue; rejections get an -EPERM reply.
void OSD::handle_command(MCommand *m)
{
  ConnectionRef con = m->get_connection();
  auto priv = con->get_priv();
  auto session = static_cast<Session *>(priv.get());
  if (!session) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  OSDCap& caps = session->caps;
  priv.reset();

  if (!caps.allow_all() || m->get_source().is_mon()) {
    con->send_message(new MCommandReply(m, -EPERM));
    m->put();
    return;
  }

  Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
  command_wq.queue(c);

  m->put();
}
6665
// Descriptor table for the commands the OSD accepts (tell/pg/admin
// paths).  Each entry is (parse signature, help text, module,
// required permission); consumed by get_command_descriptions and
// command dispatch.
struct OSDCommand {
  string cmdstring;
  string helpstring;
  string module;
  string perm;
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm) \
  {parsesig, helptext, module, perm},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw")
COMMAND("list_unfound " \
	"name=offset,type=CephString,req=false",
	"list unfound objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r")
COMMAND("perf histogram dump "
	"name=logger,type=CephString,req=false "
	"name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r")

// tell <osd.n> commands.  Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw")
COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("config get " \
	"name=key,type=CephString",
	"Get a configuration option at runtime",
	"osd", "r")
COMMAND("config unset " \
	"name=key,type=CephString",
	"Unset a configuration option at runtime (not persistent)",
	"osd", "rw")
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
	"(default count=1G default size=4MB). Results in log.",
	"osd", "rw")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings="\
	"dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
	"name=value,type=CephString,req=false",
	"show heap usage info (available only if compiled with tcmalloc)",
	"osd", "rw")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw")
COMMAND("smart name=devid,type=CephString,req=False",
	"runs smartctl on this osd devices. ",
	"osd", "rw")
COMMAND("cache drop",
	"Drop all OSD caches",
	"osd", "rwx")
COMMAND("cache status",
	"Get OSD caches statistics",
	"osd", "r")
COMMAND("send_beacon",
	"Send OSD beacon to mon immediately",
	"osd", "r")
};
6785
// Parse the JSON command vector into a cmdmap and run it through
// _do_command(); unless the command returned -EAGAIN (in which case no
// reply is sent here), send an MCommandReply with the result string
// and output data back on `con`.
void OSD::do_command(
  Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  int r = 0;
  stringstream ss, ds;
  bufferlist odata;
  cmdmap_t cmdmap;
  if (cmd.empty()) {
    ss << "no command given";
    goto out;
  }
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  try {
    r = _do_command(con, cmdmap, tid, data, odata, ss, ds);
  } catch (const bad_cmd_get& e) {
    // malformed or missing command arguments
    r = -EINVAL;
    ss << e.what();
  }
  if (r == -EAGAIN) {
    return;
  }
 out:
  string rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6825
6826 namespace {
6827 class unlock_guard {
6828 Mutex& m;
6829 public:
6830 explicit unlock_guard(Mutex& mutex)
6831 : m(mutex)
6832 {
6833 m.unlock();
6834 }
6835 unlock_guard(unlock_guard&) = delete;
6836 ~unlock_guard() {
6837 m.lock();
6838 }
6839 };
6840 }
6841
// Execute one parsed admin/tell command against this OSD.
//
// Human-readable status is written to 'ss', bulk output to 'ds' (appended
// to 'odata' by the caller).  Returns 0 on success, a negative errno on
// failure, or -EAGAIN when the reply will be sent asynchronously (e.g. by
// a PG).  Called with osd_lock held; branches that mutate configuration
// drop it temporarily via unlock_guard.
int OSD::_do_command(
  Connection *con, cmdmap_t& cmdmap, ceph_tid_t tid, bufferlist& data,
  bufferlist& odata, stringstream& ss, stringstream& ds)
{
  int r = 0;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the static osd_commands table as JSON for the client
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[std::size(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, con->get_features(),
                           secname.str(), cp->cmdstring, cp->helpstring,
			   cp->module, cp->perm, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // re-join the argument vector into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while mutating config; observers may call back into us
    unlock_guard unlock{osd_lock};
    r = cct->_conf.injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val(key, val, &ss);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
  }
  else if (prefix == "config get") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    std::string val;
    r = cct->_conf.get_val(key, &val);
    if (r == 0) {
      ds << val;
    }
  }
  else if (prefix == "config unset") {
    std::string key;
    cmd_getval(cct, cmdmap, "key", key);
    unlock_guard unlock{osd_lock};
    r = cct->_conf.rm_val(key);
    if (r == 0) {
      cct->_conf.apply_changes(nullptr);
    }
    if (r == -ENOENT) {
      r = 0;  // make command idempotent
    }
  }
  else if (prefix == "cluster_log") {
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    // re-join the message words into one string
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_unfound"
    ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PGRef pg;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  try {
	    r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  } catch (const bad_cmd_get& e) {
	    pg->unlock();
	    ss << e.what();
	    return -EINVAL;
	  }
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return -EAGAIN;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return -EAGAIN;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
         << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
	bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
	   << " value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
	cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_large_size_max_throughput'"
	   << " with a higher value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    }

    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create the fixed set of target objects so the timed writes
      // below land on existing extents
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(service.meta_ch, std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    // wait for any previously queued transactions to commit before timing
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// random object / random aligned offset within it
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	// one fresh object per block
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    // wait for all timed writes to commit
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!service.meta_ch->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;
    double iops = rate / bsize;
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
	 << " in blocks of " << byte_u_t(bsize) << " in "
	 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
	 << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    mgrc.send_pgstats();
    ds << service.get_osd_stat_seq() << "\n";
  }

  else if (prefix == "heap") {
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    if (!f) {
      f.reset(new JSONFormatter(true));
    }
    f->open_array_section("pgs");
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      string s = stringify(pg->pg_id);
      f->open_array_section(s.c_str());
      pg->lock();
      pg->dump_missing(f.get());
      pg->unlock();
      f->close_section();
    }
    f->close_section();
    f->flush(ds);
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    // config mutation: drop osd_lock for the duration
    unlock_guard unlock{osd_lock};
    r = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf.apply_changes(nullptr);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    // only produces output when a formatter was requested
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
	f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    double duration = std::chrono::duration<double>(end-start).count();
    dout(1) << "finished manual compaction in "
	    << duration
	    << " seconds" << dendl;
    ss << "compacted omap in " << duration << " seconds";
  }

  else if (prefix == "smart") {
    string devid;
    cmd_getval(cct, cmdmap, "devid", devid);
    probe_smart(devid, ds);
  }

  else if (prefix == "cache drop") {
    dout(20) << "clearing all caches" << dendl;
    // Clear the objectstore's cache - onode and buffer for Bluestore,
    // system's pagecache for Filestore
    r = store->flush_cache(&ss);
    if (r < 0) {
      ds << "Error flushing objectstore cache: " << cpp_strerror(r);
      goto out;
    }
    // Clear the objectcontext cache (per PG)
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      pg->clear_cache();
    }
  }

  else if (prefix == "cache status") {
    int obj_ctx_count = 0;
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg: pgs) {
      obj_ctx_count += pg->get_cache_obj_count();
    }
    if (f) {
      f->open_object_section("cache_status");
      f->dump_int("object_ctx", obj_ctx_count);
      store->dump_cache_stats(f.get());
      f->close_section();
      f->flush(ds);
    } else {
      ds << "object_ctx: " << obj_ctx_count;
      store->dump_cache_stats(ds);
    }
  }
  else if (prefix == "send_beacon") {
    if (is_active()) {
      send_beacon(ceph::coarse_mono_clock::now());
    }
  } else {
    ss << "unrecognized command '" << prefix << "'";
    r = -EINVAL;
  }

 out:
  return r;
}
7321
7322 void OSD::probe_smart(const string& only_devid, ostream& ss)
7323 {
7324 set<string> devnames;
7325 store->get_devices(&devnames);
7326 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7327 "osd_smart_report_timeout");
7328
7329 // == typedef std::map<std::string, mValue> mObject;
7330 json_spirit::mObject json_map;
7331
7332 for (auto dev : devnames) {
7333 // smartctl works only on physical devices; filter out any logical device
7334 if (dev.find("dm-") == 0) {
7335 continue;
7336 }
7337
7338 string err;
7339 string devid = get_device_id(dev, &err);
7340 if (devid.size() == 0) {
7341 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7342 << err << "), skipping" << dendl;
7343 continue;
7344 }
7345 if (only_devid.size() && devid != only_devid) {
7346 continue;
7347 }
7348
7349 json_spirit::mValue smart_json;
7350 if (block_device_get_metrics(dev, smart_timeout,
7351 &smart_json)) {
7352 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7353 continue;
7354 }
7355 json_map[devid] = smart_json;
7356 }
7357 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7358 }
7359
7360 bool OSD::heartbeat_dispatch(Message *m)
7361 {
7362 dout(30) << "heartbeat_dispatch " << m << dendl;
7363 switch (m->get_type()) {
7364
7365 case CEPH_MSG_PING:
7366 dout(10) << "ping from " << m->get_source_inst() << dendl;
7367 m->put();
7368 break;
7369
7370 case MSG_OSD_PING:
7371 handle_osd_ping(static_cast<MOSDPing*>(m));
7372 break;
7373
7374 default:
7375 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7376 m->put();
7377 }
7378
7379 return true;
7380 }
7381
7382 bool OSD::ms_dispatch(Message *m)
7383 {
7384 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7385 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7386 service.got_stop_ack();
7387 m->put();
7388 return true;
7389 }
7390
7391 // lock!
7392
7393 osd_lock.Lock();
7394 if (is_stopping()) {
7395 osd_lock.Unlock();
7396 m->put();
7397 return true;
7398 }
7399
7400 do_waiters();
7401 _dispatch(m);
7402
7403 osd_lock.Unlock();
7404
7405 return true;
7406 }
7407
7408 void OSD::maybe_share_map(
7409 Session *session,
7410 OpRequestRef op,
7411 OSDMapRef osdmap)
7412 {
7413 if (!op->check_send_map) {
7414 return;
7415 }
7416 epoch_t last_sent_epoch = 0;
7417
7418 session->sent_epoch_lock.lock();
7419 last_sent_epoch = session->last_sent_epoch;
7420 session->sent_epoch_lock.unlock();
7421
7422 // assume the peer has the newer of the op's sent_epoch and what
7423 // we think we sent them.
7424 epoch_t from = std::max(last_sent_epoch, op->sent_epoch);
7425
7426 const Message *m = op->get_req();
7427 service.share_map(
7428 m->get_source(),
7429 m->get_connection().get(),
7430 from,
7431 osdmap,
7432 session ? &last_sent_epoch : NULL);
7433
7434 session->sent_epoch_lock.lock();
7435 if (session->last_sent_epoch < last_sent_epoch) {
7436 session->last_sent_epoch = last_sent_epoch;
7437 }
7438 session->sent_epoch_lock.unlock();
7439
7440 op->check_send_map = false;
7441 }
7442
// Drain ops a session queued while waiting for a newer osdmap.
// Caller must hold session->session_dispatch_lock.  Ops whose min epoch is
// still ahead of 'osdmap' (and everything after them) stay queued; the
// session is then (re)registered to be woken when a newer map arrives.
void OSD::dispatch_session_waiting(SessionRef session, OSDMapRef osdmap)
{
  ceph_assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // stop at the first op that needs a newer map so per-session
      // delivery order is preserved
      break;
    }
    session->waiting_on_map.erase(i++);
    // release the ref taken (op->get()) when the op was queued in
    // ms_fast_dispatch; 'op' above still holds its own ref
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; map it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// we do not host a primary shard for this pg; drop the op
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7478
// Fast (no osd_lock) dispatch path for incoming messages.
// Per-type handlers below either consume the message themselves or it is
// wrapped in an OpRequest (which then owns the reference) and queued.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the session's waiting_on_map list; released in
      // dispatch_session_waiting when the op is dequeued
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7573
7574 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7575 {
7576 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7577
7578 if (is_stopping()) {
7579 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7580 return false;
7581 }
7582
7583 if (dest_type == CEPH_ENTITY_TYPE_MON)
7584 return true;
7585
7586 *authorizer = monc->build_authorizer(dest_type);
7587 return *authorizer != NULL;
7588 }
7589
// Expose the monitor client's rotating service keys so incoming (auth v1)
// authorizers can be verified.
KeyStore *OSD::ms_get_auth1_authorizer_keystore()
{
  return monc->rotating_secrets.get();
}
7594
// Messenger callback invoked once a connection has authenticated.
// Ensures a Session is attached to the connection and parses the peer's
// caps into it.  Returns 1 when caps parsed successfully, 0 when the peer
// supplied no caps (or allow_all with no cap string), -EPERM when the caps
// failed to decode or parse.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto priv = con->get_priv();
  Session *s = static_cast<Session*>(priv.get());
  if (!s) {
    // first time we see this connection: create and attach a session
    s = new Session(cct, con);
    con->set_priv(RefCountedPtr{s, false});
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
	     << " entity " << s->entity_name
	     << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all)
    s->caps.set_allow_all();

  if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string blob; decode then parse
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
	       << " failed to decode caps string" << dendl;
      ret = -EPERM;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
	dout(10) << __func__ << " session " << s
		 << " " << s->entity_name
		 << " has caps " << s->caps << " '" << str << "'" << dendl;
	ret = 1;
      } else {
	dout(10) << __func__ << " session " << s << " " << s->entity_name
		 << " failed to parse caps '" << str << "'" << dendl;
	ret = -EPERM;
      }
    }
  }
  return ret;
}
7644
7645 void OSD::do_waiters()
7646 {
7647 ceph_assert(osd_lock.is_locked());
7648
7649 dout(10) << "do_waiters -- start" << dendl;
7650 while (!finished.empty()) {
7651 OpRequestRef next = finished.front();
7652 finished.pop_front();
7653 dispatch_op(next);
7654 }
7655 dout(10) << "do_waiters -- finish" << dendl;
7656 }
7657
7658 void OSD::dispatch_op(OpRequestRef op)
7659 {
7660 switch (op->get_req()->get_type()) {
7661
7662 case MSG_OSD_PG_CREATE:
7663 handle_pg_create(op);
7664 break;
7665 }
7666 }
7667
// Slow-path message dispatch; caller (ms_dispatch) holds osd_lock.
// Each handled type consumes the message via its handler; unhandled types
// fall through and are dropped by the caller's reference accounting.
void OSD::_dispatch(Message *m)
{
  ceph_assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest, which takes over the message reference
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map?  starting up?
      if (!osdmap) {
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        // park the op until the first map arrives
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7711
7712 // remove me post-nautilus
7713 void OSD::handle_scrub(MOSDScrub *m)
7714 {
7715 dout(10) << "handle_scrub " << *m << dendl;
7716 if (!require_mon_or_mgr_peer(m)) {
7717 m->put();
7718 return;
7719 }
7720 if (m->fsid != monc->get_fsid()) {
7721 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7722 << dendl;
7723 m->put();
7724 return;
7725 }
7726
7727 vector<spg_t> spgs;
7728 _get_pgids(&spgs);
7729
7730 if (!m->scrub_pgs.empty()) {
7731 vector<spg_t> v;
7732 for (auto pgid : m->scrub_pgs) {
7733 spg_t pcand;
7734 if (osdmap->get_primary_shard(pgid, &pcand) &&
7735 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7736 v.push_back(pcand);
7737 }
7738 }
7739 spgs.swap(v);
7740 }
7741
7742 for (auto pgid : spgs) {
7743 enqueue_peering_evt(
7744 pgid,
7745 PGPeeringEventRef(
7746 std::make_shared<PGPeeringEvent>(
7747 get_osdmap_epoch(),
7748 get_osdmap_epoch(),
7749 PG::RequestScrub(m->deep, m->repair))));
7750 }
7751
7752 m->put();
7753 }
7754
7755 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7756 {
7757 dout(10) << __func__ << " " << *m << dendl;
7758 if (!require_mon_or_mgr_peer(m)) {
7759 m->put();
7760 return;
7761 }
7762 if (m->fsid != monc->get_fsid()) {
7763 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7764 << dendl;
7765 m->put();
7766 return;
7767 }
7768 for (auto pgid : m->scrub_pgs) {
7769 enqueue_peering_evt(
7770 pgid,
7771 PGPeeringEventRef(
7772 std::make_shared<PGPeeringEvent>(
7773 m->epoch,
7774 m->epoch,
7775 PG::RequestScrub(m->deep, m->repair))));
7776 }
7777 m->put();
7778 }
7779
7780 bool OSD::scrub_random_backoff()
7781 {
7782 bool coin_flip = (rand() / (double)RAND_MAX >=
7783 cct->_conf->osd_scrub_backoff_ratio);
7784 if (!coin_flip) {
7785 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7786 return true;
7787 }
7788 return false;
7789 }
7790
// Construct a scheduled scrub entry for pg 'pg'.
// 'timestamp' is the last scrub stamp; unless 'must' is set, the next
// scheduled time is pushed out by the (pool- or global-) min interval plus
// a random fraction of it, and the deadline by the max interval.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    // pool-level intervals override the global defaults when positive
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    // earliest time: min interval plus a randomized fraction of it
    sched_time += scrub_min_interval;
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (scrub_max_interval == 0) {
      // max interval of 0 means "no deadline"
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7819
7820 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7821 if (sched_time < rhs.sched_time)
7822 return true;
7823 if (sched_time > rhs.sched_time)
7824 return false;
7825 return pgid < rhs.pgid;
7826 }
7827
7828 bool OSD::scrub_time_permit(utime_t now)
7829 {
7830 struct tm bdt;
7831 time_t tt = now.sec();
7832 localtime_r(&tt, &bdt);
7833
7834 bool day_permit = false;
7835 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7836 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7837 day_permit = true;
7838 }
7839 } else {
7840 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7841 day_permit = true;
7842 }
7843 }
7844
7845 if (!day_permit) {
7846 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7847 << " - " << cct->_conf->osd_scrub_end_week_day
7848 << " now " << bdt.tm_wday << " = no" << dendl;
7849 return false;
7850 }
7851
7852 bool time_permit = false;
7853 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7854 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7855 time_permit = true;
7856 }
7857 } else {
7858 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7859 time_permit = true;
7860 }
7861 }
7862 if (!time_permit) {
7863 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7864 << " - " << cct->_conf->osd_scrub_end_hour
7865 << " now " << bdt.tm_hour << " = no" << dendl;
7866 } else {
7867 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7868 << " - " << cct->_conf->osd_scrub_end_hour
7869 << " now " << bdt.tm_hour << " = yes" << dendl;
7870 }
7871 return time_permit;
7872 }
7873
7874 bool OSD::scrub_load_below_threshold()
7875 {
7876 double loadavgs[3];
7877 if (getloadavg(loadavgs, 3) != 3) {
7878 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7879 return false;
7880 }
7881
7882 // allow scrub if below configured threshold
7883 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7884 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7885 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7886 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7887 << " < max " << cct->_conf->osd_scrub_load_threshold
7888 << " = yes" << dendl;
7889 return true;
7890 }
7891
7892 // allow scrub if below daily avg and currently decreasing
7893 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7894 dout(20) << __func__ << " loadavg " << loadavgs[0]
7895 << " < daily_loadavg " << daily_loadavg
7896 << " and < 15m avg " << loadavgs[2]
7897 << " = yes" << dendl;
7898 return true;
7899 }
7900
7901 dout(20) << __func__ << " loadavg " << loadavgs[0]
7902 << " >= max " << cct->_conf->osd_scrub_load_threshold
7903 << " and ( >= daily_loadavg " << daily_loadavg
7904 << " or >= 15m avg " << loadavgs[2]
7905 << ") = no" << dendl;
7906 return false;
7907 }
7908
// Walk the scrub-job queue (ordered by scheduled time) and try to start the
// first eligible scrub.  Respects recovery state, the configured scrub time
// window and load threshold; jobs past their deadline bypass the time/load
// restriction.  Stops at the first job scheduled in the future, or when a
// scrub is successfully kicked off.
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    // during recovery we may still allow explicitly requested repairs,
    // depending on config; otherwise skip scheduling entirely
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
               << dendl;
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
        // save ourselves some effort
        // (jobs are ordered by sched_time, so everything after this one is
        // also in the future -- stop scanning)
        dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
                 << " > " << now << dendl;
        break;
      }

      // unless the job is already past its deadline, honor the configured
      // scrub time window and load threshold; skip just this job otherwise
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
        dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
                 << (!time_permit ? "time not permit" : "high load") << dendl;
        continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
        continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
        continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
        pg->unlock();
        dout(10) << __func__ << " skip " << scrub.pgid
                 << " because repairing is not explicitly requested on it"
                 << dendl;
        continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
        break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
               << (pg->get_must_scrub() ? ", explicitly requested" :
                   (load_is_low ? ", load_is_low" : " deadline < now"))
               << dendl;
      // sched_scrub() returning true means the scrub was started; we are
      // done for this pass
      if (pg->sched_scrub()) {
        pg->unlock();
        break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7987
7988 void OSD::resched_all_scrubs()
7989 {
7990 dout(10) << __func__ << ": start" << dendl;
7991 OSDService::ScrubJob scrub;
7992 if (service.first_scrub_stamp(&scrub)) {
7993 do {
7994 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7995
7996 PGRef pg = _lookup_lock_pg(scrub.pgid);
7997 if (!pg)
7998 continue;
7999 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
8000 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
8001 pg->on_info_history_change();
8002 }
8003 pg->unlock();
8004 } while (service.next_scrub_stamp(scrub, &scrub));
8005 }
8006 dout(10) << __func__ << ": done" << dendl;
8007 }
8008
// Build an MPGStats message containing this OSD's current stats plus the
// stats of every PG for which we are primary, and (when the objectstore
// supports it) per-pool statfs data.  Also recomputes the
// min_last_epoch_clean bookkeeping as a side effect.  Caller owns the
// returned message.
MPGStats* OSD::collect_pg_stats()
{
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called. This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  RWLock::RLocker l(map_lock);

  utime_t had_for = ceph_clock_now() - had_map_since;
  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();

  auto m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
  m->osd_stat = cur_stat;

  // reset min_last_epoch_clean; it is re-derived below from the per-PG
  // values reported by get_pg_stats()
  std::lock_guard lec{min_last_epoch_clean_lock};
  min_last_epoch_clean = osdmap->get_epoch();
  min_last_epoch_clean_pgs.clear();

  std::set<int64_t> pool_set;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // collect the pool ids of *all* local PGs (primary or not) so we can
    // query per-pool statfs below
    auto pool = pg->pg_id.pgid.pool();
    pool_set.emplace((int64_t)pool);
    if (!pg->is_primary()) {
      continue;
    }
    pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
      m->pg_stat[pg->pg_id.pgid] = s;
      min_last_epoch_clean = min(min_last_epoch_clean, lec);
      min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
    });
  }
  store_statfs_t st;
  bool per_pool_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st);
    if (r == -ENOTSUP) {
      // backend doesn't track per-pool stats; stop asking
      break;
    } else {
      assert(r >= 0);
      m->pool_stat[p] = st;
      per_pool_stats = true;
    }
  }

  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;

  return m;
}
8062
// Assemble the health metrics reported to the manager: the number of
// in-flight ops slower than osd_op_complaint_time (with the age of the
// oldest), and the number of pending PG creations for which we would be
// primary.
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor run over every in-flight op; counts those initiated before
    // the complaint cutoff and remembers the oldest one
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
        lgeneric_subdout(cct,osd,20) << "slow op " << op.get_desc()
                                     << " initiated "
                                     << op.get_initiated() << dendl;
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
        }
        return true;
      } else {
        return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    // count mon-requested creates plus the osd-requested ones where we are
    // flagged (create.second) as primary
    auto n_primaries = pending_creates_from_mon;
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
        n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
8110
8111 // =====================================================
8112 // MAP
8113
8114 void OSD::wait_for_new_map(OpRequestRef op)
8115 {
8116 // ask?
8117 if (waiting_for_osdmap.empty()) {
8118 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8119 }
8120
8121 logger->inc(l_osd_waiting_for_map);
8122 waiting_for_osdmap.push_back(op);
8123 op->mark_delayed("wait for new map");
8124 }
8125
8126
8127 /** update_map
8128 * assimilate new OSDMap(s). scan pgs, etc.
8129 */
8130
8131 void OSD::note_down_osd(int peer)
8132 {
8133 ceph_assert(osd_lock.is_locked());
8134 cluster_messenger->mark_down_addrs(osdmap->get_cluster_addrs(peer));
8135
8136 heartbeat_lock.Lock();
8137 failure_queue.erase(peer);
8138 failure_pending.erase(peer);
8139 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
8140 if (p != heartbeat_peers.end()) {
8141 p->second.con_back->mark_down();
8142 if (p->second.con_front) {
8143 p->second.con_front->mark_down();
8144 }
8145 heartbeat_peers.erase(p);
8146 }
8147 heartbeat_lock.Unlock();
8148 }
8149
// A peer OSD is marked up in the new map: drop the peer-epoch we had
// cached for it and flag the heartbeat peer set for a refresh.
void OSD::note_up_osd(int peer)
{
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
8155
// Transaction-commit callback: once the newly received osdmaps in
// [first,last] are durable, hand them to OSD::_committed_osd_maps() and
// release the MOSDMap message reference that was kept alive across the
// commit.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was persisted
  MOSDMap *msg;         // message that carried the maps; ref dropped in finish()
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
8167
8168 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
8169 {
8170 std::lock_guard l(osdmap_subscribe_lock);
8171 if (latest_subscribed_epoch >= epoch && !force_request)
8172 return;
8173
8174 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
8175
8176 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
8177 force_request) {
8178 monc->renew_subs();
8179 }
8180 }
8181
// Delete stored osdmaps older than both `oldest` (the oldest map our source
// still has) and whatever the map cache still pins, advancing
// superblock.oldest_map as we go.  Deletions are chunked into multiple
// transactions (bounded by osd_target_transaction_size), and unless
// skip_maps is set we stop after one chunk per call so trimming merely
// keeps pace with `nreceived` newly ingested maps.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim maps the cache still holds references to
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush a chunk once it is big enough and we have at least matched the
    // number of maps we just received
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
        // skip_maps leaves us with a range of old maps if we fail to remove all
        // of them before moving superblock.oldest_map forward to the first map
        // in the incoming MOSDMap msg. so we should continue removing them in
        // this case, even we could do huge series of delete transactions all at
        // once.
        break;
      }
    }
  }
  // flush any remaining partial chunk
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
8221
// Ingest an MOSDMap message: validate the sender, decode full and
// incremental maps, persist them (plus pg_num history and the updated
// superblock) in one transaction, and schedule C_OnMapCommit to apply them
// once durable.  May block first if PGs have fallen too far behind in map
// consumption.  Consumes the message reference on every early-return path;
// on the success path the ref is handed to C_OnMapCommit.
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // find the slowest shard's minimum pg epoch
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
        osd_min = min;
      }
    }
    if (osd_min > 0 &&
        osdmap->get_epoch() > max_lag &&
        osdmap->get_epoch() - max_lag > osd_min) {
      epoch_t need = osdmap->get_epoch() - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
               << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
        epoch_t min = shard->get_min_pg_epoch();
        if (need > min) {
          dout(10) << __func__ << " waiting for pgs to consume " << need
                   << " (shard " << shard->shard_id << " min " << min
                   << ", map cache is " << cct->_conf->osd_map_cache_size
                   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
                   << ")" << dendl;
          // drop osd_lock while blocking so we don't stall everything else
          unlock_guard unlock{osd_lock};
          shard->wait_min_pg_epoch(need);
        }
      }
    }
  }

  ceph_assert(osd_lock.is_locked());
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
            << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from monitors or other OSDs
  auto priv = m->get_connection()->get_priv();
  if (auto session = static_cast<Session *>(priv.get());
      session && !(session->entity_name.is_mon() ||
                   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
          << superblock.newest_map
          << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
          << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
             << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    // if the source still has the maps we're missing, re-subscribe for them
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // the encoded size must grow monotonically; a decrease means the
    // accumulated size wrapped around
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied: persist and cache it directly
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply onto the previous epoch's full map, then persist
      // both the incremental and the regenerated full map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
        bufferlist obl;
        bool got = get_map_bl(e - 1, obl);
        if (!got) {
          // previous map was received in this same message batch
          auto p = added_maps_bl.find(e - 1);
          ceph_assert(p != added_maps_bl.end());
          obl = p->second;
        }
        o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
        derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
        ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection to exercise the crc-mismatch path
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
          (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
        derr << __func__ << " injecting map crc failure" << dendl;
        injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
        // our regenerated full map doesn't match the expected crc; fall
        // back to requesting full maps for [e, last]
        dout(2) << "got incremental " << e
                << " but failed to encode full with correct crc; requesting"
                << dendl;
        clog->warn() << "failed to encode map e" << e << " with expected crc";
        dout(20) << "my encoded map was:\n";
        fbl.hexdump(*_dout);
        *_dout << dendl;
        delete o;
        request_full_map(e, last);
        last = e - 1;
        break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
             << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
        dout(10) << __func__ << " can't get previous map " << i.first - 1
                 << " probably first start of this osd" << dendl;
        continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
        // pool deleted between lastmap and this map
        pg_num_history.log_pool_delete(i.first, j.first);
        dout(10) << __func__ << " recording final pg_pool_t for pool "
                 << j.first << dendl;
        // this information is needed by _make_pg() if have to restart before
        // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
        ghobject_t obj = make_final_pool_info_oid(j.first);
        bufferlist bl;
        encode(j.second, bl, CEPH_FEATURES_ALL);
        string name = lastmap->get_pool_name(j.first);
        encode(name, bl);
        map<string,string> profile;
        if (lastmap->get_pg_pool(j.first)->is_erasure()) {
          profile = lastmap->get_erasure_code_profile(
            lastmap->get_pg_pool(j.first)->erasure_code_profile);
        }
        encode(profile, bl);
        t.write(coll_t::meta(), obj, 0, bl.length(), bl);
        service.store_deleted_pool_pg_num(j.first, j.second.get_pg_num());
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
                 new_pg_num != j.second.get_pg_num()) {
        // pg_num changed for an existing pool
        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
        // brand-new pool in this epoch
        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first,
                                         j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    // persist the updated pg_num history alongside the maps
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8519
// Called (via C_OnMapCommit) after maps [first,last] are durable: advance
// the in-memory osdmap epoch by epoch, noting peers that went up/down,
// handle our own up/down/address transitions (booting -> active, or being
// wrongly marked down -> rebind and restart), then publish the map to the
// rest of the OSD.  May schedule an async shutdown or kick off a re-boot.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check after acquiring osd_lock; we may have started stopping while
  // waiting for it
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap); // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) && // in old map
          newmap->is_down(*p)) { // but not the new one
        if (!waited_for_reservations) {
          // only once per epoch: let in-flight map reservations drain
          // before tearing peers down
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared. it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = newmap;
    // record up/boot epochs the first time the map shows us up at our
    // current address
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // booting -> active transition: the map now shows us up at the address we
  // bound after our last (re)bind
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // we are active but the map disagrees about our state or addresses;
      // log why, then rebind and re-boot (or shut down if this keeps
      // happening)
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        //clear all out-of-date log
        while (!osd_markdown_log.empty() &&
               osd_markdown_log.front() + grace < now)
          osd_markdown_log.pop_front();
        // too many markdowns within the grace period -> give up and shut down
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          dout(0) << __func__ << " marked down "
                  << osd_markdown_log.size()
                  << " > osd_max_markdown_count "
                  << cct->_conf->osd_max_markdown_count
                  << " in last " << grace << " seconds, shutting down"
                  << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        // rebind all server messengers to fresh ports, avoiding ports we
        // are already using
        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_meesneger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_back_server_messenger->get_myaddrs().get_ports(&avoid_ports);
        hb_front_server_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind cluster_messenger failed" << dendl;
        }

        r = hb_back_server_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_back_server_messenger failed" << dendl;
        }

        r = hb_front_server_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true; // FIXME: do_restart?
          network_error = true;
          dout(0) << __func__ << " marked down:"
                  << " rebind hb_front_server_messenger failed" << dendl;
        }

        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8781
// After adopting a new osdmap: sync messenger feature-bit requirements
// (client, mon, osd policies) with what the map advertises, enable the
// on-disk SHARDS compat feature if missing, and persist require_osd_release
// when it changes.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock. in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  {
    // default (client-facing) policy
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      // replace only the masked bits, preserving requirements outside mask
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // monitor policy
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // osd-to-osd (cluster) policy
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-time upgrade: record the SHARDS incompat feature in the
    // superblock and persist it immediately
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  if (osdmap->require_osd_release < CEPH_RELEASE_NAUTILUS) {
    heartbeat_dispatcher.ms_set_require_authorizer(false);
  }

  // persist require_osd_release in the store's meta so it survives restarts
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8850
// Completion callback queued on the split transaction: once it
// commits, hand the newly created child PGs to OSD::_finish_splits()
// for initialization and registration.  Holds PGRefs so the children
// stay alive until the callback runs.
struct C_FinishSplits : public Context {
  OSD *osd;
  set<PGRef> pgs;
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    osd->_finish_splits(pgs);
  }
};
8860
8861 void OSD::_finish_splits(set<PGRef>& pgs)
8862 {
8863 dout(10) << __func__ << " " << pgs << dendl;
8864 if (is_stopping())
8865 return;
8866 PG::RecoveryCtx rctx = create_context();
8867 for (set<PGRef>::iterator i = pgs.begin();
8868 i != pgs.end();
8869 ++i) {
8870 PG *pg = i->get();
8871
8872 pg->lock();
8873 dout(10) << __func__ << " " << *pg << dendl;
8874 epoch_t e = pg->get_osdmap_epoch();
8875 pg->handle_initialize(&rctx);
8876 pg->queue_null(e, e);
8877 dispatch_context_transaction(rctx, pg);
8878 pg->unlock();
8879
8880 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8881 shards[shard_index]->register_and_wake_split_child(pg);
8882 }
8883
8884 dispatch_context(rctx, 0, service.get_osdmap());
8885 };
8886
8887 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8888 unsigned need)
8889 {
8890 std::lock_guard l(merge_lock);
8891 auto& p = merge_waiters[nextmap->get_epoch()][target];
8892 p[src->pg_id] = src;
8893 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8894 << " for " << target << ", have " << p.size() << "/" << need
8895 << dendl;
8896 return p.size() == need;
8897 }
8898
// Walk a PG forward through OSDMaps, one epoch at a time, up to
// osd_epoch, handling any pool pg_num changes (splits and merges)
// encountered along the way.  Returns true if the PG fully advanced;
// returns false if the PG was consumed as a merge source or is
// waiting for merge sources to arrive (in which case the caller must
// not touch the PG further -- it may already be unlocked/detached).
// Called with the PG locked.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;  // already caught up
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num as of the PG's current map (0 if the pool is gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map not cached/available; skip this epoch and keep advancing
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source: tear this PG down and park it on the
	  // merge_waiters list until all sources for the target arrive.
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  dispatch_context_transaction(*rctx, pg, &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_replica())
		logger->dec(l_osd_pg_replica);
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // compute the full set of sources for the target; if we are
	  // the last one to arrive, wake the target with a null event.
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the waiters only if *all* sources have arrived
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    // NOTE: shadows the outer new_pg_num with the same value
	    // recomputed from nextmap
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // advance the PG's mapping to the next epoch
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // re-schedule scrubs if the pool's scrub interval options changed
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    // finish child PG setup once the split transaction commits
    rctx->transaction->register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
9094
// Publish the newly-received OSDMap to the rest of the OSD: prime
// pending splits/merges on each shard, push the map to the shards,
// drop pg-create requests that no longer map here, wake sessions
// waiting on the map, and queue null peering events so every PG
// advances to the new epoch.  Called with osd_lock held.
void OSD::consume_map()
{
  ceph_assert(osd_lock.is_locked());
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  // make the map visible to the service before the shards consume it
  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // prime_splits removes the entries it claims; by the end every
      // split must have found its shard
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_replica())
      num_pg_replica++;
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop osd-requested creates for PGs that no longer map to us
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
9211
9212 void OSD::activate_map()
9213 {
9214 ceph_assert(osd_lock.is_locked());
9215
9216 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
9217
9218 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
9219 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
9220 osdmap_subscribe(osdmap->get_epoch() + 1, false);
9221 }
9222
9223 // norecover?
9224 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
9225 if (!service.recovery_is_paused()) {
9226 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
9227 service.pause_recovery();
9228 }
9229 } else {
9230 if (service.recovery_is_paused()) {
9231 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
9232 service.unpause_recovery();
9233 }
9234 }
9235
9236 service.activate_map();
9237
9238 // process waiters
9239 take_waiters(waiting_for_osdmap);
9240 }
9241
9242 bool OSD::require_mon_peer(const Message *m)
9243 {
9244 if (!m->get_connection()->peer_is_mon()) {
9245 dout(0) << "require_mon_peer received from non-mon "
9246 << m->get_connection()->get_peer_addr()
9247 << " " << *m << dendl;
9248 return false;
9249 }
9250 return true;
9251 }
9252
9253 bool OSD::require_mon_or_mgr_peer(const Message *m)
9254 {
9255 if (!m->get_connection()->peer_is_mon() &&
9256 !m->get_connection()->peer_is_mgr()) {
9257 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9258 << m->get_connection()->get_peer_addr()
9259 << " " << *m << dendl;
9260 return false;
9261 }
9262 return true;
9263 }
9264
9265 bool OSD::require_osd_peer(const Message *m)
9266 {
9267 if (!m->get_connection()->peer_is_osd()) {
9268 dout(0) << "require_osd_peer received from non-osd "
9269 << m->get_connection()->get_peer_addr()
9270 << " " << *m << dendl;
9271 return false;
9272 }
9273 return true;
9274 }
9275
9276 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9277 {
9278 epoch_t up_epoch = service.get_up_epoch();
9279 if (epoch < up_epoch) {
9280 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9281 return false;
9282 }
9283
9284 if (!is_active()) {
9285 dout(7) << "still in boot state, dropping message " << *m << dendl;
9286 return false;
9287 }
9288
9289 return true;
9290 }
9291
// Verify the sending OSD is still up in `map` at the address it used.
// If it is down or has restarted with new addrs, mark the connection
// down, clear the session's map-wait state, and break the
// connection<->session ref cycle.  Returns false if the peer is stale.
// When called from fast dispatch the session_dispatch_lock is already
// effectively unnecessary, so it is only taken on the slow path.
bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    auto priv = con->get_priv();
    if (auto s = static_cast<Session*>(priv.get()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Unlock();
    }
    return false;
  }
  return true;
}
9321
9322
9323 /*
9324 * require that we have same (or newer) map, and that
9325 * the source is the pg primary.
9326 */
9327 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9328 bool is_fast_dispatch)
9329 {
9330 const Message *m = op->get_req();
9331 dout(15) << "require_same_or_newer_map " << epoch
9332 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9333
9334 ceph_assert(osd_lock.is_locked());
9335
9336 // do they have a newer map?
9337 if (epoch > osdmap->get_epoch()) {
9338 dout(7) << "waiting for newer map epoch " << epoch
9339 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9340 wait_for_new_map(op);
9341 return false;
9342 }
9343
9344 if (!require_self_aliveness(op->get_req(), epoch)) {
9345 return false;
9346 }
9347
9348 // ok, our map is same or newer.. do they still exist?
9349 if (m->get_connection()->get_messenger() == cluster_messenger &&
9350 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9351 return false;
9352 }
9353
9354 return true;
9355 }
9356
9357
9358
9359
9360
9361 // ----------------------------------------
9362 // pg creation
9363
// Split `parent` into the given child PGs: create each child PG and
// its collection, split the parent's collections and in-memory state
// into it, and distribute the parent's stats.  `curmap` is the map the
// parent is at; `nextmap` is the map in which the split takes effect.
// Caller holds the parent's lock; children are locked while being set
// up and unlocked before return.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PG::RecoveryCtx *rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stats entry per child, plus a final entry for the parent;
  // stat_iter advances in lockstep with the child iterator below
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's commit callbacks through its shard's queue
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx->transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->finish_split_stats(*stat_iter, rctx->transaction);
    child->unlock();
  }
  // the remaining stats entry belongs to the parent itself
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx->transaction);
}
9415
9416 /*
9417 * holding osd_lock
9418 */
// Handle a (pre-nautilus style) MOSDPGCreate from the mon: for each
// requested PG that still maps to us as primary in the current map,
// build its initial history/past-intervals and queue a peering event
// that will instantiate it.  Called with osd_lock held.
void OSD::handle_pg_create(OpRequestRef op)
{
  const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  // mkpg and ctimes are parallel containers; walk them in lockstep
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(whoami, acting, acting.size());

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    // record progress so the mon can stop resending creates
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9509
9510
9511 // ----------------------------------------
9512 // peering and recovery
9513
9514 PG::RecoveryCtx OSD::create_context()
9515 {
9516 ObjectStore::Transaction *t = new ObjectStore::Transaction;
9517 map<int, map<spg_t,pg_query_t> > *query_map =
9518 new map<int, map<spg_t, pg_query_t> >;
9519 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
9520 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
9521 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
9522 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
9523 PG::RecoveryCtx rctx(query_map, info_map, notify_list, t);
9524 return rctx;
9525 }
9526
9527 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
9528 ThreadPool::TPHandle *handle)
9529 {
9530 if (!ctx.transaction->empty() || ctx.transaction->has_contexts()) {
9531 int tr = store->queue_transaction(
9532 pg->ch,
9533 std::move(*ctx.transaction), TrackedOpRef(), handle);
9534 ceph_assert(tr == 0);
9535 delete (ctx.transaction);
9536 ctx.transaction = new ObjectStore::Transaction;
9537 }
9538 }
9539
9540 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
9541 ThreadPool::TPHandle *handle)
9542 {
9543 if (!service.get_osdmap()->is_up(whoami)) {
9544 dout(20) << __func__ << " not up in osdmap" << dendl;
9545 } else if (!is_active()) {
9546 dout(20) << __func__ << " not active" << dendl;
9547 } else {
9548 do_notifies(*ctx.notify_list, curmap);
9549 do_queries(*ctx.query_map, curmap);
9550 do_infos(*ctx.info_map, curmap);
9551 }
9552 if ((!ctx.transaction->empty() || ctx.transaction->has_contexts()) && pg) {
9553 int tr = store->queue_transaction(
9554 pg->ch,
9555 std::move(*ctx.transaction), TrackedOpRef(),
9556 handle);
9557 ceph_assert(tr == 0);
9558 }
9559 delete ctx.notify_list;
9560 delete ctx.query_map;
9561 delete ctx.info_map;
9562 delete ctx.transaction;
9563 }
9564
9565 void OSD::discard_context(PG::RecoveryCtx& ctx)
9566 {
9567 delete ctx.notify_list;
9568 delete ctx.query_map;
9569 delete ctx.info_map;
9570 delete ctx.transaction;
9571 }
9572
9573
9574 /** do_notifies
9575 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
9576 * content for, and they are primary for.
9577 */
9578
9579 void OSD::do_notifies(
9580 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
9581 OSDMapRef curmap)
9582 {
9583 for (map<int,
9584 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
9585 notify_list.begin();
9586 it != notify_list.end();
9587 ++it) {
9588 if (!curmap->is_up(it->first)) {
9589 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
9590 continue;
9591 }
9592 ConnectionRef con = service.get_con_osd_cluster(
9593 it->first, curmap->get_epoch());
9594 if (!con) {
9595 dout(20) << __func__ << " skipping osd." << it->first
9596 << " (NULL con)" << dendl;
9597 continue;
9598 }
9599 service.share_map_peer(it->first, con.get(), curmap);
9600 dout(7) << __func__ << " osd." << it->first
9601 << " on " << it->second.size() << " PGs" << dendl;
9602 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
9603 it->second);
9604 con->send_message(m);
9605 }
9606 }
9607
9608
9609 /** do_queries
9610 * send out pending queries for info | summaries
9611 */
9612 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9613 OSDMapRef curmap)
9614 {
9615 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9616 pit != query_map.end();
9617 ++pit) {
9618 if (!curmap->is_up(pit->first)) {
9619 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9620 continue;
9621 }
9622 int who = pit->first;
9623 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9624 if (!con) {
9625 dout(20) << __func__ << " skipping osd." << who
9626 << " (NULL con)" << dendl;
9627 continue;
9628 }
9629 service.share_map_peer(who, con.get(), curmap);
9630 dout(7) << __func__ << " querying osd." << who
9631 << " on " << pit->second.size() << " PGs" << dendl;
9632 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9633 con->send_message(m);
9634 }
9635 }
9636
9637
9638 void OSD::do_infos(map<int,
9639 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
9640 OSDMapRef curmap)
9641 {
9642 for (map<int,
9643 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
9644 info_map.begin();
9645 p != info_map.end();
9646 ++p) {
9647 if (!curmap->is_up(p->first)) {
9648 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
9649 continue;
9650 }
9651 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
9652 i != p->second.end();
9653 ++i) {
9654 dout(20) << __func__ << " sending info " << i->first.info
9655 << " to shard " << p->first << dendl;
9656 }
9657 ConnectionRef con = service.get_con_osd_cluster(
9658 p->first, curmap->get_epoch());
9659 if (!con) {
9660 dout(20) << __func__ << " skipping osd." << p->first
9661 << " (NULL con)" << dendl;
9662 continue;
9663 }
9664 service.share_map_peer(p->first, con.get(), curmap);
9665 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
9666 m->pg_list = p->second;
9667 con->send_message(m);
9668 }
9669 info_map.clear();
9670 }
9671
// Handle a nautilus-style MOSDPGCreate2 from the mon: synthesize a
// fresh pg_history_t for each requested PG and queue a peering event
// that will instantiate it.  Fast-dispatch path: consumes (puts) the
// message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    dout(20) << __func__ << " " << pgid << " e" << created
	     << "@" << created_stamp << dendl;
    // brand-new PG: every history epoch starts at the create epoch,
    // and scrub stamps start at the create stamp
    pg_history_t h;
    h.epoch_created = created;
    h.epoch_pool_created = created;
    h.same_up_since = created;
    h.same_interval_since = created;
    h.same_primary_since = created;
    h.last_scrub_stamp = created_stamp;
    h.last_deep_scrub_stamp = created_stamp;
    h.last_clean_scrub_stamp = created_stamp;

    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->epoch,
	  m->epoch,
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    created,
	    h,
	    PastIntervals(),
	    true)
	  )));
  }

  {
    // record progress so the mon can stop resending creates
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9721
9722 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9723 {
9724 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9725 if (!require_osd_peer(m)) {
9726 m->put();
9727 return;
9728 }
9729 int from = m->get_source().num();
9730 for (auto& p : m->pg_list) {
9731 enqueue_peering_evt(
9732 p.first,
9733 PGPeeringEventRef(
9734 std::make_shared<PGPeeringEvent>(
9735 p.second.epoch_sent, p.second.epoch_sent,
9736 MQuery(
9737 p.first,
9738 pg_shard_t(from, p.second.from),
9739 p.second,
9740 p.second.epoch_sent),
9741 false))
9742 );
9743 }
9744 m->put();
9745 }
9746
// Fast-dispatch handler for MOSDPGNotify: queue an MNotifyRec peering
// event for each notified PG.  The PGCreateInfo allows the event to
// instantiate the PG if it does not exist yet (create=false marks it
// as a peer-driven, not mon-driven, create).  Consumes (puts) the
// message.
void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->get_pg_list()) {
    // target shard comes from the notify's "to" field
    spg_t pgid(p.first.info.pgid.pgid, p.first.to);
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.first.epoch_sent,
	  p.first.query_epoch,
	  MNotifyRec(
	    pgid, pg_shard_t(from, p.first.from),
	    p.first,
	    m->get_connection()->get_features(),
	    p.second),
	  true,
	  new PGCreateInfo(
	    pgid,
	    p.first.query_epoch,
	    p.first.info.history,
	    p.second,
	    false)
	  )));
  }
  m->put();
}
9779
// Fast-dispatch handler for MOSDPGInfo: queue an MInfoRec peering
// event for each PG in the message.  Unlike notify, this never
// creates a PG (no PGCreateInfo).  Consumes (puts) the message.
void OSD::handle_fast_pg_info(MOSDPGInfo* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      spg_t(p.first.info.pgid.pgid, p.first.to),
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.first.epoch_sent, p.first.query_epoch,
	  MInfoRec(
	    pg_shard_t(from, p.first.from),
	    p.first.info,
	    p.first.epoch_sent)))
      );
  }
  m->put();
}
9802
9803 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9804 {
9805 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9806 if (!require_osd_peer(m)) {
9807 m->put();
9808 return;
9809 }
9810 for (auto& pgid : m->pg_list) {
9811 enqueue_peering_evt(
9812 pgid,
9813 PGPeeringEventRef(
9814 std::make_shared<PGPeeringEvent>(
9815 m->get_epoch(), m->get_epoch(),
9816 PG::DeleteStart())));
9817 }
9818 m->put();
9819 }
9820
9821 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9822 {
9823 dout(10) << __func__ << " " << *m << dendl;
9824 if (!require_mon_or_mgr_peer(m)) {
9825 m->put();
9826 return;
9827 }
9828 epoch_t epoch = get_osdmap_epoch();
9829 for (auto pgid : m->forced_pgs) {
9830 if (m->options & OFR_BACKFILL) {
9831 if (m->options & OFR_CANCEL) {
9832 enqueue_peering_evt(
9833 pgid,
9834 PGPeeringEventRef(
9835 std::make_shared<PGPeeringEvent>(
9836 epoch, epoch,
9837 PG::UnsetForceBackfill())));
9838 } else {
9839 enqueue_peering_evt(
9840 pgid,
9841 PGPeeringEventRef(
9842 std::make_shared<PGPeeringEvent>(
9843 epoch, epoch,
9844 PG::SetForceBackfill())));
9845 }
9846 } else if (m->options & OFR_RECOVERY) {
9847 if (m->options & OFR_CANCEL) {
9848 enqueue_peering_evt(
9849 pgid,
9850 PGPeeringEventRef(
9851 std::make_shared<PGPeeringEvent>(
9852 epoch, epoch,
9853 PG::UnsetForceRecovery())));
9854 } else {
9855 enqueue_peering_evt(
9856 pgid,
9857 PGPeeringEventRef(
9858 std::make_shared<PGPeeringEvent>(
9859 epoch, epoch,
9860 PG::SetForceRecovery())));
9861 }
9862 }
9863 }
9864 m->put();
9865 }
9866
// Answer a pg_query for a PG we do not have: if the pool still
// exists, reply to the querying OSD with an empty info -- as an empty
// MOSDPGLog for LOG/FULLLOG queries, or an empty MOSDPGNotify
// otherwise -- so the peer can make progress.  If the pool is gone,
// the query is silently dropped.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  // empty info for the shard the querier asked about
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      vector<pair<pg_notify_t,PastIntervals>> ls;
      ls.push_back(
	make_pair(
	  pg_notify_t(
	    q.query.from, q.query.to,
	    q.query.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), ls);
    }
    // make sure the peer has a map at least as new as ours
    service.share_map_peer(q.from.osd, con.get(), osdmap);
    con->send_message(m);
  }
}
9903
9904
9905 // =========================================================
9906 // RECOVERY
9907
// Drain the awaiting_throttle queue into the recovery work queue while
// the recovery throttle (_recover_now) permits it.  Each queued PG is
// started with up to osd_recovery_max_single_start pushes, bounded by
// the pushes currently available under osd_recovery_max_active.
// Caller must hold recovery_lock (asserted).
void OSDService::_maybe_queue_recovery() {
  ceph_assert(recovery_lock.is_locked_by_me());
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
         _recover_now(&available_pushes)) {
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
             << ", recovery_ops_reserved " << recovery_ops_reserved
             << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // reserve the pushes now; released via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9924
// Decide whether recovery work may be started right now.  Returns false
// while recovery is deferred (defer_recovery_until in the future),
// explicitly paused, or the active+reserved op count has reached
// osd_recovery_max_active.  On success, *available_pushes (if non-null)
// is set to the number of additional pushes that may be reserved.
bool OSDService::_recover_now(uint64_t *available_pushes)
{
  if (available_pushes)
    *available_pushes = 0;

  if (ceph_clock_now() < defer_recovery_until) {
    dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
    return false;
  }

  if (recovery_paused) {
    dout(15) << __func__ << " paused" << dendl;
    return false;
  }

  uint64_t max = cct->_conf->osd_recovery_max_active;
  if (max <= recovery_ops_active + recovery_ops_reserved) {
    dout(15) << __func__ << " active " << recovery_ops_active
             << " + reserved " << recovery_ops_reserved
             << " >= max " << max << dendl;
    return false;
  }

  if (available_pushes)
    *available_pushes = max - recovery_ops_active - recovery_ops_reserved;

  return true;
}
9953
// Run one round of recovery for a PG, using up to reserved_pushes push
// slots.  If osd_recovery_sleep is configured and a sleep is due, the
// work is instead deferred: a timer callback re-queues the recovery op
// after the sleep and we return immediately (keeping the reservation).
// Otherwise start recovery ops, optionally kick off unfound-object
// search, and always release the reserved pushes on exit.
// Called from a worker thread with the PG lock held.
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // holds a ref to the PG so it survives until the timer fires
      auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
                 << ", re-queuing recovery" << dendl;
        std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.
      if (service.recovery_schedule_time < ceph_clock_now()) {
        service.recovery_schedule_time = ceph_clock_now();
      }
      service.recovery_schedule_time += recovery_sleep;
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
                                       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // NOTE: reserved_pushes stay reserved; the requeued op will use them
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // we are doing work now; next round should sleep again
      service.recovery_needs_sleep = true;
    }

    if (pg->pg_has_reset_since(queued)) {
      // PG changed interval since this op was queued; stale, skip work
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
             << " on " << *pg << dendl;

    if (do_unfound) {
      // query peers for unfound objects
      PG::RecoveryCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, &rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
10027
// Account for the start of a recovery op on object soid in pg:
// bump recovery_ops_active (and, in DEBUG_RECOVERY_OIDS builds, track
// the oid and assert it was not already being recovered).
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
           << " (" << recovery_ops_active << "/"
           << cct->_conf->osd_recovery_max_active << " rops)"
           << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
10043
// Account for the completion of a recovery op on soid: decrement
// recovery_ops_active (asserting it was positive) and try to start more
// recovery work now that a slot has freed up.  The `dequeue` flag is
// only logged here; it is not otherwise used by this accounting.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
           << " dequeue=" << dequeue
           << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
           << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // a slot freed up; see if throttled recovery can proceed
  _maybe_queue_recovery();
}
10064
10065 bool OSDService::is_recovery_active()
10066 {
10067 if (cct->_conf->osd_debug_pretend_recovery_active) {
10068 return true;
10069 }
10070 return local_reserver.has_reservation() || remote_reserver.has_reservation();
10071 }
10072
// Return `pushes` previously-reserved push slots to the recovery
// throttle (asserting we don't release more than reserved) and then
// try to start more throttled recovery work.
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
           << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
           << dendl;
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
10083
10084 // =========================================================
10085 // OPS
10086
10087 bool OSD::op_is_discardable(const MOSDOp *op)
10088 {
10089 // drop client request if they are not connected and can't get the
10090 // reply anyway.
10091 if (!op->get_connection()->is_connected()) {
10092 return true;
10093 }
10094 return false;
10095 }
10096
// Queue a client/replica op for its PG on the sharded op work queue.
// Extracts priority/cost/owner from the message, records tracing and
// the pre-queue latency counter, then wraps the op in a PGOpItem.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // time spent between receive and enqueue
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
           << " cost " << cost
           << " latency " << latency
           << " epoch " << epoch
           << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpQueueItem(
      unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
10120
10121 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
10122 {
10123 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10124 op_shardedwq.queue(
10125 OpQueueItem(
10126 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10127 10,
10128 cct->_conf->osd_peering_op_priority,
10129 utime_t(),
10130 0,
10131 evt->get_epoch_sent()));
10132 }
10133
10134 void OSD::enqueue_peering_evt_front(spg_t pgid, PGPeeringEventRef evt)
10135 {
10136 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
10137 op_shardedwq.queue_front(
10138 OpQueueItem(
10139 unique_ptr<OpQueueItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
10140 10,
10141 cct->_conf->osd_peering_op_priority,
10142 utime_t(),
10143 0,
10144 evt->get_epoch_sent()));
10145 }
10146
10147 /*
10148 * NOTE: dequeue called in worker thread, with pg lock
10149 */
10150 void OSD::dequeue_op(
10151 PGRef pg, OpRequestRef op,
10152 ThreadPool::TPHandle &handle)
10153 {
10154 FUNCTRACE(cct);
10155 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
10156
10157 utime_t now = ceph_clock_now();
10158 op->set_dequeued_time(now);
10159 utime_t latency = now - op->get_req()->get_recv_stamp();
10160 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
10161 << " cost " << op->get_req()->get_cost()
10162 << " latency " << latency
10163 << " " << *(op->get_req())
10164 << " pg " << *pg << dendl;
10165
10166 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
10167
10168 auto priv = op->get_req()->get_connection()->get_priv();
10169 if (auto session = static_cast<Session *>(priv.get()); session) {
10170 maybe_share_map(session, op, pg->get_osdmap());
10171 }
10172
10173 if (pg->is_deleting())
10174 return;
10175
10176 op->mark_reached_pg();
10177 op->osd_trace.event("dequeue_op");
10178
10179 pg->do_request(op, handle);
10180
10181 // finish
10182 dout(10) << "dequeue_op " << op << " finish" << dendl;
10183 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
10184 }
10185
10186
// Process one dequeued peering event.  If there is no PG, only an
// MQuery can be answered (handle_pg_query_nopg); anything else is a
// bug and aborts.  Otherwise advance the PG through map epochs, feed it
// the event, then dispatch the accumulated RecoveryCtx — unless the PG
// was deleted during the event, in which case the ctx is discarded
// (the final _delete_some already dispatched it).
// Called on a worker thread; pg (if non-null) is locked on entry and
// unlocked here.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PG::RecoveryCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  epoch_t need_up_thru = 0, same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, &rctx)) {
    pg->do_peering_event(evt, &rctx);
    if (pg->is_deleted()) {
      // do not dispatch rctx; the final _delete_some already did it.
      discard_context(rctx);
      pg->unlock();
      return;
    }
    // capture these under the pg lock, used after unlock below
    dispatch_context_transaction(rctx, pg, &handle);
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }
  dispatch_context(rctx, pg, curmap, &handle);

  service.send_pg_temp();
}
10224
10225 void OSD::dequeue_delete(
10226 OSDShard *sdata,
10227 PG *pg,
10228 epoch_t e,
10229 ThreadPool::TPHandle& handle)
10230 {
10231 dequeue_peering_evt(
10232 sdata,
10233 pg,
10234 PGPeeringEventRef(
10235 std::make_shared<PGPeeringEvent>(
10236 e, e,
10237 PG::DeleteSome())),
10238 handle);
10239 }
10240
10241
10242
10243 // --------------------------------
10244
// Config-observer interface: the NULL-terminated list of config keys
// this OSD wants change notifications for.  Each key listed here is
// acted on in handle_conf_change().
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
10283
// Config-observer callback: apply runtime changes for every key listed
// in get_tracked_conf_keys().  Runs under osd_lock; each branch pushes
// the new value into the relevant subsystem (reservers, op tracker,
// map caches, clog, messenger throttles, scrub scheduling, ...).
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // all three map caches share one size knob
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      // re-evaluate whether the fuse mount should be up
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // 0 disables the throttle; only adjust an existing throttler
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  // re-validate cross-option invariants after any change
  check_config();
}
10373
// Re-parse the clog-related config options and push them into the log
// client.  NOTE(review): the trailing derr is NOT inside the if — it
// always runs, even when parse_log_client_options() fails; presumably
// intentional as a diagnostic, but confirm before relying on it.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
                               log_channel, log_prio, log_to_graylog,
                               log_to_graylog_host, log_to_graylog_port,
                               fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
                        log_channel, log_prio, log_to_graylog,
                        log_to_graylog_host, log_to_graylog_port,
                        fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10396
// Sanity-check cross-option invariants, warning (via clog) when the map
// cache is too small relative to how stale persisted PG epochs may get.
// Note the condition includes a +2 slack beyond the stale limit even
// though the warning text only mentions the raw limit.
void OSD::check_config()
{
  // some sanity checks
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
}
10406
10407 // --------------------------------
10408
// Block the calling thread until the objecter has fetched the latest
// osdmap from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10419
10420 // --------------------------------
10421
// Classify an incoming MOSDOp by inspecting each sub-op: set the
// read/write/cache/pg-op/promote/etc. rmw_flags on the OpRequest so
// later dispatch can route and order it correctly.  Returns 0 on
// success, -EINVAL if no flag could be derived, or a class-handler
// error for CEPH_OSD_OP_CALL lookups (-EOPNOTSUPP/-EPERM/-EIO/...).
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
         iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren't
       * resent, so there's no reason to write out a log entry.
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
        op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
        // ops outside this whitelist can't be proxied to an EC base
        // pool; force promotion into the cache tier instead
        if ((iter->op.op != CEPH_OSD_OP_READ) &&
            (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
            (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
            (iter->op.op != CEPH_OSD_OP_STAT) &&
            (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
            (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
            (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
            (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
            (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
            (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
            (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
            (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
            (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
            (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
            (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
            (iter->op.op != CEPH_OSD_OP_CREATE) &&
            (iter->op.op != CEPH_OSD_OP_DELETE) &&
            (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
            (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
            (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
            (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
            (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
          op->set_promote();
        }
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
        // decode class/method names and derive flags from the method's
        // registered CLS_METHOD_* flags
        bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
        int is_write, is_read;
        string cname, mname;
        bp.copy(iter->op.cls.class_len, cname);
        bp.copy(iter->op.cls.method_len, mname);

        ClassHandler::ClassData *cls;
        int r = class_handler->open_class(cname, &cls);
        if (r) {
          derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
          if (r == -ENOENT)
            r = -EOPNOTSUPP;
          else if (r != -EPERM) // propagate permission errors
            r = -EIO;
          return r;
        }
        int flags = cls->get_method_flags(mname.c_str());
        if (flags < 0) {
          if (flags == -ENOENT)
            r = -EOPNOTSUPP;
          else
            r = flags;
          return r;
        }
        is_read = flags & CLS_METHOD_RD;
        is_write = flags & CLS_METHOD_WR;
        bool is_promote = flags & CLS_METHOD_PROMOTE;

        dout(10) << "class " << cname << " method " << mname << " "
                 << "flags=" << (is_read ? "r" : "")
                 << (is_write ? "w" : "")
                 << (is_promote ? "p" : "")
                 << dendl;
        if (is_read)
          op->set_class_read();
        if (is_write)
          op->set_class_write();
        if (is_promote)
          op->set_promote();
        op->add_class(std::move(cname), std::move(mname), is_read, is_write,
                      cls->whitelisted);
        break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
        op->set_promote();
        break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
          iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
        op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
        op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
        op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // client hinted the data won't be reused; don't bother promoting
      if (m->ops.size() == 1 &&
          (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
           iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
        op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // an op that sets no flags at all is malformed
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10607
// Install the mgr-provided set of dynamic perf-metric queries: filter
// out unsupported ones (those with an empty key descriptor), store the
// supported set and its limits, and push the queries down to every PG.
void OSD::set_perf_queries(
    const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries) {
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }

  {
    Mutex::Locker locker(m_perf_queries_lock);
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }

  // propagate to every PG (each under its own lock)
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    pg->lock();
    pg->set_dynamic_perf_stats_queries(supported_queries);
    pg->unlock();
  }
}
10638
// Collect dynamic perf-metric stats from every PG, merge them, and
// materialize the per-query reports for the mgr.
void OSD::get_perf_reports(
    std::map<OSDPerfMetricQuery, OSDPerfMetricReport> *reports) {
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, reports);
  dout(20) << "reports for " << reports->size() << " queries" << dendl;
}
10658
10659 // =============================================================
10660
10661 #undef dout_context
10662 #define dout_context cct
10663 #undef dout_prefix
10664 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10665
// Bind a PG to its shard slot: wire up the back-pointers, bump the
// OSD-wide PG count, and index the slot by the PG's current map epoch
// in pg_slots_by_epoch (used to compute the shard's min PG epoch).
// Caller holds shard_lock.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10677
// Undo _attach_pg: clear the PG/slot back-pointers, drop the OSD-wide
// PG count, remove the slot from the epoch index, and wake anyone in
// wait_min_pg_epoch() since the shard's min epoch may have advanced.
// Caller holds shard_lock.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10692
// Record that a PG has advanced to map epoch e: re-key its slot in the
// epoch-sorted intrusive set (erase, update, re-insert) and wake any
// wait_min_pg_epoch() waiters, since the shard minimum may have moved.
// NOTE(review): the dout(30) lines dereference begin() unguarded —
// presumably the set is non-empty whenever a slot is being updated.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10708
10709 epoch_t OSDShard::get_min_pg_epoch()
10710 {
10711 std::lock_guard l(shard_lock);
10712 auto p = pg_slots_by_epoch.begin();
10713 if (p == pg_slots_by_epoch.end()) {
10714 return 0;
10715 }
10716 return p->epoch;
10717 }
10718
// Block until every PG on this shard has caught up to map epoch `need`
// (or the shard has no PGs).  The waiting_for_min_pg_epoch counter
// tells _detach_pg/update_pg_epoch that someone needs a notify.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      // no PGs — nothing can lag
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // slowest PG has reached the needed epoch
      return true;
    } else {
      dout(10) << need << " waiting on "
               << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10736
10737 epoch_t OSDShard::get_max_waiting_epoch()
10738 {
10739 std::lock_guard l(shard_lock);
10740 epoch_t r = 0;
10741 for (auto& i : pg_slots) {
10742 if (!i.second->waiting_peering.empty()) {
10743 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10744 }
10745 }
10746 return r;
10747 }
10748
// Install a new osdmap on this shard and reconcile every PG slot with
// it: requeue peering events whose target epoch is now reachable, drop
// stale/misdirected client ops (crediting their reserved pushes back
// via *pushes_to_free), and prune slots that are completely idle.
// Slots blocked on a split or a pending merge are left untouched.
void OSDShard::consume_map(
  OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // shard_osdmap has its own finer-grained lock for readers
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // still waiting for a parent PG to split; leave slot alone
      dout(20) << __func__ << " " << pgid
               << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge target epoch not reached yet; leave slot alone
      dout(20) << __func__ << " " << pgid
               << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
               << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
        // at least one queued peering event can now run; requeue them all
        dout(20) << __func__ << " " << pgid
                 << " pending_peering first epoch " << first
                 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
        _wake_pg_slot(pgid, slot);
        queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
        dout(20) << __func__ << " " << pgid << " maps to us, keeping"
                 << dendl;
        ++p;
        continue;
      }
      // PG no longer maps to us: discard waiting items from superseded
      // epochs, reclaiming any recovery pushes they had reserved
      while (!slot->waiting.empty() &&
             slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
        auto& qi = slot->waiting.front();
        dout(20) << __func__ << " " << pgid
                 << " waiting item " << qi
                 << " epoch " << qi.get_map_epoch()
                 << " <= " << new_osdmap->get_epoch()
                 << ", "
                 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
                     "misdirected")
                 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
        slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
        slot->num_running == 0 &&
        slot->waiting_for_split.empty() &&
        !slot->pg) {
      // completely idle slot with no PG attached; reclaim it
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a worker to pick up the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10834
// Requeue everything parked on a PG slot back onto the front of the
// shard's work queue: in-flight to_process items, waiting ops, and all
// waiting_peering events.  Each list is walked in REVERSE because
// _enqueue_front pushes to the front — this preserves original order.
// Bumps requeue_seq so racing dequeuers notice the slot was refilled.
// Caller holds shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    _enqueue_front(std::move(*i), osd->op_prio_cutoff);
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      _enqueue_front(std::move(*j), osd->op_prio_cutoff);
    }
  }
  slot->waiting_peering.clear();
  ++slot->requeue_seq;
}
10868
10869 void OSDShard::identify_splits_and_merges(
10870 const OSDMapRef& as_of_osdmap,
10871 set<pair<spg_t,epoch_t>> *split_pgs,
10872 set<pair<spg_t,epoch_t>> *merge_pgs)
10873 {
10874 std::lock_guard l(shard_lock);
10875 if (shard_osdmap) {
10876 for (auto& i : pg_slots) {
10877 const spg_t& pgid = i.first;
10878 auto *slot = i.second.get();
10879 if (slot->pg) {
10880 osd->service.identify_splits_and_merges(
10881 shard_osdmap, as_of_osdmap, pgid,
10882 split_pgs, merge_pgs);
10883 } else if (!slot->waiting_for_split.empty()) {
10884 osd->service.identify_splits_and_merges(
10885 shard_osdmap, as_of_osdmap, pgid,
10886 split_pgs, nullptr);
10887 } else {
10888 dout(20) << __func__ << " slot " << pgid
10889 << " has no pg and waiting_for_split "
10890 << slot->waiting_for_split << dendl;
10891 }
10892 }
10893 }
10894 }
10895
10896 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10897 set<pair<spg_t,epoch_t>> *pgids)
10898 {
10899 std::lock_guard l(shard_lock);
10900 _prime_splits(pgids);
10901 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10902 set<pair<spg_t,epoch_t>> newer_children;
10903 for (auto i : *pgids) {
10904 osd->service.identify_splits_and_merges(
10905 as_of_osdmap, shard_osdmap, i.first,
10906 &newer_children, nullptr);
10907 }
10908 newer_children.insert(pgids->begin(), pgids->end());
10909 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10910 << shard_osdmap->get_epoch() << ", new children " << newer_children
10911 << dendl;
10912 _prime_splits(&newer_children);
10913 // note: we don't care what is left over here for other shards.
10914 // if this shard is ahead of us and one isn't, e.g., one thread is
10915 // calling into prime_splits via _process (due to a newly created
10916 // pg) and this shard has a newer map due to a racing consume_map,
10917 // then any grandchildren left here will be identified (or were
10918 // identified) when the slower shard's osdmap is advanced.
10919 // _prime_splits() will tolerate the case where the pgid is
10920 // already primed.
10921 }
10922 }
10923
10924 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10925 {
10926 dout(10) << *pgids << dendl;
10927 auto p = pgids->begin();
10928 while (p != pgids->end()) {
10929 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10930 if (shard_index == shard_id) {
10931 auto r = pg_slots.emplace(p->first, nullptr);
10932 if (r.second) {
10933 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10934 r.first->second = make_unique<OSDShardPGSlot>();
10935 r.first->second->waiting_for_split.insert(p->second);
10936 } else {
10937 auto q = r.first;
10938 ceph_assert(q != pg_slots.end());
10939 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10940 << dendl;
10941 q->second->waiting_for_split.insert(p->second);
10942 }
10943 p = pgids->erase(p);
10944 } else {
10945 ++p;
10946 }
10947 }
10948 }
10949
10950 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10951 set<pair<spg_t,epoch_t>> *merge_pgs)
10952 {
10953 std::lock_guard l(shard_lock);
10954 dout(20) << __func__ << " checking shard " << shard_id
10955 << " for remaining merge pgs " << merge_pgs << dendl;
10956 auto p = merge_pgs->begin();
10957 while (p != merge_pgs->end()) {
10958 spg_t pgid = p->first;
10959 epoch_t epoch = p->second;
10960 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10961 if (shard_index != shard_id) {
10962 ++p;
10963 continue;
10964 }
10965 OSDShardPGSlot *slot;
10966 auto r = pg_slots.emplace(pgid, nullptr);
10967 if (r.second) {
10968 r.first->second = make_unique<OSDShardPGSlot>();
10969 }
10970 slot = r.first->second.get();
10971 if (slot->pg) {
10972 // already have pg
10973 dout(20) << __func__ << " have merge participant pg " << pgid
10974 << " " << slot->pg << dendl;
10975 } else if (!slot->waiting_for_split.empty() &&
10976 *slot->waiting_for_split.begin() < epoch) {
10977 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10978 << " " << slot->waiting_for_split << dendl;
10979 } else {
10980 dout(20) << __func__ << " creating empty merge participant " << pgid
10981 << " for merge in " << epoch << dendl;
10982 // leave history zeroed; PG::merge_from() will fill it in.
10983 pg_history_t history;
10984 PGCreateInfo cinfo(pgid, epoch - 1,
10985 history, PastIntervals(), false);
10986 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10987 _attach_pg(r.first->second.get(), pg.get());
10988 _wake_pg_slot(pgid, slot);
10989 pg->unlock();
10990 }
10991 // mark slot for merge
10992 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10993 slot->waiting_for_merge_epoch = epoch;
10994 p = merge_pgs->erase(p);
10995 }
10996 }
10997
// Called once a split child PG has been created: attach it to its
// primed slot, clear the matching split epoch, and — once no further
// splits are pending on the slot — requeue everything that was
// waiting.  Then (outside shard_lock) queue a null peering event so
// the child advances to the latest osdmap, and wake a worker.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // the slot must have been primed: no pg yet, split(s) pending
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
11036
11037 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
11038 {
11039 std::lock_guard l(shard_lock);
11040 vector<spg_t> to_delete;
11041 for (auto& i : pg_slots) {
11042 if (i.first != parent &&
11043 i.first.get_ancestor(old_pg_num) == parent) {
11044 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
11045 << dendl;
11046 _wake_pg_slot(i.first, i.second.get());
11047 to_delete.push_back(i.first);
11048 }
11049 }
11050 for (auto pgid : to_delete) {
11051 pg_slots.erase(pgid);
11052 }
11053 }
11054
11055
11056 // =============================================================
11057
11058 #undef dout_context
11059 #define dout_context osd->cct
11060 #undef dout_prefix
11061 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
11062
11063 void OSD::ShardedOpWQ::_add_slot_waiter(
11064 spg_t pgid,
11065 OSDShardPGSlot *slot,
11066 OpQueueItem&& qi)
11067 {
11068 if (qi.is_peering()) {
11069 dout(20) << __func__ << " " << pgid
11070 << " peering, item epoch is "
11071 << qi.get_map_epoch()
11072 << ", will wait on " << qi << dendl;
11073 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
11074 } else {
11075 dout(20) << __func__ << " " << pgid
11076 << " item epoch is "
11077 << qi.get_map_epoch()
11078 << ", will wait on " << qi << dendl;
11079 slot->waiting.push_back(std::move(qi));
11080 }
11081 }
11082
11083 #undef dout_prefix
11084 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
11085
// Worker-thread entry point: dequeue one item from this thread's
// shard, resolve it to a pg slot, take the pg lock (if the pg exists),
// and run the item.  The shard_lock/pg-lock interleave below is
// delicate: we must drop shard_lock to take the pg lock, then
// re-validate the slot (it may have been removed, emptied, or
// requeued by _wake_pg_slot in the meantime) before proceeding.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->pqueue->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      // release shard_lock while blocked on the condvar; reacquire
      // and re-check emptiness afterwards
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->pqueue->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	sdata->shard_lock.unlock();
	return;
      }
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	  osd->cct->_conf->threadpool_default_timeout, 0);
    } else {
      // stop_waiting is set (shutdown/drain); bail out immediately
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // only the designated (smallest-index) thread drains oncommit
  // contexts, to keep completions ordered
  list<Context *> oncommits;
  if (is_smallest_thread_index && !sdata->context_queue.empty()) {
    sdata->context_queue.swap(oncommits);
  }

  if (sdata->pqueue->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpQueueItem item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find or create the slot for this item's ordering token (pgid)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // drop shard_lock to take the pg lock (lock ordering); everything
    // about the slot must be re-validated once we reacquire it
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // our item was requeued; another worker will pick it up
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to wait, create, run pg-less, or
  // drop the item.  (loop form, but every path either breaks with a
  // newly created pg or returns)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // a split is pending on this slot; park the item until it completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future relative to our shard map; wait
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // pg should exist here but hasn't been instantiated yet; wait
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	auto priv = (*_op)->get_req()->get_connection()->get_priv();
	if (auto session = static_cast<Session *>(priv.get()); session) {
	  osd->maybe_share_map(session, *_op, sdata->shard_osdmap);
	}
      }
      // give back any recovery-push reservations held by the dropped item
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event from a future epoch; park it and release the pg
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // offer any split children we discovered during pg creation to the
  // other shards (prime_splits erases the entries it consumes)
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    // note: reqid only exists WITH_LTTNG; tracepoint() presumably
    // expands to nothing otherwise, or this would not compile
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the pg lock held (pg may be null for pg-less work)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
11380
11381 void OSD::ShardedOpWQ::_enqueue(OpQueueItem&& item) {
11382 uint32_t shard_index =
11383 item.get_ordering_token().hash_to_shard(osd->shards.size());
11384
11385 OSDShard* sdata = osd->shards[shard_index];
11386 assert (NULL != sdata);
11387 unsigned priority = item.get_priority();
11388 unsigned cost = item.get_cost();
11389 sdata->shard_lock.lock();
11390
11391 dout(20) << __func__ << " " << item << dendl;
11392 if (priority >= osd->op_prio_cutoff)
11393 sdata->pqueue->enqueue_strict(
11394 item.get_owner(), priority, std::move(item));
11395 else
11396 sdata->pqueue->enqueue(
11397 item.get_owner(), priority, cost, std::move(item));
11398 sdata->shard_lock.unlock();
11399
11400 std::lock_guard l{sdata->sdata_wait_lock};
11401 sdata->sdata_cond.notify_one();
11402 }
11403
// Requeue an item at the FRONT of its shard's queue.  Used for
// requeues, which must be processed before newly arriving work for
// the same pg.
void OSD::ShardedOpWQ::_enqueue_front(OpQueueItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // swap: park the requeued item at the front of to_process, and
    // instead push the newest to_process item back onto pqueue.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->_enqueue_front(std::move(item), osd->op_prio_cutoff);
  sdata->shard_lock.unlock();
  // wake a worker outside shard_lock
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
11431
11432 namespace ceph {
11433 namespace osd_cmds {
11434
11435 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11436 std::ostream& os)
11437 {
11438 if (!ceph_using_tcmalloc()) {
11439 os << "could not issue heap profiler command -- not using tcmalloc!";
11440 return -EOPNOTSUPP;
11441 }
11442
11443 string cmd;
11444 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
11445 os << "unable to get value for command \"" << cmd << "\"";
11446 return -EINVAL;
11447 }
11448
11449 std::vector<std::string> cmd_vec;
11450 get_str_vec(cmd, cmd_vec);
11451
11452 string val;
11453 if (cmd_getval(&cct, cmdmap, "value", val)) {
11454 cmd_vec.push_back(val);
11455 }
11456
11457 ceph_heap_profiler_handle_command(cmd_vec, os);
11458
11459 return 0;
11460 }
11461
11462 }} // namespace ceph::osd_cmds
11463
11464
11465 std::ostream& operator<<(std::ostream& out, const io_queue& q) {
11466 switch(q) {
11467 case io_queue::prioritized:
11468 out << "prioritized";
11469 break;
11470 case io_queue::weightedpriority:
11471 out << "weightedpriority";
11472 break;
11473 case io_queue::mclock_opclass:
11474 out << "mclock_opclass";
11475 break;
11476 case io_queue::mclock_client:
11477 out << "mclock_client";
11478 break;
11479 }
11480 return out;
11481 }