// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/scoped_ptr.hpp>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MOSDPGScan.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"
#include "messages/MPGStatsAck.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"
#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}
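
// A note on the two sets above: "incompat" features make the on-disk
// state unreadable to OSD builds that do not understand them, so they
// effectively gate downgrades; compat/ro_compat features do not.
// SHARDS is only part of the runtime set returned by
// get_osd_compat_set(): as the comment says, it is set by code later
// rather than stamped into a brand-new superblock.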

OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, nullptr, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge).  note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}
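
// In short: starting from one PG, the loop above walks the recorded
// pg_num changes for its pool between the two map epochs and follows
// every PG reachable through splits and merges (the `did` set keeps the
// walk from revisiting a PG).  For example, with pg_num 4 -> 8 a PG
// such as 3.2 yields split child 3.6 at that epoch, and with 8 -> 4 the
// same pair is reported as merge source and target.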

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
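
// The agent thread always services the highest-priority tier ("level")
// first and throttles itself with two knobs: osd_agent_max_ops while
// any PG is in high-speed flush mode, osd_agent_max_low_ops otherwise.
// When a PG reports no work, a timer re-runs agent_choose_mode for it
// after osd_agent_delay_time seconds instead of busy-polling it.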

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
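
// Probabilities here are in units of 1/1000.  Rough worked example:
// with a target of 25 promoted objects/sec, a 10 s interval and 1000
// promote attempts in that interval, po = 25 * 10 * 1000 / 1000 = 250,
// i.e. roughly a 25% promote probability before the skew correction;
// the result is then averaged with the previous probability and
// clamped to [min_prob, 1000].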

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
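
// The checks above run from most to least severe, and the std::max()
// clamping guarantees nearfull <= backfillfull <= full <= failsafe even
// if the monitor publishes inconsistent ratios.  Note that FAILSAFE and
// NEARFULL are judged against the physical ratio (pratio) while FULL
// and BACKFILLFULL use the (possibly adjusted) logical ratio.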

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
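
// fake_statfs_for_testing replaces the reported device size with a
// configured value and derives "available" from the sum of per-PG byte
// counts, so several OSDs sharing one partition can still report
// plausible, independent utilization numbers in test clusters.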

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
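
// compute_adjusted_ratio() works on a copy of the stats: it subtracts
// the caller-supplied adjust_used bytes from "available" and then lets
// each PG add its pending backfill data via pg_stat_adjust(), so the
// returned ratio reflects projected usage (e.g. for tentative backfill
// checks) while *pratio stays the raw physical utilization.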

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
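
// pg_temp requests are batched into at most two MOSDPGTemp messages per
// call: ms[0] collects the normal entries and ms[1] the "forced" ones,
// with the flag copied into MOSDPGTemp::forced so the monitor can tell
// the two batches apart.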

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

bool OSDService::can_inc_scrubs()
{
  bool can_inc = false;
  std::lock_guard l(sched_scrub_lock);

  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
             << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
             << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_local()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
    result = true;
    ++scrubs_local;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);
}

bool OSDService::inc_scrubs_remote()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
    result = true;
    ++scrubs_remote;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);
}

void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
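
// Roughly: scrubs_local counts scrubs for which this OSD is the
// primary, while scrubs_remote counts reservations held for scrubs
// driven by other primaries; both draw from the same osd_max_scrubs
// budget, which is why every check above compares their sum against
// that single limit.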

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true  // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}
1380MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1381 OSDSuperblock& sblock)
1382{
28e407b8
AA
1383 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1384 osdmap->get_encoding_features());
7c673cae
FG
1385 m->oldest_map = max_oldest_map;
1386 m->newest_map = sblock.newest_map;
1387
11fdf7f2
TL
1388 int max = cct->_conf->osd_map_message_max;
1389 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1390
1391 if (since < m->oldest_map) {
1392 // we don't have the next map the target wants, so start with a
1393 // full map.
1394 bufferlist bl;
1395 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
1396 << since << ", starting with full map" << dendl;
1397 since = m->oldest_map;
1398 if (!get_map_bl(since, bl)) {
1399 derr << __func__ << " missing full map " << since << dendl;
1400 goto panic;
1401 }
1402 max--;
1403 max_bytes -= bl.length();
1404 m->maps[since].claim(bl);
1405 }
1406 for (epoch_t e = since + 1; e <= to; ++e) {
7c673cae 1407 bufferlist bl;
11fdf7f2 1408 if (get_inc_map_bl(e, bl)) {
7c673cae 1409 m->incremental_maps[e].claim(bl);
11fdf7f2
TL
1410 } else {
1411 derr << __func__ << " missing incremental map " << e << dendl;
1412 if (!get_map_bl(e, bl)) {
1413 derr << __func__ << " also missing full map " << e << dendl;
1414 goto panic;
1415 }
7c673cae 1416 m->maps[e].claim(bl);
11fdf7f2
TL
1417 }
1418 max--;
1419 max_bytes -= bl.length();
1420 if (max <= 0 || max_bytes <= 0) {
7c673cae 1421 break;
11fdf7f2
TL
1422 }
1423 }
1424 return m;
1425
1426 panic:
1427 if (!m->maps.empty() ||
1428 !m->incremental_maps.empty()) {
1429 // send what we have so far
1430 return m;
1431 }
1432 // send something
1433 bufferlist bl;
1434 if (get_inc_map_bl(m->newest_map, bl)) {
1435 m->incremental_maps[m->newest_map].claim(bl);
1436 } else {
1437 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1438 if (!get_map_bl(m->newest_map, bl)) {
1439 derr << __func__ << " unable to load latest full map " << m->newest_map
7c673cae 1440 << dendl;
11fdf7f2 1441 ceph_abort();
7c673cae 1442 }
11fdf7f2 1443 m->maps[m->newest_map].claim(bl);
7c673cae
FG
1444 }
1445 return m;
1446}
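
// The catch-up logic above prefers incremental maps and falls back to
// full maps when an incremental is missing; osd_map_message_max and
// osd_map_message_max_bytes cap a single MOSDMap message, and the
// "panic" path guarantees the peer still receives at least the newest
// map so it can make forward progress.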

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv,
                                vector<pg_log_op_return_item_t> op_returns)
{
  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
  reply->set_reply_versions(v, uv);
  reply->set_op_returns(op_returns);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
9f95a23c 1681void OSDService::enqueue_back(OpSchedulerItem&& qi)
7c673cae 1682{
11fdf7f2 1683 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1684}
1685
9f95a23c 1686void OSDService::enqueue_front(OpSchedulerItem&& qi)
7c673cae 1687{
11fdf7f2 1688 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1689}
1690
11fdf7f2
TL
1691void OSDService::queue_recovery_context(
1692 PG *pg,
1693 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1694{
11fdf7f2
TL
1695 epoch_t e = get_osdmap_epoch();
1696 enqueue_back(
9f95a23c
TL
1697 OpSchedulerItem(
1698 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1699 new PGRecoveryContext(pg->get_pgid(), c, e)),
1700 cct->_conf->osd_recovery_cost,
1701 cct->_conf->osd_recovery_priority,
1702 ceph_clock_now(),
1703 0,
1704 e));
7c673cae
FG
1705}
1706
1707void OSDService::queue_for_snap_trim(PG *pg)
1708{
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2 1710 enqueue_back(
9f95a23c
TL
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719}
1720
1721void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722{
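  // scrubs run at the PG's own scrub priority by default; a request flagged
  // high-priority is raised to at least osd_client_op_priority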
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
9f95a23c
TL
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
11fdf7f2
TL
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736}
1737
1738void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1739{
1740 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1741 enqueue_back(
9f95a23c
TL
1742 OpSchedulerItem(
1743 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1744 new PGDelete(pgid, e)),
1745 cct->_conf->osd_pg_delete_cost,
1746 cct->_conf->osd_pg_delete_priority,
1747 ceph_clock_now(),
1748 0,
1749 e));
1750}
1751
1752bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1753{
1754 return osd->try_finish_pg_delete(pg, old_pg_num);
1755}
1756
1757// ---
1758
1759void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760{
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766}
1767
1768void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772{
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781}
1782
1783void OSDService::set_not_ready_to_merge_source(pg_t source)
1784{
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790}
1791
1792void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793{
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799}
1800
1801void OSDService::send_ready_to_merge()
1802{
1803 std::lock_guard l(merge_lock);
1804 _send_ready_to_merge();
1805}
1806
1807void OSDService::_send_ready_to_merge()
1808{
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
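  // first tell the mon about participants that are known not to be ready:
  // a ready=false MOSDPGReadyToMerge is sent at most once per source, both
  // for each not-ready source and for the source of each not-ready target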
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
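  // then send ready=true only for sources whose target is also ready and
  // where neither side is flagged not-ready, carrying the source/target
  // versions and the target's last_epoch_started/last_epoch_clean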
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855}
1856
1857void OSDService::clear_ready_to_merge(PG *pg)
1858{
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866}
1867
1868void OSDService::clear_sent_ready_to_merge()
1869{
1870 std::lock_guard l(merge_lock);
1871 sent_ready_to_merge_source.clear();
1872}
1873
9f95a23c 1874void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
1875{
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
7c673cae
FG
1886}
1887
11fdf7f2
TL
1888// ---
1889
1890void OSDService::_queue_for_recovery(
1891 std::pair<epoch_t, PGRef> p,
1892 uint64_t reserved_pushes)
1893{
9f95a23c 1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
11fdf7f2 1895 enqueue_back(
9f95a23c
TL
1896 OpSchedulerItem(
1897 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1898 new PGRecovery(
1899 p.second->get_pgid(), p.first, reserved_pushes)),
1900 cct->_conf->osd_recovery_cost,
1901 cct->_conf->osd_recovery_priority,
1902 ceph_clock_now(),
1903 0,
1904 p.first));
1905}
7c673cae
FG
1906
1907// ====================================================================
1908// OSD
1909
1910#undef dout_prefix
1911#define dout_prefix *_dout
1912
1913// Commands shared between OSD's console and admin console:
1914namespace ceph {
1915namespace osd_cmds {
1916
11fdf7f2 1917int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
7c673cae
FG
1918
1919}} // namespace ceph::osd_cmds
1920
11fdf7f2 1921int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
7c673cae
FG
1922{
1923 int ret;
1924
7c673cae
FG
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
11fdf7f2 1927 ObjectStore::CollectionHandle ch;
7c673cae
FG
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
224ce89b
WB
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
7c673cae
FG
1936 goto free_store;
1937 }
1938
31f18b77 1939 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
1940
1941 ret = store->mount();
1942 if (ret) {
224ce89b
WB
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
7c673cae
FG
1945 goto free_store;
1946 }
1947
11fdf7f2
TL
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
7c673cae
FG
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
7c673cae
FG
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
11fdf7f2 1979 encode(sb, bl);
7c673cae 1980
11fdf7f2
TL
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
7c673cae
FG
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 1986 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
1990 goto umount_store;
1991 }
1992 }
1993
3efd9988 1994 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
7c673cae 1995 if (ret) {
224ce89b
WB
1996 derr << "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret) << dendl;
7c673cae
FG
1998 goto umount_store;
1999 }
2000
2001umount_store:
11fdf7f2
TL
2002 if (ch) {
2003 ch.reset();
2004 }
7c673cae
FG
2005 store->umount();
2006free_store:
2007 delete store;
2008 return ret;
2009}
2010
3efd9988 2011int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
7c673cae
FG
2012{
2013 char val[80];
2014 int r;
2015
2016 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2017 r = store->write_meta("magic", val);
2018 if (r < 0)
2019 return r;
2020
2021 snprintf(val, sizeof(val), "%d", whoami);
2022 r = store->write_meta("whoami", val);
2023 if (r < 0)
2024 return r;
2025
2026 cluster_fsid.print(val);
2027 r = store->write_meta("ceph_fsid", val);
2028 if (r < 0)
2029 return r;
2030
11fdf7f2 2031 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2032 if (key.size()) {
2033 r = store->write_meta("osd_key", key);
2034 if (r < 0)
2035 return r;
b32b8144 2036 } else {
11fdf7f2 2037 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2038 if (!keyfile.empty()) {
2039 bufferlist keybl;
2040 string err;
11fdf7f2 2041 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2042 if (r < 0) {
2043 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2044 << err << ": " << cpp_strerror(r) << dendl;
2045 return r;
2046 }
2047 r = store->write_meta("osd_key", keybl.to_str());
2048 if (r < 0)
2049 return r;
2050 }
3efd9988
FG
2051 }
2052
7c673cae
FG
2053 r = store->write_meta("ready", "ready");
2054 if (r < 0)
2055 return r;
2056
2057 return 0;
2058}
2059
11fdf7f2
TL
2060int OSD::peek_meta(ObjectStore *store,
2061 std::string *magic,
2062 uuid_d *cluster_fsid,
2063 uuid_d *osd_fsid,
2064 int *whoami,
9f95a23c 2065 ceph_release_t *require_osd_release)
7c673cae
FG
2066{
2067 string val;
2068
2069 int r = store->read_meta("magic", &val);
2070 if (r < 0)
2071 return r;
11fdf7f2 2072 *magic = val;
7c673cae
FG
2073
2074 r = store->read_meta("whoami", &val);
2075 if (r < 0)
2076 return r;
11fdf7f2 2077 *whoami = atoi(val.c_str());
7c673cae
FG
2078
2079 r = store->read_meta("ceph_fsid", &val);
2080 if (r < 0)
2081 return r;
11fdf7f2 2082 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2083 if (!r)
2084 return -EINVAL;
2085
2086 r = store->read_meta("fsid", &val);
2087 if (r < 0) {
11fdf7f2 2088 *osd_fsid = uuid_d();
7c673cae 2089 } else {
11fdf7f2 2090 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2091 if (!r)
2092 return -EINVAL;
2093 }
2094
11fdf7f2
TL
2095 r = store->read_meta("require_osd_release", &val);
2096 if (r >= 0) {
9f95a23c 2097 *require_osd_release = ceph_release_from_name(val);
11fdf7f2
TL
2098 }
2099
7c673cae
FG
2100 return 0;
2101}
2102
2103
2104#undef dout_prefix
2105#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2106
2107// cons/des
2108
2109OSD::OSD(CephContext *cct_, ObjectStore *store_,
2110 int id,
2111 Messenger *internal_messenger,
2112 Messenger *external_messenger,
2113 Messenger *hb_client_front,
2114 Messenger *hb_client_back,
2115 Messenger *hb_front_serverm,
2116 Messenger *hb_back_serverm,
2117 Messenger *osdc_messenger,
2118 MonClient *mc,
2119 const std::string &dev, const std::string &jdev) :
2120 Dispatcher(cct_),
7c673cae 2121 tick_timer(cct, osd_lock),
7c673cae 2122 tick_timer_without_osd_lock(cct, tick_timer_lock),
11fdf7f2 2123 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
7c673cae
FG
2124 cluster_messenger(internal_messenger),
2125 client_messenger(external_messenger),
2126 objecter_messenger(osdc_messenger),
2127 monc(mc),
9f95a23c 2128 mgrc(cct_, client_messenger, &mc->monmap),
7c673cae
FG
2129 logger(NULL),
2130 recoverystate_perf(NULL),
2131 store(store_),
2132 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2133 clog(log_client.create_channel()),
2134 whoami(id),
2135 dev_path(dev), journal_path(jdev),
31f18b77 2136 store_is_rotational(store->is_rotational()),
7c673cae
FG
2137 trace_endpoint("0.0.0.0", 0, "osd"),
2138 asok_hook(NULL),
11fdf7f2
TL
2139 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2140 "osd_pg_epoch_max_lag_factor")),
7c673cae 2141 osd_compat(get_osd_compat_set()),
7c673cae 2142 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 2143 get_num_op_threads()),
7c673cae
FG
2144 heartbeat_stop(false),
2145 heartbeat_need_update(true),
2146 hb_front_client_messenger(hb_client_front),
2147 hb_back_client_messenger(hb_client_back),
2148 hb_front_server_messenger(hb_front_serverm),
2149 hb_back_server_messenger(hb_back_serverm),
2150 daily_loadavg(0.0),
2151 heartbeat_thread(this),
2152 heartbeat_dispatcher(this),
2153 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2154 cct->_conf->osd_num_op_tracker_shard),
2155 test_ops_hook(NULL),
7c673cae 2156 op_shardedwq(
7c673cae
FG
2157 this,
2158 cct->_conf->osd_op_thread_timeout,
2159 cct->_conf->osd_op_thread_suicide_timeout,
2160 &osd_op_tp),
7c673cae 2161 last_pg_create_epoch(0),
11fdf7f2 2162 boot_finisher(cct),
7c673cae
FG
2163 up_thru_wanted(0),
2164 requested_full_first(0),
2165 requested_full_last(0),
7c673cae
FG
2166 service(this)
2167{
11fdf7f2
TL
2168
2169 if (!gss_ktfile_client.empty()) {
2170 // Assert we can export environment variable
2171 /*
2172 The default client keytab is used, if it is present and readable,
2173 to automatically obtain initial credentials for GSSAPI client
2174 applications. The principal name of the first entry in the client
2175 keytab is used by default when obtaining initial credentials.
2176 1. The KRB5_CLIENT_KTNAME environment variable.
2177 2. The default_client_keytab_name profile variable in [libdefaults].
2178 3. The hardcoded default, DEFCKTNAME.
2179 */
2180 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2181 gss_ktfile_client.c_str(), 1));
2182 ceph_assert(set_result == 0);
2183 }
2184
7c673cae
FG
2185 monc->set_messenger(client_messenger);
2186 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2187 cct->_conf->osd_op_log_threshold);
2188 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2189 cct->_conf->osd_op_history_duration);
2190 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2191 cct->_conf->osd_op_history_slow_op_threshold);
9f95a23c 2192 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
7c673cae
FG
2193#ifdef WITH_BLKIN
2194 std::stringstream ss;
2195 ss << "osd." << whoami;
2196 trace_endpoint.copy_name(ss.str());
2197#endif
11fdf7f2
TL
2198
2199 // initialize shards
2200 num_shards = get_num_op_shards();
2201 for (uint32_t i = 0; i < num_shards; i++) {
2202 OSDShard *one_shard = new OSDShard(
2203 i,
2204 cct,
9f95a23c 2205 this);
11fdf7f2
TL
2206 shards.push_back(one_shard);
2207 }
7c673cae
FG
2208}
2209
2210OSD::~OSD()
2211{
11fdf7f2
TL
2212 while (!shards.empty()) {
2213 delete shards.back();
2214 shards.pop_back();
2215 }
7c673cae
FG
2216 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2217 cct->get_perfcounters_collection()->remove(logger);
2218 delete recoverystate_perf;
2219 delete logger;
2220 delete store;
2221}
2222
91327a77
AA
2223double OSD::get_tick_interval() const
2224{
2225 // vary +/- 5% to avoid scrub scheduling livelocks
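  // e.g. assuming OSD_TICK_INTERVAL is its usual 1.0s, ticks land anywhere
  // in [0.95s, 1.05s]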
2226 constexpr auto delta = 0.05;
91327a77 2227 return (OSD_TICK_INTERVAL *
11fdf7f2 2228 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2229}
2230
7c673cae
FG
2231void OSD::handle_signal(int signum)
2232{
11fdf7f2 2233 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2234 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2235 shutdown();
2236}
2237
2238int OSD::pre_init()
2239{
11fdf7f2 2240 std::lock_guard lock(osd_lock);
7c673cae
FG
2241 if (is_stopping())
2242 return 0;
2243
2244 if (store->test_mount_in_use()) {
2245 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2246 << "currently in use. (Is ceph-osd already running?)" << dendl;
2247 return -EBUSY;
2248 }
2249
11fdf7f2
TL
2250 cct->_conf.add_observer(this);
2251 return 0;
2252}
2253
2254int OSD::set_numa_affinity()
2255{
2256 // storage numa node
2257 int store_node = -1;
2258 store->get_numa_node(&store_node, nullptr, nullptr);
2259 if (store_node >= 0) {
2260 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2261 }
2262
2263 // check network numa node(s)
2264 int front_node = -1, back_node = -1;
2265 string front_iface = pick_iface(
2266 cct,
2267 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2268 string back_iface = pick_iface(
2269 cct,
2270 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2271 int r = get_iface_numa_node(front_iface, &front_node);
92f5a8d4 2272 if (r >= 0 && front_node >= 0) {
11fdf7f2 2273 dout(1) << __func__ << " public network " << front_iface << " numa node "
92f5a8d4 2274 << front_node << dendl;
11fdf7f2 2275 r = get_iface_numa_node(back_iface, &back_node);
92f5a8d4 2276 if (r >= 0 && back_node >= 0) {
11fdf7f2
TL
2277 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2278 << back_node << dendl;
2279 if (front_node == back_node &&
2280 front_node == store_node) {
2281 dout(1) << " objectstore and network numa nodes all match" << dendl;
2282 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2283 numa_node = front_node;
2284 }
92f5a8d4
TL
2285 } else if (front_node != back_node) {
2286 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2287 << dendl;
11fdf7f2
TL
2288 } else {
2289 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2290 << dendl;
2291 }
92f5a8d4
TL
2292 } else if (back_node == -2) {
2293 dout(1) << __func__ << " cluster network " << back_iface
2294 << " ports numa nodes do not match" << dendl;
2295 } else {
2296 derr << __func__ << " unable to identify cluster interface '" << back_iface
2297 << "' numa node: " << cpp_strerror(r) << dendl;
11fdf7f2 2298 }
92f5a8d4
TL
2299 } else if (front_node == -2) {
2300 dout(1) << __func__ << " public network " << front_iface
2301 << " ports numa nodes do not match" << dendl;
11fdf7f2
TL
2302 } else {
2303 derr << __func__ << " unable to identify public interface '" << front_iface
2304 << "' numa node: " << cpp_strerror(r) << dendl;
2305 }
2306 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2307 // this takes precedence over the automagic logic above
2308 numa_node = node;
2309 }
2310 if (numa_node >= 0) {
2311 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2312 if (r < 0) {
2313 dout(1) << __func__ << " unable to determine numa node " << numa_node
2314 << " CPUs" << dendl;
2315 numa_node = -1;
2316 } else {
2317 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2318 << " cpus "
2319 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2320 << dendl;
92f5a8d4 2321 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
11fdf7f2
TL
2322 if (r < 0) {
2323 r = -errno;
2324 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2325 << dendl;
2326 numa_node = -1;
2327 }
2328 }
2329 } else {
2330 dout(1) << __func__ << " not setting numa affinity" << dendl;
2331 }
7c673cae
FG
2332 return 0;
2333}
2334
2335// asok
2336
2337class OSDSocketHook : public AdminSocketHook {
2338 OSD *osd;
2339public:
2340 explicit OSDSocketHook(OSD *o) : osd(o) {}
9f95a23c
TL
2341 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2342 Formatter *f,
2343 std::ostream& ss,
2344 bufferlist& out) override {
2345 ceph_abort("should use async hook");
2346 }
2347 void call_async(
2348 std::string_view prefix,
2349 const cmdmap_t& cmdmap,
2350 Formatter *f,
2351 const bufferlist& inbl,
2352 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
11fdf7f2 2353 try {
9f95a23c
TL
2354 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2355 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2356 bufferlist empty;
2357 on_finish(-EINVAL, e.what(), empty);
11fdf7f2 2358 }
7c673cae
FG
2359 }
2360};
2361
11fdf7f2
TL
2362std::set<int64_t> OSD::get_mapped_pools()
2363{
2364 std::set<int64_t> pools;
2365 std::vector<spg_t> pgids;
2366 _get_pgids(&pgids);
2367 for (const auto &pgid : pgids) {
2368 pools.insert(pgid.pool());
2369 }
2370 return pools;
2371}
2372
9f95a23c
TL
2373void OSD::asok_command(
2374 std::string_view prefix, const cmdmap_t& cmdmap,
2375 Formatter *f,
2376 const bufferlist& inbl,
2377 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2378{
9f95a23c
TL
2379 int ret = 0;
2380 stringstream ss; // stderr error message stream
2381 bufferlist outbl; // if empty at end, we'll dump formatter as output
2382
2383 // --- PG commands are routed here to PG::do_command ---
2384 if (prefix == "pg" ||
2385 prefix == "query" ||
2386 prefix == "mark_unfound_lost" ||
2387 prefix == "list_unfound" ||
2388 prefix == "scrub" ||
2389 prefix == "deep_scrub"
2390 ) {
2391 string pgidstr;
2392 pg_t pgid;
2393 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2394 ss << "no pgid specified";
2395 ret = -EINVAL;
2396 goto out;
2397 }
2398 if (!pgid.parse(pgidstr.c_str())) {
2399 ss << "couldn't parse pgid '" << pgidstr << "'";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 spg_t pcand;
2404 PGRef pg;
2405 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2406 (pg = _lookup_lock_pg(pcand))) {
2407 if (pg->is_primary()) {
2408 cmdmap_t new_cmdmap = cmdmap;
2409 try {
2410 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2411 pg->unlock();
2412 return; // the pg handler calls on_finish directly
2413 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2414 pg->unlock();
2415 ss << e.what();
2416 ret = -EINVAL;
2417 goto out;
2418 }
2419 } else {
2420 ss << "not primary for pgid " << pgid;
2421 // do not reply; they will get newer maps and realize they
2422 // need to resend.
2423 pg->unlock();
2424 ret = -EAGAIN;
2425 goto out;
2426 }
2427 } else {
2428 ss << "i don't have pgid " << pgid;
2429 ret = -ENOENT;
2430 }
2431 }
2432
2433 // --- OSD commands follow ---
2434
2435 else if (prefix == "status") {
2436 lock_guard l(osd_lock);
7c673cae
FG
2437 f->open_object_section("status");
2438 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2439 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2440 f->dump_unsigned("whoami", superblock.whoami);
2441 f->dump_string("state", get_state_name(get_state()));
2442 f->dump_unsigned("oldest_map", superblock.oldest_map);
2443 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2444 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2445 f->close_section();
9f95a23c 2446 } else if (prefix == "flush_journal") {
7c673cae 2447 store->flush_journal();
9f95a23c
TL
2448 } else if (prefix == "dump_ops_in_flight" ||
2449 prefix == "ops" ||
2450 prefix == "dump_blocked_ops" ||
2451 prefix == "dump_historic_ops" ||
2452 prefix == "dump_historic_ops_by_duration" ||
2453 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2454
2455 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2456even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2457will start to track new ops received afterwards.";
2458
2459 set<string> filters;
2460 vector<string> filter_str;
9f95a23c 2461 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2462 copy(filter_str.begin(), filter_str.end(),
2463 inserter(filters, filters.end()));
2464 }
2465
9f95a23c
TL
2466 if (prefix == "dump_ops_in_flight" ||
2467 prefix == "ops") {
c07f9fc5
FG
2468 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2469 ss << error_str;
9f95a23c
TL
2470 ret = -EINVAL;
2471 goto out;
c07f9fc5
FG
2472 }
2473 }
9f95a23c 2474 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2475 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2476 ss << error_str;
9f95a23c
TL
2477 ret = -EINVAL;
2478 goto out;
c07f9fc5
FG
2479 }
2480 }
9f95a23c 2481 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2482 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2483 ss << error_str;
9f95a23c
TL
2484 ret = -EINVAL;
2485 goto out;
c07f9fc5
FG
2486 }
2487 }
9f95a23c 2488 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2489 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2490 ss << error_str;
9f95a23c
TL
2491 ret = -EINVAL;
2492 goto out;
c07f9fc5
FG
2493 }
2494 }
9f95a23c 2495 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2496 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2497 ss << error_str;
9f95a23c
TL
2498 ret = -EINVAL;
2499 goto out;
c07f9fc5 2500 }
7c673cae 2501 }
9f95a23c 2502 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2503 f->open_object_section("pq");
2504 op_shardedwq.dump(f);
2505 f->close_section();
9f95a23c 2506 } else if (prefix == "dump_blacklist") {
7c673cae
FG
2507 list<pair<entity_addr_t,utime_t> > bl;
2508 OSDMapRef curmap = service.get_osdmap();
2509
2510 f->open_array_section("blacklist");
2511 curmap->get_blacklist(&bl);
2512 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2513 it != bl.end(); ++it) {
224ce89b 2514 f->open_object_section("entry");
7c673cae
FG
2515 f->open_object_section("entity_addr_t");
2516 it->first.dump(f);
2517 f->close_section(); //entity_addr_t
2518 it->second.localtime(f->dump_stream("expire_time"));
2519 f->close_section(); //entry
2520 }
2521 f->close_section(); //blacklist
9f95a23c 2522 } else if (prefix == "dump_watchers") {
7c673cae
FG
2523 list<obj_watch_item_t> watchers;
2524 // scan pg's
11fdf7f2
TL
2525 vector<PGRef> pgs;
2526 _get_pgs(&pgs);
2527 for (auto& pg : pgs) {
2528 list<obj_watch_item_t> pg_watchers;
2529 pg->get_watchers(&pg_watchers);
2530 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2531 }
2532
2533 f->open_array_section("watchers");
2534 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2535 it != watchers.end(); ++it) {
2536
224ce89b 2537 f->open_object_section("watch");
7c673cae
FG
2538
2539 f->dump_string("namespace", it->obj.nspace);
2540 f->dump_string("object", it->obj.oid.name);
2541
2542 f->open_object_section("entity_name");
2543 it->wi.name.dump(f);
2544 f->close_section(); //entity_name_t
2545
224ce89b
WB
2546 f->dump_unsigned("cookie", it->wi.cookie);
2547 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2548
2549 f->open_object_section("entity_addr_t");
2550 it->wi.addr.dump(f);
2551 f->close_section(); //entity_addr_t
2552
2553 f->close_section(); //watch
2554 }
2555
2556 f->close_section(); //watchers
9f95a23c 2557 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2558 f->open_object_section("reservations");
2559 f->open_object_section("local_reservations");
2560 service.local_reserver.dump(f);
2561 f->close_section();
2562 f->open_object_section("remote_reservations");
2563 service.remote_reserver.dump(f);
2564 f->close_section();
2565 f->close_section();
9f95a23c 2566 } else if (prefix == "dump_scrub_reservations") {
eafe8130
TL
2567 f->open_object_section("scrub_reservations");
2568 service.dump_scrub_reservations(f);
2569 f->close_section();
9f95a23c 2570 } else if (prefix == "get_latest_osdmap") {
7c673cae 2571 get_latest_osdmap();
9f95a23c 2572 } else if (prefix == "set_heap_property") {
7c673cae
FG
2573 string property;
2574 int64_t value = 0;
2575 string error;
2576 bool success = false;
9f95a23c 2577 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2578 error = "unable to get property";
2579 success = false;
9f95a23c 2580 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2581 error = "unable to get value";
2582 success = false;
2583 } else if (value < 0) {
2584 error = "negative value not allowed";
2585 success = false;
2586 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2587 error = "invalid property";
2588 success = false;
2589 } else {
2590 success = true;
2591 }
2592 f->open_object_section("result");
2593 f->dump_string("error", error);
2594 f->dump_bool("success", success);
2595 f->close_section();
9f95a23c 2596 } else if (prefix == "get_heap_property") {
7c673cae
FG
2597 string property;
2598 size_t value = 0;
2599 string error;
2600 bool success = false;
9f95a23c 2601 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2602 error = "unable to get property";
2603 success = false;
2604 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2605 error = "invalid property";
2606 success = false;
2607 } else {
2608 success = true;
2609 }
2610 f->open_object_section("result");
2611 f->dump_string("error", error);
2612 f->dump_bool("success", success);
2613 f->dump_int("value", value);
2614 f->close_section();
9f95a23c 2615 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2616 store->get_db_statistics(f);
9f95a23c 2617 } else if (prefix == "dump_scrubs") {
7c673cae 2618 service.dumps_scrub(f);
9f95a23c 2619 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2620 store->generate_db_histogram(f);
9f95a23c 2621 } else if (prefix == "flush_store_cache") {
11fdf7f2 2622 store->flush_cache(&ss);
9f95a23c 2623 } else if (prefix == "dump_pgstate_history") {
7c673cae 2624 f->open_object_section("pgstate_history");
9f95a23c 2625 f->open_array_section("pgs");
11fdf7f2
TL
2626 vector<PGRef> pgs;
2627 _get_pgs(&pgs);
2628 for (auto& pg : pgs) {
9f95a23c 2629 f->open_object_section("pg");
11fdf7f2 2630 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2631 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2632 pg->dump_pgstate_history(f);
9f95a23c 2633 f->close_section();
7c673cae
FG
2634 }
2635 f->close_section();
9f95a23c
TL
2636 f->close_section();
2637 } else if (prefix == "compact") {
224ce89b
WB
2638 dout(1) << "triggering manual compaction" << dendl;
2639 auto start = ceph::coarse_mono_clock::now();
2640 store->compact();
2641 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2642 double duration = std::chrono::duration<double>(end-start).count();
224ce89b 2643 dout(1) << "finished manual compaction in "
11fdf7f2 2644 << duration
224ce89b
WB
2645 << " seconds" << dendl;
2646 f->open_object_section("compact_result");
11fdf7f2
TL
2647 f->dump_float("elapsed_time", duration);
2648 f->close_section();
9f95a23c 2649 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2650 f->open_array_section("mapped_pools");
2651 set<int64_t> poollist = get_mapped_pools();
2652 for (auto pool : poollist) {
2653 f->dump_int("pool_id", pool);
2654 }
2655 f->close_section();
9f95a23c 2656 } else if (prefix == "smart") {
11fdf7f2 2657 string devid;
9f95a23c
TL
2658 cmd_getval(cmdmap, "devid", devid);
2659 ostringstream out;
2660 probe_smart(devid, out);
2661 outbl.append(out.str());
2662 } else if (prefix == "list_devices") {
11fdf7f2
TL
2663 set<string> devnames;
2664 store->get_devices(&devnames);
9f95a23c 2665 f->open_array_section("list_devices");
11fdf7f2
TL
2666 for (auto dev : devnames) {
2667 if (dev.find("dm-") == 0) {
2668 continue;
2669 }
9f95a23c
TL
2670 string err;
2671 f->open_object_section("device");
11fdf7f2 2672 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2673 f->dump_string("device_id", get_device_id(dev, &err));
2674 f->close_section();
11fdf7f2 2675 }
224ce89b 2676 f->close_section();
9f95a23c
TL
2677 } else if (prefix == "send_beacon") {
2678 lock_guard l(osd_lock);
11fdf7f2
TL
2679 if (is_active()) {
2680 send_beacon(ceph::coarse_mono_clock::now());
2681 }
9f95a23c
TL
2682 }
2683
2684 else if (prefix == "cluster_log") {
2685 vector<string> msg;
2686 cmd_getval(cmdmap, "message", msg);
2687 if (msg.empty()) {
2688 ret = -EINVAL;
2689 ss << "ignoring empty log message";
2690 goto out;
2691 }
2692 string message = msg.front();
2693 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2694 message += " " + *a;
2695 string lvl;
2696 cmd_getval(cmdmap, "level", lvl);
2697 clog_type level = string_to_clog_type(lvl);
2698 if (level < 0) {
2699 ret = -EINVAL;
2700 ss << "unknown level '" << lvl << "'";
2701 goto out;
2702 }
2703 clog->do_log(level, message);
2704 }
2705
2706 else if (prefix == "bench") {
2707 lock_guard l(osd_lock);
2708 int64_t count;
2709 int64_t bsize;
2710 int64_t osize, onum;
2711 // default count 1G, size 4MB
2712 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2713 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2714 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2715 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2716
2717 uint32_t duration = cct->_conf->osd_bench_duration;
2718
2719 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2720 // let us limit the block size because the next checks rely on it
2721 // having a sane value. If we allow any block size to be set things
2722 // can still go sideways.
2723 ss << "block 'size' values are capped at "
2724 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2725 << " a higher value, please adjust 'osd_bench_max_block_size'";
2726 ret = -EINVAL;
2727 goto out;
2728 } else if (bsize < (int64_t) (1 << 20)) {
2729 // entering the realm of small block sizes.
2730 // limit the count to a sane value, assuming a configurable amount of
2731 // IOPS and duration, so that the OSD doesn't get hung up on this,
2732 // preventing timeouts from going off
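    // e.g. assuming the default 30s duration and 100 IOPS
    // (osd_bench_small_size_max_iops), a 64 KiB block size caps 'count' at
    // 64 KiB * 30 * 100, roughly 187 MiB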
2733 int64_t max_count =
2734 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2735 if (count > max_count) {
2736 ss << "'count' values greater than " << max_count
2737 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2738 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2739 << " for " << duration << " seconds,"
2740 << " can cause ill effects on osd. "
2741 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2742 << " value if you wish to use a higher 'count'.";
2743 ret = -EINVAL;
2744 goto out;
eafe8130
TL
2745 }
2746 } else {
9f95a23c
TL
2747 // 1MB block sizes are big enough so that we get more stuff done.
2748 // However, to avoid the osd from getting hung on this and having
2749 // timers being triggered, we are going to limit the count assuming
2750 // a configurable throughput and duration.
2751 // NOTE: max_count is the total amount of bytes that we believe we
2752 // will be able to write during 'duration' for the given
2753 // throughput. The block size hardly impacts this unless it's
2754 // way too big. Given we already check how big the block size
2755 // is, it's safe to assume everything will check out.
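    // e.g. assuming the default osd_bench_large_size_max_throughput of
    // 100 MiB/s and a 30s duration, 'count' is capped at roughly 3 GiB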
2756 int64_t max_count =
2757 cct->_conf->osd_bench_large_size_max_throughput * duration;
2758 if (count > max_count) {
2759 ss << "'count' values greater than " << max_count
2760 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2761 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2762 << " for " << duration << " seconds,"
2763 << " can cause ill effects on osd. "
2764 << " Please adjust 'osd_bench_large_size_max_throughput'"
2765 << " with a higher value if you wish to use a higher 'count'.";
2766 ret = -EINVAL;
2767 goto out;
2768 }
eafe8130 2769 }
eafe8130 2770
9f95a23c
TL
2771 if (osize && bsize > osize)
2772 bsize = osize;
eafe8130 2773
9f95a23c
TL
2774 dout(1) << " bench count " << count
2775 << " bsize " << byte_u_t(bsize) << dendl;
eafe8130 2776
9f95a23c
TL
2777 ObjectStore::Transaction cleanupt;
2778
2779 if (osize && onum) {
2780 bufferlist bl;
2781 bufferptr bp(osize);
2782 bp.zero();
2783 bl.push_back(std::move(bp));
2784 bl.rebuild_page_aligned();
2785 for (int i=0; i<onum; ++i) {
2786 char nm[30];
2787 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2788 object_t oid(nm);
2789 hobject_t soid(sobject_t(oid, 0));
2790 ObjectStore::Transaction t;
2791 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2792 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2793 cleanupt.remove(coll_t(), ghobject_t(soid));
2794 }
2795 }
2796
2797 bufferlist bl;
2798 bufferptr bp(bsize);
2799 bp.zero();
2800 bl.push_back(std::move(bp));
2801 bl.rebuild_page_aligned();
2802
2803 {
2804 C_SaferCond waiter;
2805 if (!service.meta_ch->flush_commit(&waiter)) {
2806 waiter.wait();
2807 }
2808 }
2809
2810 utime_t start = ceph_clock_now();
2811 for (int64_t pos = 0; pos < count; pos += bsize) {
2812 char nm[30];
2813 unsigned offset = 0;
2814 if (onum && osize) {
2815 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2816 offset = rand() % (osize / bsize) * bsize;
2817 } else {
2818 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2819 }
2820 object_t oid(nm);
2821 hobject_t soid(sobject_t(oid, 0));
2822 ObjectStore::Transaction t;
2823 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2824 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2825 if (!onum || !osize)
2826 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2827 }
2828
2829 {
2830 C_SaferCond waiter;
2831 if (!service.meta_ch->flush_commit(&waiter)) {
2832 waiter.wait();
2833 }
2834 }
2835 utime_t end = ceph_clock_now();
2836
2837 // clean up
2838 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2839 {
2840 C_SaferCond waiter;
2841 if (!service.meta_ch->flush_commit(&waiter)) {
2842 waiter.wait();
2843 }
2844 }
2845
2846 double elapsed = end - start;
2847 double rate = count / elapsed;
2848 double iops = rate / bsize;
2849 f->open_object_section("osd_bench_results");
2850 f->dump_int("bytes_written", count);
2851 f->dump_int("blocksize", bsize);
2852 f->dump_float("elapsed_sec", elapsed);
2853 f->dump_float("bytes_per_sec", rate);
2854 f->dump_float("iops", iops);
2855 f->close_section();
2856 }
2857
2858 else if (prefix == "flush_pg_stats") {
2859 mgrc.send_pgstats();
2860 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2861 }
2862
2863 else if (prefix == "heap") {
2864 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2865 }
2866
2867 else if (prefix == "debug dump_missing") {
2868 f->open_array_section("pgs");
2869 vector<PGRef> pgs;
2870 _get_pgs(&pgs);
2871 for (auto& pg : pgs) {
2872 string s = stringify(pg->pg_id);
2873 f->open_array_section(s.c_str());
2874 pg->lock();
2875 pg->dump_missing(f);
2876 pg->unlock();
2877 f->close_section();
2878 }
2879 f->close_section();
2880 }
2881
2882 else if (prefix == "debug kick_recovery_wq") {
2883 int64_t delay;
2884 cmd_getval(cmdmap, "delay", delay);
2885 ostringstream oss;
2886 oss << delay;
2887 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2888 if (ret != 0) {
2889 ss << "kick_recovery_wq: error setting "
2890 << "osd_recovery_delay_start to '" << delay << "': error "
2891 << ret;
2892 goto out;
2893 }
2894 cct->_conf.apply_changes(nullptr);
2895 ss << "kicking recovery queue. set osd_recovery_delay_start "
2896 << "to " << cct->_conf->osd_recovery_delay_start;
2897 }
2898
2899 else if (prefix == "cpu_profiler") {
2900 ostringstream ds;
2901 string arg;
2902 cmd_getval(cmdmap, "arg", arg);
2903 vector<string> argvec;
2904 get_str_vec(arg, argvec);
2905 cpu_profiler_handle_command(argvec, ds);
2906 outbl.append(ds.str());
2907 }
2908
2909 else if (prefix == "dump_pg_recovery_stats") {
2910 lock_guard l(osd_lock);
2911 pg_recovery_stats.dump_formatted(f);
2912 }
2913
2914 else if (prefix == "reset_pg_recovery_stats") {
2915 lock_guard l(osd_lock);
2916 pg_recovery_stats.reset();
2917 }
2918
2919 else if (prefix == "perf histogram dump") {
2920 std::string logger;
2921 std::string counter;
2922 cmd_getval(cmdmap, "logger", logger);
2923 cmd_getval(cmdmap, "counter", counter);
2924 cct->get_perfcounters_collection()->dump_formatted_histograms(
2925 f, false, logger, counter);
2926 }
2927
2928 else if (prefix == "cache drop") {
2929 lock_guard l(osd_lock);
2930 dout(20) << "clearing all caches" << dendl;
2931 // Clear the objectstore's cache - onode and buffer for Bluestore,
2932 // system's pagecache for Filestore
2933 ret = store->flush_cache(&ss);
2934 if (ret < 0) {
2935 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2936 goto out;
2937 }
2938 // Clear the objectcontext cache (per PG)
2939 vector<PGRef> pgs;
2940 _get_pgs(&pgs);
2941 for (auto& pg: pgs) {
2942 pg->clear_cache();
2943 }
2944 }
2945
2946 else if (prefix == "cache status") {
2947 lock_guard l(osd_lock);
2948 int obj_ctx_count = 0;
2949 vector<PGRef> pgs;
2950 _get_pgs(&pgs);
2951 for (auto& pg: pgs) {
2952 obj_ctx_count += pg->get_cache_obj_count();
2953 }
2954 f->open_object_section("cache_status");
2955 f->dump_int("object_ctx", obj_ctx_count);
2956 store->dump_cache_stats(f);
2957 f->close_section();
2958 }
2959
2960 else if (prefix == "scrub_purged_snaps") {
2961 lock_guard l(osd_lock);
2962 scrub_purged_snaps();
2963 }
2964
2965 else if (prefix == "dump_osd_network") {
2966 lock_guard l(osd_lock);
2967 int64_t value = 0;
2968 if (!(cmd_getval(cmdmap, "value", value))) {
2969 // Convert milliseconds to microseconds
2970 value = static_cast<double>(g_conf().get_val<double>(
2971 "mon_warn_on_slow_ping_time")) * 1000;
2972 if (value == 0) {
2973 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2974 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2975 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2976 }
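    // e.g. assuming the default osd_heartbeat_grace of 20s and
    // mon_warn_on_slow_ping_ratio of 0.05, this works out to a 1s
    // (1,000,000 us) threshold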
2977 } else {
2978 // Convert user input to microseconds
2979 value *= 1000;
2980 }
2981 if (value < 0) value = 0;
2982
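    // per-peer heartbeat summary, ordered by worst recent (1/5/15 min)
    // average ping time; the set is walked in reverse below so the slowest
    // peers are reported first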
2983 struct osd_ping_time_t {
2984 uint32_t pingtime;
2985 int to;
2986 bool back;
2987 std::array<uint32_t,3> times;
2988 std::array<uint32_t,3> min;
2989 std::array<uint32_t,3> max;
2990 uint32_t last;
2991 uint32_t last_update;
2992
2993 bool operator<(const osd_ping_time_t& rhs) const {
2994 if (pingtime < rhs.pingtime)
2995 return true;
2996 if (pingtime > rhs.pingtime)
2997 return false;
2998 if (to < rhs.to)
2999 return true;
3000 if (to > rhs.to)
3001 return false;
3002 return back;
3003 }
3004 };
3005
3006 set<osd_ping_time_t> sorted;
3007 // Get pingtimes under lock and not on the stack
eafe8130
TL
3008 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3009 service.get_hb_pingtime(pingtimes);
3010 for (auto j : *pingtimes) {
3011 if (j.second.last_update == 0)
3012 continue;
3013 osd_ping_time_t item;
3014 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3015 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3016 if (item.pingtime >= value) {
3017 item.to = j.first;
3018 item.times[0] = j.second.back_pingtime[0];
3019 item.times[1] = j.second.back_pingtime[1];
3020 item.times[2] = j.second.back_pingtime[2];
3021 item.min[0] = j.second.back_min[0];
3022 item.min[1] = j.second.back_min[1];
3023 item.min[2] = j.second.back_min[2];
3024 item.max[0] = j.second.back_max[0];
3025 item.max[1] = j.second.back_max[1];
3026 item.max[2] = j.second.back_max[2];
3027 item.last = j.second.back_last;
3028 item.back = true;
3029 item.last_update = j.second.last_update;
3030 sorted.emplace(item);
3031 }
3032 if (j.second.front_last == 0)
3033 continue;
3034 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3035 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3036 if (item.pingtime >= value) {
3037 item.to = j.first;
3038 item.times[0] = j.second.front_pingtime[0];
3039 item.times[1] = j.second.front_pingtime[1];
3040 item.times[2] = j.second.front_pingtime[2];
3041 item.min[0] = j.second.front_min[0];
3042 item.min[1] = j.second.front_min[1];
3043 item.min[2] = j.second.front_min[2];
3044 item.max[0] = j.second.front_max[0];
3045 item.max[1] = j.second.front_max[1];
3046 item.max[2] = j.second.front_max[2];
3047 item.last = j.second.front_last;
3048 item.last_update = j.second.last_update;
3049 item.back = false;
3050 sorted.emplace(item);
3051 }
3052 }
3053 delete pingtimes;
3054 //
3055 // Network ping times (1min 5min 15min)
3056 f->open_object_section("network_ping_times");
3057 f->dump_int("threshold", value / 1000);
3058 f->open_array_section("entries");
3059 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3060 ceph_assert(sitem.pingtime >= value);
3061 f->open_object_section("entry");
3062
3063 const time_t lu(sitem.last_update);
3064 char buffer[26];
3065 string lustr(ctime_r(&lu, buffer));
3066 lustr.pop_back(); // Remove trailing \n
3067 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3068 f->dump_string("last update", lustr);
3069 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3070 f->dump_int("from osd", whoami);
3071 f->dump_int("to osd", sitem.to);
3072 f->dump_string("interface", (sitem.back ? "back" : "front"));
3073 f->open_object_section("average");
3074 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3075 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3076 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3077 f->close_section(); // average
3078 f->open_object_section("min");
3079 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3080 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3081 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3082 f->close_section(); // min
3083 f->open_object_section("max");
3084 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3085 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3086 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3087 f->close_section(); // max
3088 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3089 f->close_section(); // entry
3090 }
3091 f->close_section(); // entries
3092 f->close_section(); // network_ping_times
7c673cae 3093 } else {
11fdf7f2 3094 ceph_abort_msg("broken asok registration");
7c673cae 3095 }
9f95a23c
TL
3096
3097 out:
3098 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3099}
3100
3101class TestOpsSocketHook : public AdminSocketHook {
3102 OSDService *service;
3103 ObjectStore *store;
3104public:
3105 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c
TL
3106 int call(std::string_view command, const cmdmap_t& cmdmap,
3107 Formatter *f,
3108 std::ostream& errss,
3109 bufferlist& out) override {
3110 int r = 0;
3111 stringstream outss;
11fdf7f2 3112 try {
9f95a23c
TL
3113 test_ops(service, store, command, cmdmap, outss);
3114 out.append(outss);
3115 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3116 errss << e.what();
3117 r = -EINVAL;
11fdf7f2 3118 }
9f95a23c 3119 return r;
7c673cae
FG
3120 }
3121 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3122 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3123
3124};
3125
3126class OSD::C_Tick : public Context {
3127 OSD *osd;
3128 public:
3129 explicit C_Tick(OSD *o) : osd(o) {}
3130 void finish(int r) override {
3131 osd->tick();
3132 }
3133};
3134
3135class OSD::C_Tick_WithoutOSDLock : public Context {
3136 OSD *osd;
3137 public:
3138 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3139 void finish(int r) override {
3140 osd->tick_without_osd_lock();
3141 }
3142};
3143
3144int OSD::enable_disable_fuse(bool stop)
3145{
3146#ifdef HAVE_LIBFUSE
3147 int r;
3148 string mntpath = cct->_conf->osd_data + "/fuse";
3149 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3150 dout(1) << __func__ << " disabling" << dendl;
3151 fuse_store->stop();
3152 delete fuse_store;
3153 fuse_store = NULL;
3154 r = ::rmdir(mntpath.c_str());
7c673cae 3155 if (r < 0) {
c07f9fc5
FG
3156 r = -errno;
3157 derr << __func__ << " failed to rmdir " << mntpath << ": "
3158 << cpp_strerror(r) << dendl;
7c673cae
FG
3159 return r;
3160 }
3161 return 0;
3162 }
3163 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3164 dout(1) << __func__ << " enabling" << dendl;
3165 r = ::mkdir(mntpath.c_str(), 0700);
3166 if (r < 0)
3167 r = -errno;
3168 if (r < 0 && r != -EEXIST) {
3169 derr << __func__ << " unable to create " << mntpath << ": "
3170 << cpp_strerror(r) << dendl;
3171 return r;
3172 }
3173 fuse_store = new FuseStore(store, mntpath);
3174 r = fuse_store->start();
3175 if (r < 0) {
3176 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3177 delete fuse_store;
3178 fuse_store = NULL;
3179 return r;
3180 }
3181 }
3182#endif // HAVE_LIBFUSE
3183 return 0;
3184}
3185
9f95a23c
TL
3186size_t OSD::get_num_cache_shards()
3187{
3188 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3189}
3190
31f18b77
FG
3191int OSD::get_num_op_shards()
3192{
3193 if (cct->_conf->osd_op_num_shards)
3194 return cct->_conf->osd_op_num_shards;
3195 if (store_is_rotational)
3196 return cct->_conf->osd_op_num_shards_hdd;
3197 else
3198 return cct->_conf->osd_op_num_shards_ssd;
3199}
3200
3201int OSD::get_num_op_threads()
3202{
3203 if (cct->_conf->osd_op_num_threads_per_shard)
3204 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3205 if (store_is_rotational)
3206 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3207 else
3208 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3209}
3210
c07f9fc5
FG
3211float OSD::get_osd_recovery_sleep()
3212{
3213 if (cct->_conf->osd_recovery_sleep)
3214 return cct->_conf->osd_recovery_sleep;
d2e6a577 3215 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3216 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3217 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3218 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3219 else
3220 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3221}
3222
11fdf7f2
TL
3223float OSD::get_osd_delete_sleep()
3224{
3225 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3226 if (osd_delete_sleep > 0)
3227 return osd_delete_sleep;
3228 if (!store_is_rotational && !journal_is_rotational)
3229 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3230 if (store_is_rotational && !journal_is_rotational)
3231 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3232 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3233}
3234
9f95a23c
TL
3235int OSD::get_recovery_max_active()
3236{
3237 if (cct->_conf->osd_recovery_max_active)
3238 return cct->_conf->osd_recovery_max_active;
3239 if (store_is_rotational)
3240 return cct->_conf->osd_recovery_max_active_hdd;
3241 else
3242 return cct->_conf->osd_recovery_max_active_ssd;
3243}
3244
494da23a
TL
3245float OSD::get_osd_snap_trim_sleep()
3246{
3247 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3248 if (osd_snap_trim_sleep > 0)
3249 return osd_snap_trim_sleep;
3250 if (!store_is_rotational && !journal_is_rotational)
3251 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3252 if (store_is_rotational && !journal_is_rotational)
3253 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3254 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3255}
3256
7c673cae
FG
3257int OSD::init()
3258{
9f95a23c 3259 OSDMapRef osdmap;
7c673cae 3260 CompatSet initial, diff;
11fdf7f2 3261 std::lock_guard lock(osd_lock);
7c673cae
FG
3262 if (is_stopping())
3263 return 0;
3264
3265 tick_timer.init();
3266 tick_timer_without_osd_lock.init();
3267 service.recovery_request_timer.init();
11fdf7f2
TL
3268 service.sleep_timer.init();
3269
3270 boot_finisher.start();
3271
3272 {
3273 string val;
3274 store->read_meta("require_osd_release", &val);
9f95a23c 3275 last_require_osd_release = ceph_release_from_name(val);
11fdf7f2 3276 }
7c673cae
FG
3277
3278 // mount.
31f18b77
FG
3279 dout(2) << "init " << dev_path
3280 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3281 << dendl;
d2e6a577 3282 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 3283 ceph_assert(store); // call pre_init() first!
7c673cae 3284
9f95a23c 3285 store->set_cache_shards(get_num_cache_shards());
7c673cae
FG
3286
3287 int r = store->mount();
3288 if (r < 0) {
3289 derr << "OSD:init: unable to mount object store" << dendl;
3290 return r;
3291 }
d2e6a577
FG
3292 journal_is_rotational = store->is_journal_rotational();
3293 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3294 << dendl;
7c673cae
FG
3295
3296 enable_disable_fuse(false);
3297
3298 dout(2) << "boot" << dendl;
3299
11fdf7f2
TL
3300 service.meta_ch = store->open_collection(coll_t::meta());
3301
7c673cae
FG
3302 // initialize the daily loadavg with current 15min loadavg
3303 double loadavgs[3];
3304 if (getloadavg(loadavgs, 3) == 3) {
3305 daily_loadavg = loadavgs[2];
3306 } else {
3307 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3308 daily_loadavg = 1.0;
3309 }
3310
3311 int rotating_auth_attempts = 0;
11fdf7f2
TL
3312 auto rotating_auth_timeout =
3313 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
7c673cae
FG
3314
3315 // sanity check long object name handling
3316 {
3317 hobject_t l;
3318 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3319 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3320 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3321 r = store->validate_hobject_key(l);
3322 if (r < 0) {
3323 derr << "backend (" << store->get_type() << ") is unable to support max "
3324 << "object name[space] len" << dendl;
3325 derr << " osd max object name len = "
3326 << cct->_conf->osd_max_object_name_len << dendl;
3327 derr << " osd max object namespace len = "
3328 << cct->_conf->osd_max_object_namespace_len << dendl;
3329 derr << cpp_strerror(r) << dendl;
3330 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3331 goto out;
3332 }
3333 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3334 << dendl;
3335 } else {
3336 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3337 }
3338 }
3339
3340 // read superblock
3341 r = read_superblock();
3342 if (r < 0) {
3343 derr << "OSD::init() : unable to read osd superblock" << dendl;
3344 r = -EINVAL;
3345 goto out;
3346 }
3347
3348 if (osd_compat.compare(superblock.compat_features) < 0) {
3349 derr << "The disk uses features unsupported by the executable." << dendl;
3350 derr << " ondisk features " << superblock.compat_features << dendl;
3351 derr << " daemon features " << osd_compat << dendl;
3352
3353 if (osd_compat.writeable(superblock.compat_features)) {
3354 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3355 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3356 r = -EOPNOTSUPP;
3357 goto out;
3358 }
3359 else {
3360 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3361 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3362 r = -EOPNOTSUPP;
3363 goto out;
3364 }
3365 }
3366
3367 assert_warn(whoami == superblock.whoami);
3368 if (whoami != superblock.whoami) {
3369 derr << "OSD::init: superblock says osd"
3370 << superblock.whoami << " but I am osd." << whoami << dendl;
3371 r = -EINVAL;
3372 goto out;
3373 }
3374
9f95a23c
TL
3375 startup_time = ceph::mono_clock::now();
3376
11fdf7f2 3377 // load up "current" osdmap
9f95a23c
TL
3378 assert_warn(!get_osdmap());
3379 if (get_osdmap()) {
11fdf7f2
TL
3380 derr << "OSD::init: unable to read current osdmap" << dendl;
3381 r = -EINVAL;
3382 goto out;
3383 }
3384 osdmap = get_map(superblock.current_epoch);
9f95a23c 3385 set_osdmap(osdmap);
11fdf7f2
TL
3386
3387 // make sure we don't have legacy pgs deleting
3388 {
3389 vector<coll_t> ls;
3390 int r = store->list_collections(ls);
3391 ceph_assert(r >= 0);
3392 for (auto c : ls) {
3393 spg_t pgid;
3394 if (c.is_pg(&pgid) &&
3395 !osdmap->have_pg_pool(pgid.pool())) {
3396 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3397 if (!store->exists(service.meta_ch, oid)) {
3398 derr << __func__ << " missing pg_pool_t for deleted pool "
3399 << pgid.pool() << " for pg " << pgid
3400 << "; please downgrade to luminous and allow "
3401 << "pg deletion to complete before upgrading" << dendl;
3402 ceph_abort();
3403 }
3404 }
3405 }
3406 }
3407
7c673cae
FG
3408 initial = get_osd_initial_compat_set();
3409 diff = superblock.compat_features.unsupported(initial);
3410 if (superblock.compat_features.merge(initial)) {
9f95a23c
TL
3411 // Are we adding SNAPMAPPER2?
3412 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3413 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3414 << dendl;
3415 auto ch = service.meta_ch;
3416 auto hoid = make_snapmapper_oid();
3417 unsigned max = cct->_conf->osd_target_transaction_size;
3418 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3419 if (r < 0)
3420 goto out;
3421 }
7c673cae
FG
3422 // We need to persist the new compat_set before we
3423 // do anything else
3424 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3425 ObjectStore::Transaction t;
3426 write_superblock(t);
11fdf7f2 3427 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3428 if (r < 0)
3429 goto out;
3430 }
3431
3432 // make sure snap mapper object exists
11fdf7f2 3433 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3434 dout(10) << "init creating/touching snapmapper object" << dendl;
3435 ObjectStore::Transaction t;
3436 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3437 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3438 if (r < 0)
3439 goto out;
3440 }
9f95a23c
TL
3441 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3442 dout(10) << "init creating/touching purged_snaps object" << dendl;
3443 ObjectStore::Transaction t;
3444 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3445 r = store->queue_transaction(service.meta_ch, std::move(t));
3446 if (r < 0)
3447 goto out;
3448 }
7c673cae
FG
3449
3450 if (cct->_conf->osd_open_classes_on_start) {
9f95a23c 3451 int r = ClassHandler::get_instance().open_all_classes();
7c673cae
FG
3452 if (r)
3453 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3454 }
3455
11fdf7f2 3456 check_osdmap_features();
7c673cae
FG
3457
3458 create_recoverystate_perf();
3459
3460 {
3461 epoch_t bind_epoch = osdmap->get_epoch();
3462 service.set_epochs(NULL, NULL, &bind_epoch);
3463 }
3464
3465 clear_temp_objects();
3466
d2e6a577 3467 // initialize osdmap references in sharded wq
11fdf7f2
TL
3468 for (auto& shard : shards) {
3469 std::lock_guard l(shard->osdmap_lock);
3470 shard->shard_osdmap = osdmap;
3471 }
d2e6a577 3472
7c673cae
FG
3473 // load up pgs (as they previously existed)
3474 load_pgs();
3475
3476 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
7c673cae
FG
3477
3478 create_logger();
3479
11fdf7f2
TL
3480 // prime osd stats
3481 {
3482 struct store_statfs_t stbuf;
3483 osd_alert_list_t alerts;
3484 int r = store->statfs(&stbuf, &alerts);
3485 ceph_assert(r == 0);
3486 service.set_statfs(stbuf, alerts);
3487 }
3488
3489 // client_messenger auth_client is already set up by monc.
3490 for (auto m : { cluster_messenger,
3491 objecter_messenger,
3492 hb_front_client_messenger,
3493 hb_back_client_messenger,
3494 hb_front_server_messenger,
3495 hb_back_server_messenger } ) {
3496 m->set_auth_client(monc);
3497 }
3498 for (auto m : { client_messenger,
3499 cluster_messenger,
3500 hb_front_server_messenger,
3501 hb_back_server_messenger }) {
3502 m->set_auth_server(monc);
3503 }
3504 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3505
3506 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3507 | CEPH_ENTITY_TYPE_MGR);
3508 r = monc->init();
3509 if (r < 0)
3510 goto out;
3511
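  // hook the mgr client up to this OSD before init(): pg stat collection and
  // dynamic perf metric queries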
11fdf7f2
TL
3512 mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
3513 mgrc.set_perf_metric_query_cb(
9f95a23c
TL
3514 [this](const ConfigPayload &config_payload) {
3515 set_perf_queries(config_payload);
11fdf7f2 3516 },
9f95a23c
TL
3517 [this] {
3518 return get_perf_reports();
11fdf7f2 3519 });
7c673cae 3520 mgrc.init();
7c673cae
FG
3521
3522 // tell monc about log_client so it will know about mon session resets
3523 monc->set_log_client(&log_client);
3524 update_log_config();
3525
11fdf7f2
TL
3526 // i'm ready!
3527 client_messenger->add_dispatcher_tail(&mgrc);
3528 client_messenger->add_dispatcher_tail(this);
3529 cluster_messenger->add_dispatcher_head(this);
3530
3531 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3532 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3533 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3534 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3535
9f95a23c 3536 objecter_messenger->add_dispatcher_head(service.objecter.get());
11fdf7f2 3537
28e407b8
AA
3538 service.init();
3539 service.publish_map(osdmap);
3540 service.publish_superblock(superblock);
3541 service.max_oldest_map = superblock.oldest_map;
3542
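  // Walk every loaded PG and prime any splits or merges implied by the gap
  // between the PG's last-known map and the OSD's current map. prime_splits()
  // and prime_merges() consume their entries, so both sets end up empty.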
11fdf7f2
TL
3543 for (auto& shard : shards) {
3544 // put PGs in a temporary set because we may modify pg_slots
3545 // unordered_map below.
3546 set<PGRef> pgs;
3547 for (auto& i : shard->pg_slots) {
3548 PGRef pg = i.second->pg;
3549 if (!pg) {
3550 continue;
3551 }
3552 pgs.insert(pg);
3553 }
3554 for (auto pg : pgs) {
9f95a23c 3555 std::scoped_lock l{*pg};
11fdf7f2
TL
3556 set<pair<spg_t,epoch_t>> new_children;
3557 set<pair<spg_t,epoch_t>> merge_pgs;
3558 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3559 &new_children, &merge_pgs);
3560 if (!new_children.empty()) {
3561 for (auto shard : shards) {
3562 shard->prime_splits(osdmap, &new_children);
3563 }
3564 assert(new_children.empty());
3565 }
3566 if (!merge_pgs.empty()) {
3567 for (auto shard : shards) {
3568 shard->prime_merges(osdmap, &merge_pgs);
3569 }
3570 assert(merge_pgs.empty());
3571 }
11fdf7f2
TL
3572 }
3573 }
3574
7c673cae 3575 osd_op_tp.start();
7c673cae 3576
7c673cae
FG
3577 // start the heartbeat
3578 heartbeat_thread.create("osd_srv_heartbt");
3579
3580 // tick
91327a77
AA
3581 tick_timer.add_event_after(get_tick_interval(),
3582 new C_Tick(this));
7c673cae 3583 {
11fdf7f2 3584 std::lock_guard l(tick_timer_lock);
91327a77
AA
3585 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3586 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3587 }
3588
9f95a23c 3589 osd_lock.unlock();
7c673cae
FG
3590
3591 r = monc->authenticate();
3592 if (r < 0) {
c07f9fc5
FG
3593 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3594 << dendl;
11fdf7f2 3595 exit(1);
7c673cae
FG
3596 }
3597
11fdf7f2 3598 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3599 derr << "unable to obtain rotating service keys; retrying" << dendl;
3600 ++rotating_auth_attempts;
11fdf7f2 3601 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
c07f9fc5 3602 derr << __func__ << " wait_auth_rotating timed out" << dendl;
11fdf7f2 3603 exit(1);
7c673cae
FG
3604 }
3605 }
3606
3607 r = update_crush_device_class();
3608 if (r < 0) {
d2e6a577
FG
3609 derr << __func__ << " unable to update_crush_device_class: "
3610 << cpp_strerror(r) << dendl;
11fdf7f2 3611 exit(1);
7c673cae
FG
3612 }
3613
3614 r = update_crush_location();
3615 if (r < 0) {
d2e6a577 3616 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3617 << cpp_strerror(r) << dendl;
11fdf7f2 3618 exit(1);
7c673cae
FG
3619 }
3620
9f95a23c 3621 osd_lock.lock();
7c673cae
FG
3622 if (is_stopping())
3623 return 0;
3624
3625 // start objecter *after* we have authenticated, so that we don't ignore
3626 // the OSDMaps it requests.
3627 service.final_init();
3628
3629 check_config();
3630
3631 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3632 consume_map();
7c673cae
FG
3633
3634 dout(0) << "done with init, starting boot process" << dendl;
3635
3636 // subscribe to any pg creations
3637 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3638
3639 // MgrClient needs this (it doesn't have MonClient reference itself)
3640 monc->sub_want("mgrmap", 0, 0);
3641
3642 // we don't need to ask for an osdmap here; objecter will
3643 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3644
3645 monc->renew_subs();
3646
3647 start_boot();
3648
3649 return 0;
7c673cae
FG
3650
3651out:
3652 enable_disable_fuse(true);
3653 store->umount();
3654 delete store;
3655 store = NULL;
3656 return r;
3657}
3658
3659void OSD::final_init()
3660{
3661 AdminSocket *admin_socket = cct->get_admin_socket();
3662 asok_hook = new OSDSocketHook(this);
9f95a23c 3663 int r = admin_socket->register_command("status", asok_hook,
7c673cae 3664 "high-level status of OSD");
11fdf7f2 3665 ceph_assert(r == 0);
9f95a23c 3666 r = admin_socket->register_command("flush_journal",
7c673cae
FG
3667 asok_hook,
3668 "flush the journal to permanent store");
11fdf7f2 3669 ceph_assert(r == 0);
9f95a23c 3670 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
3671 "name=filterstr,type=CephString,n=N,req=false",
3672 asok_hook,
7c673cae 3673 "show the ops currently in flight");
11fdf7f2 3674 ceph_assert(r == 0);
9f95a23c 3675 r = admin_socket->register_command("ops " \
c07f9fc5
FG
3676 "name=filterstr,type=CephString,n=N,req=false",
3677 asok_hook,
7c673cae 3678 "show the ops currently in flight");
11fdf7f2 3679 ceph_assert(r == 0);
9f95a23c 3680 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
3681 "name=filterstr,type=CephString,n=N,req=false",
3682 asok_hook,
7c673cae 3683 "show the blocked ops currently in flight");
11fdf7f2 3684 ceph_assert(r == 0);
9f95a23c 3685 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 3686 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3687 asok_hook,
3688 "show recent ops");
11fdf7f2 3689 ceph_assert(r == 0);
9f95a23c 3690 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 3691 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3692 asok_hook,
3693 "show slowest recent ops");
11fdf7f2 3694 ceph_assert(r == 0);
9f95a23c 3695 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 3696 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3697 asok_hook,
3698 "show slowest recent ops, sorted by duration");
11fdf7f2 3699 ceph_assert(r == 0);
9f95a23c 3700 r = admin_socket->register_command("dump_op_pq_state",
7c673cae
FG
3701 asok_hook,
3702 "dump op priority queue state");
11fdf7f2 3703 ceph_assert(r == 0);
9f95a23c 3704 r = admin_socket->register_command("dump_blacklist",
7c673cae
FG
3705 asok_hook,
3706 "dump blacklisted clients and times");
11fdf7f2 3707 ceph_assert(r == 0);
9f95a23c 3708 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
3709 asok_hook,
3710 "show clients which have active watches,"
3711 " and on which objects");
11fdf7f2 3712 ceph_assert(r == 0);
9f95a23c 3713 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
3714 asok_hook,
3715 "show recovery reservations");
11fdf7f2 3716 ceph_assert(r == 0);
9f95a23c 3717 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 3718 asok_hook,
9f95a23c 3719 "show scrub reservations");
eafe8130 3720 ceph_assert(r == 0);
9f95a23c 3721 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
3722 asok_hook,
3723 "force osd to update the latest map from "
3724 "the mon");
11fdf7f2 3725 ceph_assert(r == 0);
7c673cae 3726
9f95a23c 3727 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
3728 "name=property,type=CephString " \
3729 "name=value,type=CephInt",
3730 asok_hook,
3731 "update malloc extension heap property");
11fdf7f2 3732 ceph_assert(r == 0);
7c673cae 3733
9f95a23c 3734 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
3735 "name=property,type=CephString",
3736 asok_hook,
3737 "get malloc extension heap property");
11fdf7f2 3738 ceph_assert(r == 0);
7c673cae
FG
3739
3740 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
3741 asok_hook,
3742 "print statistics of kvdb which used by bluestore");
11fdf7f2 3743 ceph_assert(r == 0);
7c673cae
FG
3744
3745 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
3746 asok_hook,
3747 "print scheduled scrubs");
11fdf7f2 3748 ceph_assert(r == 0);
7c673cae
FG
3749
3750 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
3751 asok_hook,
3752 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3753 ceph_assert(r == 0);
7c673cae
FG
3754
3755 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
3756 asok_hook,
3757 "Flush bluestore internal cache");
11fdf7f2 3758 ceph_assert(r == 0);
9f95a23c 3759 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
3760 asok_hook,
3761 "show recent state history");
11fdf7f2 3762 ceph_assert(r == 0);
7c673cae 3763
9f95a23c 3764 r = admin_socket->register_command("compact",
224ce89b
WB
3765 asok_hook,
3766 "Commpact object store's omap."
3767 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3768 ceph_assert(r == 0);
3769
9f95a23c 3770 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
3771 asok_hook,
3772 "dump pools whose PG(s) are mapped to this OSD.");
3773
3774 ceph_assert(r == 0);
3775
9f95a23c 3776 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
3777 asok_hook,
3778 "probe OSD devices for SMART data.");
3779
3780 ceph_assert(r == 0);
3781
9f95a23c 3782 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
3783 asok_hook,
3784 "list OSD devices.");
9f95a23c 3785 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
3786 asok_hook,
3787 "send OSD beacon to mon immediately");
224ce89b 3788
9f95a23c
TL
3789 r = admin_socket->register_command(
3790 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3791 "Dump osd heartbeat network ping times");
eafe8130
TL
3792 ceph_assert(r == 0);
3793
7c673cae
FG
3794 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3795 // Note: pools are CephString instead of CephPoolname because
3796 // these commands traditionally support both pool names and numbers
3797 r = admin_socket->register_command(
7c673cae
FG
3798 "setomapval " \
3799 "name=pool,type=CephString " \
3800 "name=objname,type=CephObjectname " \
3801 "name=key,type=CephString "\
3802 "name=val,type=CephString",
3803 test_ops_hook,
3804 "set omap key");
11fdf7f2 3805 ceph_assert(r == 0);
7c673cae 3806 r = admin_socket->register_command(
7c673cae
FG
3807 "rmomapkey " \
3808 "name=pool,type=CephString " \
3809 "name=objname,type=CephObjectname " \
3810 "name=key,type=CephString",
3811 test_ops_hook,
3812 "remove omap key");
11fdf7f2 3813 ceph_assert(r == 0);
7c673cae 3814 r = admin_socket->register_command(
7c673cae
FG
3815 "setomapheader " \
3816 "name=pool,type=CephString " \
3817 "name=objname,type=CephObjectname " \
3818 "name=header,type=CephString",
3819 test_ops_hook,
3820 "set omap header");
11fdf7f2 3821 ceph_assert(r == 0);
7c673cae
FG
3822
3823 r = admin_socket->register_command(
7c673cae
FG
3824 "getomap " \
3825 "name=pool,type=CephString " \
3826 "name=objname,type=CephObjectname",
3827 test_ops_hook,
3828 "output entire object map");
11fdf7f2 3829 ceph_assert(r == 0);
7c673cae
FG
3830
3831 r = admin_socket->register_command(
7c673cae
FG
3832 "truncobj " \
3833 "name=pool,type=CephString " \
3834 "name=objname,type=CephObjectname " \
3835 "name=len,type=CephInt",
3836 test_ops_hook,
3837 "truncate object to length");
11fdf7f2 3838 ceph_assert(r == 0);
7c673cae
FG
3839
3840 r = admin_socket->register_command(
7c673cae
FG
3841 "injectdataerr " \
3842 "name=pool,type=CephString " \
3843 "name=objname,type=CephObjectname " \
3844 "name=shardid,type=CephInt,req=false,range=0|255",
3845 test_ops_hook,
3846 "inject data error to an object");
11fdf7f2 3847 ceph_assert(r == 0);
7c673cae
FG
3848
3849 r = admin_socket->register_command(
7c673cae
FG
3850 "injectmdataerr " \
3851 "name=pool,type=CephString " \
3852 "name=objname,type=CephObjectname " \
3853 "name=shardid,type=CephInt,req=false,range=0|255",
3854 test_ops_hook,
3855 "inject metadata error to an object");
11fdf7f2 3856 ceph_assert(r == 0);
7c673cae 3857 r = admin_socket->register_command(
7c673cae
FG
3858 "set_recovery_delay " \
3859 "name=utime,type=CephInt,req=false",
3860 test_ops_hook,
3861 "Delay osd recovery by specified seconds");
11fdf7f2 3862 ceph_assert(r == 0);
7c673cae 3863 r = admin_socket->register_command(
7c673cae
FG
3864 "injectfull " \
3865 "name=type,type=CephString,req=false " \
3866 "name=count,type=CephInt,req=false ",
3867 test_ops_hook,
3868 "Inject a full disk (optional count times)");
11fdf7f2 3869 ceph_assert(r == 0);
9f95a23c
TL
3870 r = admin_socket->register_command(
3871 "bench " \
3872 "name=count,type=CephInt,req=false " \
3873 "name=size,type=CephInt,req=false " \
3874 "name=object_size,type=CephInt,req=false " \
3875 "name=object_num,type=CephInt,req=false ",
3876 asok_hook,
3877 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3878 "(default count=1G default size=4MB). Results in log.");
3879 ceph_assert(r == 0);
3880 r = admin_socket->register_command(
3881 "cluster_log " \
3882 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3883 "name=message,type=CephString,n=N",
3884 asok_hook,
3885 "log a message to the cluster log");
3886 ceph_assert(r == 0);
3887 r = admin_socket->register_command(
3888 "flush_pg_stats",
3889 asok_hook,
3890 "flush pg stats");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command(
3893 "heap " \
3894 "name=heapcmd,type=CephChoices,strings=" \
3895 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3896 "name=value,type=CephString,req=false",
3897 asok_hook,
3898 "show heap usage info (available only if compiled with tcmalloc)");
3899 ceph_assert(r == 0);
3900 r = admin_socket->register_command(
3901 "debug dump_missing " \
3902 "name=filename,type=CephFilepath",
3903 asok_hook,
3904 "dump missing objects to a named file");
3905 ceph_assert(r == 0);
3906 r = admin_socket->register_command(
3907 "debug kick_recovery_wq " \
3908 "name=delay,type=CephInt,range=0",
3909 asok_hook,
3910 "set osd_recovery_delay_start to <val>");
3911 ceph_assert(r == 0);
3912 r = admin_socket->register_command(
3913 "cpu_profiler " \
3914 "name=arg,type=CephChoices,strings=status|flush",
3915 asok_hook,
3916 "run cpu profiling on daemon");
3917 ceph_assert(r == 0);
3918 r = admin_socket->register_command(
3919 "dump_pg_recovery_stats",
3920 asok_hook,
3921 "dump pg recovery statistics");
3922 ceph_assert(r == 0);
3923 r = admin_socket->register_command(
3924 "reset_pg_recovery_stats",
3925 asok_hook,
3926 "reset pg recovery statistics");
3927 ceph_assert(r == 0);
3928 r = admin_socket->register_command(
3929 "cache drop",
3930 asok_hook,
3931 "Drop all OSD caches");
3932 ceph_assert(r == 0);
3933 r = admin_socket->register_command(
3934 "cache status",
3935 asok_hook,
3936 "Get OSD caches statistics");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command(
3939 "scrub_purged_snaps",
3940 asok_hook,
3941 "Scrub purged_snaps vs snapmapper index");
3942 ceph_assert(r == 0);
7c673cae 3943
9f95a23c
TL
3944 // -- pg commands --
3945 // old form: ceph pg <pgid> command ...
3946 r = admin_socket->register_command(
3947 "pg " \
3948 "name=pgid,type=CephPgid " \
3949 "name=cmd,type=CephChoices,strings=query",
3950 asok_hook,
3951 "");
3952 ceph_assert(r == 0);
3953 r = admin_socket->register_command(
3954 "pg " \
3955 "name=pgid,type=CephPgid " \
3956 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3957 "name=mulcmd,type=CephChoices,strings=revert|delete",
3958 asok_hook,
3959 "");
3960 ceph_assert(r == 0);
3961 r = admin_socket->register_command(
3962 "pg " \
3963 "name=pgid,type=CephPgid " \
3964 "name=cmd,type=CephChoices,strings=list_unfound " \
3965 "name=offset,type=CephString,req=false",
3966 asok_hook,
3967 "");
3968 ceph_assert(r == 0);
3969 r = admin_socket->register_command(
3970 "pg " \
3971 "name=pgid,type=CephPgid " \
3972 "name=cmd,type=CephChoices,strings=scrub " \
3973 "name=time,type=CephInt,req=false",
3974 asok_hook,
3975 "");
3976 ceph_assert(r == 0);
3977 r = admin_socket->register_command(
3978 "pg " \
3979 "name=pgid,type=CephPgid " \
3980 "name=cmd,type=CephChoices,strings=deep_scrub " \
3981 "name=time,type=CephInt,req=false",
3982 asok_hook,
3983 "");
3984 ceph_assert(r == 0);
3985 // new form: tell <pgid> <cmd> for both cli and rest
3986 r = admin_socket->register_command(
3987 "query",
3988 asok_hook,
3989 "show details of a specific pg");
3990 ceph_assert(r == 0);
3991 r = admin_socket->register_command(
3992 "mark_unfound_lost " \
3993 "name=pgid,type=CephPgid,req=false " \
3994 "name=mulcmd,type=CephChoices,strings=revert|delete",
3995 asok_hook,
3996 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
3997 ceph_assert(r == 0);
3998 r = admin_socket->register_command(
3999 "list_unfound " \
4000 "name=pgid,type=CephPgid,req=false " \
4001 "name=offset,type=CephString,req=false",
4002 asok_hook,
4003 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4004 ceph_assert(r == 0);
4005 r = admin_socket->register_command(
4006 "scrub " \
4007 "name=pgid,type=CephPgid,req=false " \
4008 "name=time,type=CephInt,req=false",
4009 asok_hook,
4010 "Trigger a scheduled scrub ");
4011 ceph_assert(r == 0);
4012 r = admin_socket->register_command(
4013 "deep_scrub " \
4014 "name=pgid,type=CephPgid,req=false " \
4015 "name=time,type=CephInt,req=false",
4016 asok_hook,
4017 "Trigger a scheduled deep scrub ");
4018 ceph_assert(r == 0);
4019}
7c673cae 4020
9f95a23c
TL
4021void OSD::create_logger()
4022{
4023 dout(10) << "create_logger" << dendl;
7c673cae 4024
9f95a23c 4025 logger = build_osd_logger(cct);
7c673cae
FG
4026 cct->get_perfcounters_collection()->add(logger);
4027}
4028
4029void OSD::create_recoverystate_perf()
4030{
4031 dout(10) << "create_recoverystate_perf" << dendl;
4032
9f95a23c 4033 recoverystate_perf = build_recoverystate_perf(cct);
7c673cae
FG
4034 cct->get_perfcounters_collection()->add(recoverystate_perf);
4035}
4036
4037int OSD::shutdown()
4038{
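  // Fast shutdown skips the orderly teardown below entirely: flush the log
  // and exit immediately.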
92f5a8d4
TL
4039 if (cct->_conf->osd_fast_shutdown) {
4040 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4041 cct->_log->flush();
4042 _exit(0);
4043 }
4044
7c673cae
FG
4045 if (!service.prepare_to_stop())
4046 return 0; // already shutting down
9f95a23c 4047 osd_lock.lock();
7c673cae 4048 if (is_stopping()) {
9f95a23c 4049 osd_lock.unlock();
7c673cae
FG
4050 return 0;
4051 }
11fdf7f2 4052 dout(0) << "shutdown" << dendl;
7c673cae
FG
4053
4054 set_state(STATE_STOPPING);
4055
4056 // Debugging
11fdf7f2
TL
4057 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4058 cct->_conf.set_val("debug_osd", "100");
4059 cct->_conf.set_val("debug_journal", "100");
4060 cct->_conf.set_val("debug_filestore", "100");
4061 cct->_conf.set_val("debug_bluestore", "100");
4062 cct->_conf.set_val("debug_ms", "100");
4063 cct->_conf.apply_changes(nullptr);
3efd9988 4064 }
7c673cae
FG
4065
4066 // stop MgrClient earlier as it's more like an internal consumer of OSD
4067 mgrc.shutdown();
4068
4069 service.start_shutdown();
4070
4071 // stop sending work to pgs. this just prevents any new work in _process
4072 // from racing with on_shutdown and potentially entering the pg after.
4073 op_shardedwq.drain();
4074
4075 // Shutdown PGs
4076 {
11fdf7f2
TL
4077 vector<PGRef> pgs;
4078 _get_pgs(&pgs);
4079 for (auto pg : pgs) {
4080 pg->shutdown();
7c673cae
FG
4081 }
4082 }
7c673cae
FG
4083
4084 // drain op queue again (in case PGs requeued something)
4085 op_shardedwq.drain();
4086 {
4087 finished.clear(); // zap waiters (bleh, this is messy)
11fdf7f2 4088 waiting_for_osdmap.clear();
7c673cae
FG
4089 }
4090
7c673cae 4091 // unregister commands
11fdf7f2 4092 cct->get_admin_socket()->unregister_commands(asok_hook);
7c673cae
FG
4093 delete asok_hook;
4094 asok_hook = NULL;
4095
11fdf7f2 4096 cct->get_admin_socket()->unregister_commands(test_ops_hook);
7c673cae
FG
4097 delete test_ops_hook;
4098 test_ops_hook = NULL;
4099
9f95a23c 4100 osd_lock.unlock();
7c673cae 4101
9f95a23c
TL
4102 {
4103 std::lock_guard l{heartbeat_lock};
4104 heartbeat_stop = true;
4105 heartbeat_cond.notify_all();
4106 heartbeat_peers.clear();
4107 }
7c673cae
FG
4108 heartbeat_thread.join();
4109
9f95a23c
TL
4110 hb_back_server_messenger->mark_down_all();
4111 hb_front_server_messenger->mark_down_all();
4112 hb_front_client_messenger->mark_down_all();
4113 hb_back_client_messenger->mark_down_all();
4114
7c673cae
FG
4115 osd_op_tp.drain();
4116 osd_op_tp.stop();
4117 dout(10) << "op sharded tp stopped" << dendl;
4118
7c673cae
FG
4119 dout(10) << "stopping agent" << dendl;
4120 service.agent_stop();
4121
11fdf7f2
TL
4122 boot_finisher.wait_for_empty();
4123
9f95a23c 4124 osd_lock.lock();
7c673cae 4125
11fdf7f2 4126 boot_finisher.stop();
494da23a 4127 reset_heartbeat_peers(true);
7c673cae
FG
4128
4129 tick_timer.shutdown();
4130
4131 {
11fdf7f2 4132 std::lock_guard l(tick_timer_lock);
7c673cae
FG
4133 tick_timer_without_osd_lock.shutdown();
4134 }
4135
4136 // note unmount epoch
9f95a23c 4137 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
7c673cae 4138 superblock.mounted = service.get_boot_epoch();
9f95a23c 4139 superblock.clean_thru = get_osdmap_epoch();
7c673cae
FG
4140 ObjectStore::Transaction t;
4141 write_superblock(t);
11fdf7f2 4142 int r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4143 if (r) {
4144 derr << "OSD::shutdown: error writing superblock: "
4145 << cpp_strerror(r) << dendl;
4146 }
4147
4148
31f18b77
FG
4149 service.shutdown_reserver();
4150
7c673cae
FG
4151 // Remove PGs
4152#ifdef PG_DEBUG_REFS
4153 service.dump_live_pgids();
4154#endif
11fdf7f2
TL
4155 while (true) {
4156 vector<PGRef> pgs;
4157 _get_pgs(&pgs, true);
4158 if (pgs.empty()) {
4159 break;
4160 }
4161 for (auto& pg : pgs) {
4162 if (pg->is_deleted()) {
4163 continue;
4164 }
4165 dout(20) << " kicking pg " << pg << dendl;
4166 pg->lock();
4167 if (pg->get_num_ref() != 1) {
4168 derr << "pgid " << pg->get_pgid() << " has ref count of "
4169 << pg->get_num_ref() << dendl;
7c673cae 4170#ifdef PG_DEBUG_REFS
11fdf7f2 4171 pg->dump_live_ids();
7c673cae 4172#endif
31f18b77
FG
4173 if (cct->_conf->osd_shutdown_pgref_assert) {
4174 ceph_abort();
4175 }
7c673cae 4176 }
11fdf7f2
TL
4177 pg->ch.reset();
4178 pg->unlock();
7c673cae 4179 }
7c673cae
FG
4180 }
4181#ifdef PG_DEBUG_REFS
4182 service.dump_live_pgids();
4183#endif
f64942e4 4184
9f95a23c 4185 osd_lock.unlock();
11fdf7f2 4186 cct->_conf.remove_observer(this);
9f95a23c 4187 osd_lock.lock();
7c673cae 4188
11fdf7f2
TL
4189 service.meta_ch.reset();
4190
7c673cae
FG
4191 dout(10) << "syncing store" << dendl;
4192 enable_disable_fuse(true);
4193
4194 if (cct->_conf->osd_journal_flush_on_shutdown) {
4195 dout(10) << "flushing journal" << dendl;
4196 store->flush_journal();
4197 }
4198
7c673cae 4199 monc->shutdown();
9f95a23c
TL
4200 osd_lock.unlock();
4201 {
4202 std::unique_lock l{map_lock};
4203 set_osdmap(OSDMapRef());
4204 }
11fdf7f2
TL
4205 for (auto s : shards) {
4206 std::lock_guard l(s->osdmap_lock);
4207 s->shard_osdmap = OSDMapRef();
4208 }
7c673cae 4209 service.shutdown();
11fdf7f2
TL
4210
4211 std::lock_guard lock(osd_lock);
4212 store->umount();
4213 delete store;
4214 store = nullptr;
4215 dout(10) << "Store synced" << dendl;
4216
7c673cae
FG
4217 op_tracker.on_shutdown();
4218
9f95a23c 4219 ClassHandler::get_instance().shutdown();
7c673cae
FG
4220 client_messenger->shutdown();
4221 cluster_messenger->shutdown();
4222 hb_front_client_messenger->shutdown();
4223 hb_back_client_messenger->shutdown();
4224 objecter_messenger->shutdown();
4225 hb_front_server_messenger->shutdown();
4226 hb_back_server_messenger->shutdown();
4227
7c673cae
FG
4228 return r;
4229}
4230
4231int OSD::mon_cmd_maybe_osd_create(string &cmd)
4232{
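  // Send the mon command; if it fails with ENOENT because this OSD id does
  // not exist yet, issue "osd create" once and then retry the original command.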
4233 bool created = false;
4234 while (true) {
4235 dout(10) << __func__ << " cmd: " << cmd << dendl;
4236 vector<string> vcmd{cmd};
4237 bufferlist inbl;
4238 C_SaferCond w;
4239 string outs;
4240 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4241 int r = w.wait();
4242 if (r < 0) {
4243 if (r == -ENOENT && !created) {
4244 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4245 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4246 vector<string> vnewcmd{newcmd};
4247 bufferlist inbl;
4248 C_SaferCond w;
4249 string outs;
4250 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4251 int r = w.wait();
4252 if (r < 0) {
4253 derr << __func__ << " fail: osd does not exist and created failed: "
4254 << cpp_strerror(r) << dendl;
4255 return r;
4256 }
4257 created = true;
4258 continue;
4259 }
4260 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4261 return r;
4262 }
4263 break;
4264 }
4265
4266 return 0;
4267}
4268
4269int OSD::update_crush_location()
4270{
4271 if (!cct->_conf->osd_crush_update_on_start) {
4272 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4273 return 0;
4274 }
4275
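  // Use the configured initial weight if set; otherwise derive it from the
  // store capacity in TiB, with a small floor so tiny devices still get a
  // nonzero weight.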
4276 char weight[32];
4277 if (cct->_conf->osd_crush_initial_weight >= 0) {
4278 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4279 } else {
4280 struct store_statfs_t st;
11fdf7f2
TL
4281 osd_alert_list_t alerts;
4282 int r = store->statfs(&st, &alerts);
7c673cae
FG
4283 if (r < 0) {
4284 derr << "statfs: " << cpp_strerror(r) << dendl;
4285 return r;
4286 }
4287 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4288 std::max(.00001,
4289 double(st.total) /
4290 double(1ull << 40 /* TB */)));
7c673cae
FG
4291 }
4292
9f95a23c 4293 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
7c673cae
FG
4294
4295 string cmd =
4296 string("{\"prefix\": \"osd crush create-or-move\", ") +
9f95a23c
TL
4297 string("\"id\": ") + stringify(whoami) + ", " +
4298 string("\"weight\":") + weight + ", " +
4299 string("\"args\": [") + stringify(cct->crush_location) + "]}";
7c673cae
FG
4300 return mon_cmd_maybe_osd_create(cmd);
4301}
4302
4303int OSD::update_crush_device_class()
4304{
224ce89b
WB
4305 if (!cct->_conf->osd_class_update_on_start) {
4306 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4307 return 0;
4308 }
4309
7c673cae
FG
4310 string device_class;
4311 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4312 if (r < 0 || device_class.empty()) {
4313 device_class = store->get_default_device_class();
4314 }
4315
4316 if (device_class.empty()) {
d2e6a577 4317 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4318 return 0;
224ce89b 4319 }
7c673cae
FG
4320
4321 string cmd =
4322 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4323 string("\"class\": \"") + device_class + string("\", ") +
4324 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4325
224ce89b 4326 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4327 if (r == -EBUSY) {
4328 // good, already bound to a device-class
4329 return 0;
4330 } else {
4331 return r;
4332 }
7c673cae
FG
4333}
4334
4335void OSD::write_superblock(ObjectStore::Transaction& t)
4336{
4337 dout(10) << "write_superblock " << superblock << dendl;
4338
4339 //hack: at minimum it's using the baseline feature set
4340 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4341 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4342
4343 bufferlist bl;
11fdf7f2 4344 encode(superblock, bl);
7c673cae
FG
4345 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4346}
4347
4348int OSD::read_superblock()
4349{
4350 bufferlist bl;
11fdf7f2 4351 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4352 if (r < 0)
4353 return r;
4354
11fdf7f2
TL
4355 auto p = bl.cbegin();
4356 decode(superblock, p);
7c673cae
FG
4357
4358 dout(10) << "read_superblock " << superblock << dendl;
4359
4360 return 0;
4361}
4362
4363void OSD::clear_temp_objects()
4364{
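  // Scan every PG collection and delete any leftover temp objects (including
  // old temps that Hammer stored with pool -1), batching the removals into
  // osd_target_transaction_size-sized transactions.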
4365 dout(10) << __func__ << dendl;
4366 vector<coll_t> ls;
4367 store->list_collections(ls);
4368 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4369 spg_t pgid;
4370 if (!p->is_pg(&pgid))
4371 continue;
4372
4373 // list temp objects
4374 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4375
4376 vector<ghobject_t> temps;
4377 ghobject_t next;
4378 while (1) {
4379 vector<ghobject_t> objects;
11fdf7f2
TL
4380 auto ch = store->open_collection(*p);
4381 ceph_assert(ch);
4382 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4383 store->get_ideal_list_max(),
4384 &objects, &next);
4385 if (objects.empty())
4386 break;
4387 vector<ghobject_t>::iterator q;
4388 for (q = objects.begin(); q != objects.end(); ++q) {
4389 // Hammer set pool for temps to -1, so check for clean-up
4390 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4391 temps.push_back(*q);
4392 } else {
4393 break;
4394 }
4395 }
4396 // If we saw a non-temp object and hit the break above we can
4397 // break out of the while loop too.
4398 if (q != objects.end())
4399 break;
4400 }
4401 if (!temps.empty()) {
4402 ObjectStore::Transaction t;
4403 int removed = 0;
4404 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4405 dout(20) << " removing " << *p << " object " << *q << dendl;
4406 t.remove(*p, *q);
4407 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4408 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4409 t = ObjectStore::Transaction();
4410 removed = 0;
4411 }
4412 }
4413 if (removed) {
11fdf7f2 4414 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4415 }
4416 }
4417 }
4418}
4419
4420void OSD::recursive_remove_collection(CephContext* cct,
4421 ObjectStore *store, spg_t pgid,
4422 coll_t tmp)
4423{
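  // Remove every object in the collection (cleaning up its snap mapper
  // entries as we go) a batch at a time, then remove the collection itself
  // and wait for the final transaction to commit.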
4424 OSDriver driver(
4425 store,
4426 coll_t(),
4427 make_snapmapper_oid());
4428
11fdf7f2 4429 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
7c673cae
FG
4430 ObjectStore::Transaction t;
4431 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4432
11fdf7f2
TL
4433 ghobject_t next;
4434 int max = cct->_conf->osd_target_transaction_size;
7c673cae 4435 vector<ghobject_t> objects;
11fdf7f2
TL
4436 objects.reserve(max);
4437 while (true) {
4438 objects.clear();
4439 store->collection_list(ch, next, ghobject_t::get_max(),
4440 max, &objects, &next);
4441 generic_dout(10) << __func__ << " " << objects << dendl;
4442 if (objects.empty())
4443 break;
4444 for (auto& p: objects) {
4445 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4446 int r = mapper.remove_oid(p.hobj, &_t);
4447 if (r != 0 && r != -ENOENT)
4448 ceph_abort();
4449 t.remove(tmp, p);
7c673cae 4450 }
11fdf7f2
TL
4451 int r = store->queue_transaction(ch, std::move(t));
4452 ceph_assert(r == 0);
4453 t = ObjectStore::Transaction();
7c673cae
FG
4454 }
4455 t.remove_collection(tmp);
11fdf7f2
TL
4456 int r = store->queue_transaction(ch, std::move(t));
4457 ceph_assert(r == 0);
7c673cae
FG
4458
4459 C_SaferCond waiter;
11fdf7f2 4460 if (!ch->flush_commit(&waiter)) {
7c673cae
FG
4461 waiter.wait();
4462 }
4463}
4464
4465
4466// ======================================================
4467// PG's
4468
7c673cae
FG
4469PG* OSD::_make_pg(
4470 OSDMapRef createmap,
4471 spg_t pgid)
4472{
11fdf7f2
TL
4473 dout(10) << __func__ << " " << pgid << dendl;
4474 pg_pool_t pi;
4475 map<string,string> ec_profile;
4476 string name;
4477 if (createmap->have_pg_pool(pgid.pool())) {
4478 pi = *createmap->get_pg_pool(pgid.pool());
4479 name = createmap->get_pool_name(pgid.pool());
4480 if (pi.is_erasure()) {
4481 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4482 }
4483 } else {
4484 // pool was deleted; grab final pg_pool_t off disk.
4485 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4486 bufferlist bl;
4487 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4488 if (r < 0) {
4489 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4490 << dendl;
4491 return nullptr;
4492 }
4493 ceph_assert(r >= 0);
4494 auto p = bl.cbegin();
4495 decode(pi, p);
4496 decode(name, p);
4497 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4498 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4499 << " tombstone" << dendl;
4500 return nullptr;
4501 }
4502 decode(ec_profile, p);
4503 }
4504 PGPool pool(cct, createmap, pgid.pool(), pi, name);
7c673cae 4505 PG *pg;
11fdf7f2
TL
4506 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4507 pi.type == pg_pool_t::TYPE_ERASURE)
4508 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4509 else
4510 ceph_abort();
7c673cae
FG
4511 return pg;
4512}
4513
11fdf7f2 4514void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4515{
11fdf7f2
TL
4516 v->clear();
4517 v->reserve(get_num_pgs());
4518 for (auto& s : shards) {
4519 std::lock_guard l(s->shard_lock);
4520 for (auto& j : s->pg_slots) {
4521 if (j.second->pg &&
4522 !j.second->pg->is_deleted()) {
4523 v->push_back(j.second->pg);
4524 if (clear_too) {
4525 s->_detach_pg(j.second.get());
4526 }
4527 }
7c673cae 4528 }
7c673cae 4529 }
7c673cae
FG
4530}
4531
11fdf7f2 4532void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4533{
11fdf7f2
TL
4534 v->clear();
4535 v->reserve(get_num_pgs());
4536 for (auto& s : shards) {
4537 std::lock_guard l(s->shard_lock);
4538 for (auto& j : s->pg_slots) {
4539 if (j.second->pg &&
4540 !j.second->pg->is_deleted()) {
4541 v->push_back(j.first);
4542 }
7c673cae
FG
4543 }
4544 }
7c673cae
FG
4545}
4546
11fdf7f2 4547void OSD::register_pg(PGRef pg)
7c673cae 4548{
11fdf7f2
TL
4549 spg_t pgid = pg->get_pgid();
4550 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4551 auto sdata = shards[shard_index];
4552 std::lock_guard l(sdata->shard_lock);
4553 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4554 ceph_assert(r.second);
4555 auto *slot = r.first->second.get();
4556 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4557 sdata->_attach_pg(slot, pg.get());
4558}
7c673cae 4559
11fdf7f2
TL
4560bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4561{
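  // Detach a fully deleted PG from its shard slot (unless a merge is still
  // waiting on it), drop any primed split children, and decrement the
  // per-role PG perf counters.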
4562 auto sdata = pg->osd_shard;
4563 ceph_assert(sdata);
4564 {
4565 std::lock_guard l(sdata->shard_lock);
4566 auto p = sdata->pg_slots.find(pg->pg_id);
4567 if (p == sdata->pg_slots.end() ||
4568 !p->second->pg) {
4569 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4570 return false;
4571 }
4572 if (p->second->waiting_for_merge_epoch) {
4573 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4574 return false;
4575 }
4576 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4577 sdata->_detach_pg(p->second.get());
4578 }
7c673cae 4579
11fdf7f2
TL
4580 for (auto shard : shards) {
4581 shard->unprime_split_children(pg->pg_id, old_pg_num);
4582 }
7c673cae 4583
11fdf7f2
TL
4584 // update pg count now since we might not get an osdmap any time soon.
4585 if (pg->is_primary())
4586 service.logger->dec(l_osd_pg_primary);
9f95a23c
TL
4587 else if (pg->is_nonprimary())
4588 service.logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
4589 else
4590 service.logger->dec(l_osd_pg_stray);
7c673cae 4591
11fdf7f2 4592 return true;
7c673cae
FG
4593}
4594
11fdf7f2 4595PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4596{
11fdf7f2
TL
4597 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4598 auto sdata = shards[shard_index];
4599 std::lock_guard l(sdata->shard_lock);
4600 auto p = sdata->pg_slots.find(pgid);
4601 if (p == sdata->pg_slots.end()) {
7c673cae 4602 return nullptr;
11fdf7f2
TL
4603 }
4604 return p->second->pg;
7c673cae
FG
4605}
4606
11fdf7f2 4607PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4608{
11fdf7f2
TL
4609 PGRef pg = _lookup_pg(pgid);
4610 if (!pg) {
4611 return nullptr;
4612 }
4613 pg->lock();
4614 if (!pg->is_deleted()) {
4615 return pg;
4616 }
4617 pg->unlock();
4618 return nullptr;
31f18b77
FG
4619}
4620
11fdf7f2 4621PGRef OSD::lookup_lock_pg(spg_t pgid)
7c673cae 4622{
11fdf7f2 4623 return _lookup_lock_pg(pgid);
7c673cae
FG
4624}
4625
4626void OSD::load_pgs()
4627{
9f95a23c 4628 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 4629 dout(0) << "load_pgs" << dendl;
11fdf7f2 4630
7c673cae 4631 {
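    // load the persisted pg_num change history, which is used to identify
    // pg splits and merges across map epochs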
11fdf7f2
TL
4632 auto pghist = make_pg_num_history_oid();
4633 bufferlist bl;
4634 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4635 if (r >= 0 && bl.length() > 0) {
4636 auto p = bl.cbegin();
4637 decode(pg_num_history, p);
4638 }
4639 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
7c673cae
FG
4640 }
4641
4642 vector<coll_t> ls;
4643 int r = store->list_collections(ls);
4644 if (r < 0) {
4645 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4646 }
4647
11fdf7f2 4648 int num = 0;
7c673cae
FG
4649 for (vector<coll_t>::iterator it = ls.begin();
4650 it != ls.end();
4651 ++it) {
4652 spg_t pgid;
4653 if (it->is_temp(&pgid) ||
4654 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
11fdf7f2
TL
4655 dout(10) << "load_pgs " << *it
4656 << " removing, legacy or flagged for removal pg" << dendl;
7c673cae
FG
4657 recursive_remove_collection(cct, store, pgid, *it);
4658 continue;
4659 }
4660
4661 if (!it->is_pg(&pgid)) {
4662 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4663 continue;
4664 }
4665
7c673cae 4666 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
7c673cae 4667 epoch_t map_epoch = 0;
11fdf7f2 4668 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
7c673cae
FG
4669 if (r < 0) {
4670 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4671 << dendl;
4672 continue;
4673 }
4674
11fdf7f2 4675 PGRef pg;
7c673cae
FG
4676 if (map_epoch > 0) {
4677 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4678 if (!pgosdmap) {
9f95a23c 4679 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
7c673cae
FG
4680 derr << __func__ << ": could not find map for epoch " << map_epoch
4681 << " on pg " << pgid << ", but the pool is not present in the "
4682 << "current map, so this is probably a result of bug 10617. "
4683 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4684 << "to clean it up later." << dendl;
4685 continue;
4686 } else {
4687 derr << __func__ << ": have pgid " << pgid << " at epoch "
4688 << map_epoch << ", but missing map. Crashing."
4689 << dendl;
11fdf7f2 4690 ceph_abort_msg("Missing map in load_pgs");
7c673cae
FG
4691 }
4692 }
11fdf7f2 4693 pg = _make_pg(pgosdmap, pgid);
7c673cae 4694 } else {
9f95a23c 4695 pg = _make_pg(get_osdmap(), pgid);
7c673cae 4696 }
11fdf7f2
TL
4697 if (!pg) {
4698 recursive_remove_collection(cct, store, pgid, *it);
4699 continue;
4700 }
4701
4702 // there can be no waiters here, so we don't call _wake_pg_slot
7c673cae 4703
11fdf7f2 4704 pg->lock();
7c673cae
FG
4705 pg->ch = store->open_collection(pg->coll);
4706
4707 // read pg state, log
11fdf7f2 4708 pg->read_state(store);
7c673cae 4709
94b18763
FG
4710 if (pg->dne()) {
4711 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4712 pg->ch = nullptr;
94b18763 4713 pg->unlock();
94b18763
FG
4714 recursive_remove_collection(cct, store, pgid, *it);
4715 continue;
4716 }
11fdf7f2
TL
4717 {
4718 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4719 assert(NULL != shards[shard_index]);
4720 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4721 }
7c673cae
FG
4722
4723 pg->reg_next_scrub();
4724
11fdf7f2 4725 dout(10) << __func__ << " loaded " << *pg << dendl;
7c673cae 4726 pg->unlock();
7c673cae 4727
11fdf7f2
TL
4728 register_pg(pg);
4729 ++num;
7c673cae 4730 }
11fdf7f2 4731 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
7c673cae
FG
4732}
4733
4734
11fdf7f2
TL
4735PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4736 const PGCreateInfo *info)
4737{
4738 spg_t pgid = info->pgid;
7c673cae 4739
11fdf7f2
TL
4740 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4741 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4742 return nullptr;
4743 }
3efd9988 4744
9f95a23c 4745 PeeringCtx rctx = create_context();
7c673cae 4746
11fdf7f2 4747 OSDMapRef startmap = get_map(info->epoch);
7c673cae 4748
11fdf7f2
TL
4749 if (info->by_mon) {
4750 int64_t pool_id = pgid.pgid.pool();
4751 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4752 if (!pool) {
4753 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4754 return nullptr;
4755 }
9f95a23c 4756 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
4757 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4758 // this ensures we do not process old creating messages after the
4759 // pool's initial pgs have been created (and pg are subsequently
4760 // allowed to split or merge).
4761 dout(20) << __func__ << " dropping " << pgid
4762 << "create, pool does not have CREATING flag set" << dendl;
4763 return nullptr;
7c673cae
FG
4764 }
4765 }
7c673cae 4766
11fdf7f2
TL
4767 int up_primary, acting_primary;
4768 vector<int> up, acting;
4769 startmap->pg_to_up_acting_osds(
4770 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
7c673cae 4771
11fdf7f2
TL
4772 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4773 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4774 store->get_type() != "bluestore") {
4775 clog->warn() << "pg " << pgid
4776 << " is at risk of silent data corruption: "
4777 << "the pool allows ec overwrites but is not stored in "
4778 << "bluestore, so deep scrubbing will not detect bitrot";
7c673cae 4779 }
9f95a23c
TL
4780 create_pg_collection(
4781 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4782 init_pg_ondisk(rctx.transaction, pgid, pp);
7c673cae 4783
9f95a23c 4784 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae 4785
11fdf7f2
TL
4786 PGRef pg = _make_pg(startmap, pgid);
4787 pg->ch = store->create_new_collection(pg->coll);
7c673cae 4788
11fdf7f2
TL
4789 {
4790 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4791 assert(NULL != shards[shard_index]);
4792 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
7c673cae 4793 }
7c673cae 4794
11fdf7f2 4795 pg->lock(true);
7c673cae 4796
11fdf7f2
TL
4797 // we are holding the shard lock
4798 ceph_assert(!pg->is_deleted());
4799
4800 pg->init(
4801 role,
4802 up,
4803 up_primary,
4804 acting,
4805 acting_primary,
4806 info->history,
4807 info->past_intervals,
4808 false,
4809 rctx.transaction);
7c673cae 4810
92f5a8d4
TL
4811 pg->init_collection_pool_opts();
4812
11fdf7f2 4813 if (pg->is_primary()) {
9f95a23c 4814 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
4815 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4816 }
7c673cae 4817
9f95a23c
TL
4818 pg->handle_initialize(rctx);
4819 pg->handle_activate_map(rctx);
7c673cae 4820
11fdf7f2 4821 dispatch_context(rctx, pg.get(), osdmap, nullptr);
7c673cae 4822
11fdf7f2
TL
4823 dout(10) << __func__ << " new pg " << *pg << dendl;
4824 return pg;
7c673cae
FG
4825}
4826
11fdf7f2
TL
4827bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4828 spg_t pgid,
4829 bool is_mon_create)
3efd9988
FG
4830{
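  // Refuse to instantiate a new PG once this OSD holds more than
  // mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio PGs; record the
  // request so resume_creating_pg() can retry it once the count drops.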
4831 const auto max_pgs_per_osd =
11fdf7f2
TL
4832 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4833 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 4834
11fdf7f2 4835 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
4836 return false;
4837 }
11fdf7f2
TL
4838
4839 std::lock_guard l(pending_creates_lock);
3efd9988
FG
4840 if (is_mon_create) {
4841 pending_creates_from_mon++;
4842 } else {
9f95a23c
TL
4843 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4844 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 4845 }
1adf2230 4846 dout(1) << __func__ << " withholding creation of pg " << pgid
11fdf7f2 4847 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
4848 return true;
4849}
4850
4851// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4852// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4853// to up set if pg_temp is empty. so an empty pg_temp won't work.
4854static vector<int32_t> twiddle(const vector<int>& acting) {
4855 if (acting.size() > 1) {
4856 return {acting[0]};
4857 } else {
4858 vector<int32_t> twiddled(acting.begin(), acting.end());
4859 twiddled.push_back(-1);
4860 return twiddled;
4861 }
4862}
4863
4864void OSD::resume_creating_pg()
4865{
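  // With spare PG slots available again, resolicit the creates we deferred:
  // re-subscribe to pg_creates from the mon and twiddle pg_temp to re-trigger
  // peering for creates that came from peer OSDs.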
4866 bool do_sub_pg_creates = false;
b32b8144 4867 bool have_pending_creates = false;
3efd9988
FG
4868 {
4869 const auto max_pgs_per_osd =
11fdf7f2
TL
4870 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4871 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4872 if (max_pgs_per_osd <= num_pgs) {
3efd9988
FG
4873 // this could happen if admin decreases this setting before a PG is removed
4874 return;
4875 }
11fdf7f2
TL
4876 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
4877 std::lock_guard l(pending_creates_lock);
3efd9988 4878 if (pending_creates_from_mon > 0) {
11fdf7f2
TL
4879 dout(20) << __func__ << " pending_creates_from_mon "
4880 << pending_creates_from_mon << dendl;
3efd9988
FG
4881 do_sub_pg_creates = true;
4882 if (pending_creates_from_mon >= spare_pgs) {
4883 spare_pgs = pending_creates_from_mon = 0;
4884 } else {
4885 spare_pgs -= pending_creates_from_mon;
4886 pending_creates_from_mon = 0;
4887 }
4888 }
4889 auto pg = pending_creates_from_osd.cbegin();
4890 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 4891 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 4892 vector<int> acting;
9f95a23c
TL
4893 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
4894 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
3efd9988 4895 pg = pending_creates_from_osd.erase(pg);
94b18763 4896 do_sub_pg_creates = true;
3efd9988
FG
4897 spare_pgs--;
4898 }
b32b8144
FG
4899 have_pending_creates = (pending_creates_from_mon > 0 ||
4900 !pending_creates_from_osd.empty());
3efd9988 4901 }
b32b8144
FG
4902
4903 bool do_renew_subs = false;
3efd9988
FG
4904 if (do_sub_pg_creates) {
4905 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4906 dout(4) << __func__ << ": resolicit pg creates from mon since "
4907 << last_pg_create_epoch << dendl;
b32b8144 4908 do_renew_subs = true;
3efd9988
FG
4909 }
4910 }
9f95a23c 4911 version_t start = get_osdmap_epoch() + 1;
b32b8144
FG
4912 if (have_pending_creates) {
4913 // don't miss any new osdmap deleting PGs
4914 if (monc->sub_want("osdmap", start, 0)) {
4915 dout(4) << __func__ << ": resolicit osdmap from mon since "
4916 << start << dendl;
4917 do_renew_subs = true;
4918 }
94b18763 4919 } else if (do_sub_pg_creates) {
b32b8144
FG
4920 // no need to subscribe the osdmap continuously anymore
4921 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
4922 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
11fdf7f2 4923 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
b32b8144
FG
4924 << start << dendl;
4925 do_renew_subs = true;
4926 }
4927 }
4928
4929 if (do_renew_subs) {
4930 monc->renew_subs();
4931 }
4932
94b18763 4933 service.send_pg_temp();
3efd9988 4934}
7c673cae
FG
4935
4936void OSD::build_initial_pg_history(
4937 spg_t pgid,
4938 epoch_t created,
4939 utime_t created_stamp,
4940 pg_history_t *h,
4941 PastIntervals *pi)
4942{
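  // Rebuild the PG's history and past intervals by replaying every map from
  // its creation epoch up to the current epoch and noting interval changes.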
4943 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
9f95a23c 4944 *h = pg_history_t(created, created_stamp);
7c673cae
FG
4945
4946 OSDMapRef lastmap = service.get_map(created);
4947 int up_primary, acting_primary;
4948 vector<int> up, acting;
4949 lastmap->pg_to_up_acting_osds(
4950 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4951
4952 ostringstream debug;
9f95a23c 4953 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
7c673cae
FG
4954 OSDMapRef osdmap = service.get_map(e);
4955 int new_up_primary, new_acting_primary;
4956 vector<int> new_up, new_acting;
4957 osdmap->pg_to_up_acting_osds(
4958 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4959
4960 // this is a bit imprecise, but sufficient?
4961 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4962 const pg_pool_t *pi;
4963 bool operator()(const set<pg_shard_t> &have) const {
4964 return have.size() >= pi->min_size;
4965 }
11fdf7f2 4966 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
7c673cae
FG
4967 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4968
4969 bool new_interval = PastIntervals::check_new_interval(
4970 acting_primary,
4971 new_acting_primary,
4972 acting, new_acting,
4973 up_primary,
4974 new_up_primary,
4975 up, new_up,
4976 h->same_interval_since,
4977 h->last_epoch_clean,
9f95a23c
TL
4978 osdmap.get(),
4979 lastmap.get(),
7c673cae 4980 pgid.pgid,
9f95a23c 4981 min_size_predicate,
7c673cae
FG
4982 pi,
4983 &debug);
4984 if (new_interval) {
4985 h->same_interval_since = e;
181888fb
FG
4986 if (up != new_up) {
4987 h->same_up_since = e;
4988 }
4989 if (acting_primary != new_acting_primary) {
4990 h->same_primary_since = e;
4991 }
4992 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4993 osdmap->get_pg_num(pgid.pgid.pool()),
4994 nullptr)) {
4995 h->last_epoch_split = e;
4996 }
4997 up = new_up;
4998 acting = new_acting;
4999 up_primary = new_up_primary;
5000 acting_primary = new_acting_primary;
c07f9fc5 5001 }
7c673cae
FG
5002 lastmap = osdmap;
5003 }
5004 dout(20) << __func__ << " " << debug.str() << dendl;
5005 dout(10) << __func__ << " " << *h << " " << *pi
5006 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5007 pi->get_bounds()) << ")"
5008 << dendl;
5009}
5010
7c673cae
FG
5011void OSD::_add_heartbeat_peer(int p)
5012{
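  // Add peer p to the heartbeat map: open back/front heartbeat connections
  // and attach Session objects carrying the shared heartbeat stamps; if the
  // peer is already tracked, just bump its epoch.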
5013 if (p == whoami)
5014 return;
5015 HeartbeatInfo *hi;
5016
5017 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5018 if (i == heartbeat_peers.end()) {
9f95a23c 5019 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
7c673cae
FG
5020 if (!cons.first)
5021 return;
9f95a23c
TL
5022 assert(cons.second);
5023
7c673cae
FG
5024 hi = &heartbeat_peers[p];
5025 hi->peer = p;
9f95a23c
TL
5026
5027 auto stamps = service.get_hb_stamps(p);
5028
5029 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5030 sb->peer = p;
5031 sb->stamps = stamps;
eafe8130 5032 hi->hb_interval_start = ceph_clock_now();
7c673cae 5033 hi->con_back = cons.first.get();
9f95a23c
TL
5034 hi->con_back->set_priv(sb);
5035
5036 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5037 sf->peer = p;
5038 sf->stamps = stamps;
5039 hi->con_front = cons.second.get();
5040 hi->con_front->set_priv(sf);
5041
5042 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5043 << " " << hi->con_back->get_peer_addr()
5044 << " " << hi->con_front->get_peer_addr()
5045 << dendl;
7c673cae
FG
5046 } else {
5047 hi = &i->second;
5048 }
9f95a23c 5049 hi->epoch = get_osdmap_epoch();
7c673cae
FG
5050}
5051
5052void OSD::_remove_heartbeat_peer(int n)
5053{
5054 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5055 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5056 dout(20) << " removing heartbeat peer osd." << n
5057 << " " << q->second.con_back->get_peer_addr()
5058 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5059 << dendl;
9f95a23c 5060 q->second.clear_mark_down();
7c673cae
FG
5061 heartbeat_peers.erase(q);
5062}
5063
5064void OSD::need_heartbeat_peer_update()
5065{
5066 if (is_stopping())
5067 return;
5068 dout(20) << "need_heartbeat_peer_update" << dendl;
5069 heartbeat_set_peers_need_update();
5070}
5071
5072void OSD::maybe_update_heartbeat_peers()
5073{
9f95a23c 5074 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5075
11fdf7f2 5076 if (is_waiting_for_healthy() || is_active()) {
7c673cae
FG
5077 utime_t now = ceph_clock_now();
5078 if (last_heartbeat_resample == utime_t()) {
5079 last_heartbeat_resample = now;
5080 heartbeat_set_peers_need_update();
5081 } else if (!heartbeat_peers_need_update()) {
5082 utime_t dur = now - last_heartbeat_resample;
5083 if (dur > cct->_conf->osd_heartbeat_grace) {
5084 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5085 heartbeat_set_peers_need_update();
5086 last_heartbeat_resample = now;
494da23a
TL
5087 // automatically clean up any stale heartbeat peers
5088 // if we are unhealthy, then clean all
5089 reset_heartbeat_peers(is_waiting_for_healthy());
7c673cae
FG
5090 }
5091 }
5092 }
5093
5094 if (!heartbeat_peers_need_update())
5095 return;
5096 heartbeat_clear_peers_need_update();
5097
11fdf7f2 5098 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5099
5100 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5101
5102
5103 // build heartbeat from set
5104 if (is_active()) {
11fdf7f2
TL
5105 vector<PGRef> pgs;
5106 _get_pgs(&pgs);
5107 for (auto& pg : pgs) {
5108 pg->with_heartbeat_peers([&](int peer) {
9f95a23c 5109 if (get_osdmap()->is_up(peer)) {
11fdf7f2
TL
5110 _add_heartbeat_peer(peer);
5111 }
5112 });
7c673cae
FG
5113 }
5114 }
5115
5116 // include next and previous up osds to ensure we have a fully-connected set
5117 set<int> want, extras;
9f95a23c 5118 const int next = get_osdmap()->get_next_up_osd_after(whoami);
7c673cae
FG
5119 if (next >= 0)
5120 want.insert(next);
9f95a23c 5121 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
7c673cae
FG
5122 if (prev >= 0 && prev != next)
5123 want.insert(prev);
5124
11fdf7f2
TL
5125 // make sure we have at least **min_down** osds coming from different
 5126 // subtree levels (e.g., different hosts) for fast failure detection.
5127 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5128 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
9f95a23c
TL
5129 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5130 get_osdmap()->get_random_up_osds_by_subtree(
5131 whoami, subtree, limit, want, &want);
11fdf7f2 5132
7c673cae
FG
5133 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5134 dout(10) << " adding neighbor peer osd." << *p << dendl;
5135 extras.insert(*p);
5136 _add_heartbeat_peer(*p);
5137 }
5138
5139 // remove down peers; enumerate extras
5140 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5141 while (p != heartbeat_peers.end()) {
9f95a23c 5142 if (!get_osdmap()->is_up(p->first)) {
7c673cae
FG
5143 int o = p->first;
5144 ++p;
5145 _remove_heartbeat_peer(o);
5146 continue;
5147 }
9f95a23c 5148 if (p->second.epoch < get_osdmap_epoch()) {
7c673cae
FG
5149 extras.insert(p->first);
5150 }
5151 ++p;
5152 }
5153
5154 // too few?
11fdf7f2 5155 for (int n = next; n >= 0; ) {
7c673cae
FG
5156 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5157 break;
5158 if (!extras.count(n) && !want.count(n) && n != whoami) {
5159 dout(10) << " adding random peer osd." << n << dendl;
5160 extras.insert(n);
5161 _add_heartbeat_peer(n);
5162 }
9f95a23c 5163 n = get_osdmap()->get_next_up_osd_after(n);
11fdf7f2 5164 if (n == next)
7c673cae
FG
5165 break; // came full circle; stop
5166 }
5167
5168 // too many?
5169 for (set<int>::iterator p = extras.begin();
5170 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5171 ++p) {
5172 if (want.count(*p))
5173 continue;
5174 _remove_heartbeat_peer(*p);
5175 }
5176
5177 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
9f95a23c
TL
5178
5179 // clean up stale failure pending
5180 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5181 if (heartbeat_peers.count(it->first) == 0) {
5182 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5183 failure_pending.erase(it++);
5184 } else {
5185 it++;
5186 }
5187 }
7c673cae
FG
5188}
5189
494da23a 5190void OSD::reset_heartbeat_peers(bool all)
7c673cae 5191{
9f95a23c 5192 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5193 dout(10) << "reset_heartbeat_peers" << dendl;
494da23a
TL
5194 utime_t stale = ceph_clock_now();
5195 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
11fdf7f2 5196 std::lock_guard l(heartbeat_lock);
494da23a
TL
5197 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5198 HeartbeatInfo& hi = it->second;
5199 if (all || hi.is_stale(stale)) {
9f95a23c 5200 hi.clear_mark_down();
494da23a
TL
5201 // stop sending failure_report to mon too
5202 failure_queue.erase(it->first);
5203 heartbeat_peers.erase(it++);
5204 } else {
5205 it++;
7c673cae 5206 }
7c673cae 5207 }
7c673cae
FG
5208}
5209
5210void OSD::handle_osd_ping(MOSDPing *m)
5211{
5212 if (superblock.cluster_fsid != m->fsid) {
5213 dout(20) << "handle_osd_ping from " << m->get_source_inst()
9f95a23c
TL
5214 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5215 << dendl;
7c673cae
FG
5216 m->put();
5217 return;
5218 }
5219
5220 int from = m->get_source().num();
5221
9f95a23c 5222 heartbeat_lock.lock();
7c673cae 5223 if (is_stopping()) {
9f95a23c 5224 heartbeat_lock.unlock();
7c673cae
FG
5225 m->put();
5226 return;
5227 }
5228
9f95a23c
TL
5229 utime_t now = ceph_clock_now();
5230 auto mnow = service.get_mnow();
5231 ConnectionRef con(m->get_connection());
7c673cae 5232 OSDMapRef curmap = service.get_osdmap();
c07f9fc5 5233 if (!curmap) {
9f95a23c 5234 heartbeat_lock.unlock();
c07f9fc5
FG
5235 m->put();
5236 return;
5237 }
7c673cae 5238
9f95a23c
TL
5239 auto sref = con->get_priv();
5240 Session *s = static_cast<Session*>(sref.get());
5241 if (!s) {
5242 heartbeat_lock.unlock();
5243 m->put();
5244 return;
5245 }
5246 if (!s->stamps) {
5247 s->peer = from;
5248 s->stamps = service.get_hb_stamps(from);
5249 }
5250
7c673cae
FG
5251 switch (m->op) {
5252
5253 case MOSDPing::PING:
5254 {
5255 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5256 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5257 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5258 if (heartbeat_drop->second == 0) {
5259 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5260 } else {
5261 --heartbeat_drop->second;
5262 dout(5) << "Dropping heartbeat from " << from
5263 << ", " << heartbeat_drop->second
5264 << " remaining to drop" << dendl;
5265 break;
5266 }
5267 } else if (cct->_conf->osd_debug_drop_ping_probability >
5268 ((((double)(rand()%100))/100.0))) {
5269 heartbeat_drop =
5270 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5271 cct->_conf->osd_debug_drop_ping_duration)).first;
5272 dout(5) << "Dropping heartbeat from " << from
5273 << ", " << heartbeat_drop->second
5274 << " remaining to drop" << dendl;
5275 break;
5276 }
5277 }
5278
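      // update the shared HeartbeatStamps for this peer with the sender's
      // monotonic timestamps; sender_delta_ub is echoed back in the
      // PING_REPLY constructed below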
9f95a23c
TL
5279 ceph::signedspan sender_delta_ub{};
5280 s->stamps->got_ping(
5281 m->up_from,
5282 mnow,
5283 m->mono_send_stamp,
5284 m->delta_ub,
5285 &sender_delta_ub);
5286 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5287
7c673cae 5288 if (!cct->get_heartbeat_map()->is_healthy()) {
9f95a23c
TL
5289 dout(10) << "internal heartbeat not healthy, dropping ping request"
5290 << dendl;
7c673cae
FG
5291 break;
5292 }
5293
5294 Message *r = new MOSDPing(monc->get_fsid(),
5295 curmap->get_epoch(),
9f95a23c
TL
5296 MOSDPing::PING_REPLY,
5297 m->ping_stamp,
5298 m->mono_ping_stamp,
5299 mnow,
5300 service.get_up_epoch(),
5301 cct->_conf->osd_heartbeat_min_size,
5302 sender_delta_ub);
5303 con->send_message(r);
7c673cae
FG
5304
5305 if (curmap->is_up(from)) {
7c673cae 5306 if (is_active()) {
9f95a23c
TL
5307 ConnectionRef cluster_con = service.get_con_osd_cluster(
5308 from, curmap->get_epoch());
5309 if (cluster_con) {
5310 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5311 }
5312 }
5313 } else if (!curmap->exists(from) ||
5314 curmap->get_down_at(from) > m->map_epoch) {
5315 // tell them they have died
5316 Message *r = new MOSDPing(monc->get_fsid(),
5317 curmap->get_epoch(),
5318 MOSDPing::YOU_DIED,
9f95a23c
TL
5319 m->ping_stamp,
5320 m->mono_ping_stamp,
5321 mnow,
5322 service.get_up_epoch(),
31f18b77 5323 cct->_conf->osd_heartbeat_min_size);
9f95a23c 5324 con->send_message(r);
7c673cae
FG
5325 }
5326 }
5327 break;
5328
5329 case MOSDPing::PING_REPLY:
5330 {
5331 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5332 if (i != heartbeat_peers.end()) {
9f95a23c 5333 auto acked = i->second.ping_history.find(m->ping_stamp);
11fdf7f2 5334 if (acked != i->second.ping_history.end()) {
11fdf7f2 5335 int &unacknowledged = acked->second.second;
9f95a23c 5336 if (con == i->second.con_back) {
11fdf7f2
TL
5337 dout(25) << "handle_osd_ping got reply from osd." << from
5338 << " first_tx " << i->second.first_tx
5339 << " last_tx " << i->second.last_tx
9f95a23c
TL
5340 << " last_rx_back " << i->second.last_rx_back
5341 << " -> " << now
11fdf7f2
TL
5342 << " last_rx_front " << i->second.last_rx_front
5343 << dendl;
5344 i->second.last_rx_back = now;
5345 ceph_assert(unacknowledged > 0);
5346 --unacknowledged;
5347 // if there is no front con, set both stamps.
5348 if (i->second.con_front == NULL) {
5349 i->second.last_rx_front = now;
5350 ceph_assert(unacknowledged > 0);
5351 --unacknowledged;
5352 }
9f95a23c 5353 } else if (con == i->second.con_front) {
11fdf7f2
TL
5354 dout(25) << "handle_osd_ping got reply from osd." << from
5355 << " first_tx " << i->second.first_tx
5356 << " last_tx " << i->second.last_tx
5357 << " last_rx_back " << i->second.last_rx_back
9f95a23c
TL
5358 << " last_rx_front " << i->second.last_rx_front
5359 << " -> " << now
11fdf7f2
TL
5360 << dendl;
5361 i->second.last_rx_front = now;
5362 ceph_assert(unacknowledged > 0);
5363 --unacknowledged;
5364 }
7c673cae 5365
11fdf7f2
TL
5366 if (unacknowledged == 0) {
5367 // succeeded in getting all replies
5368 dout(25) << "handle_osd_ping got all replies from osd." << from
9f95a23c 5369 << " , erase pending ping(sent at " << m->ping_stamp << ")"
11fdf7f2
TL
5370 << " and older pending ping(s)"
5371 << dendl;
eafe8130
TL
5372
5373#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
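          // accumulate per-peer round-trip times (in microseconds) for the
          // back and front heartbeat connections; these feed the per-interval
          // averages computed below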
5374 ++i->second.hb_average_count;
9f95a23c 5375 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
eafe8130
TL
5376 i->second.hb_total_back += back_pingtime;
5377 if (back_pingtime < i->second.hb_min_back)
5378 i->second.hb_min_back = back_pingtime;
5379 if (back_pingtime > i->second.hb_max_back)
5380 i->second.hb_max_back = back_pingtime;
9f95a23c 5381 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
eafe8130
TL
5382 i->second.hb_total_front += front_pingtime;
5383 if (front_pingtime < i->second.hb_min_front)
5384 i->second.hb_min_front = front_pingtime;
5385 if (front_pingtime > i->second.hb_max_front)
5386 i->second.hb_max_front = front_pingtime;
5387
5388 ceph_assert(i->second.hb_interval_start != utime_t());
5389 if (i->second.hb_interval_start == utime_t())
5390 i->second.hb_interval_start = now;
5391 int64_t hb_avg_time_period = 60;
5392 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5393 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5394 }
5395 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5396 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5397 uint32_t back_min = i->second.hb_min_back;
5398 uint32_t back_max = i->second.hb_max_back;
5399 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5400 uint32_t front_min = i->second.hb_min_front;
5401 uint32_t front_max = i->second.hb_max_front;
5402
5403 // Reset for new interval
5404 i->second.hb_average_count = 0;
5405 i->second.hb_interval_start = now;
5406 i->second.hb_total_back = i->second.hb_max_back = 0;
5407 i->second.hb_min_back = UINT_MAX;
5408 i->second.hb_total_front = i->second.hb_max_front = 0;
5409 i->second.hb_min_front = UINT_MAX;
5410
 5411 // Record per-osd interface ping times
 5412 // Based on osd_heartbeat_interval, ignoring that it is randomly shorter than this interval
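        // hb_back/front_{pingtime,min,max} form a fixed-size ring buffer of
        // per-interval stats; indexing masks with (hb_vector_size - 1), which
        // assumes hb_vector_size (declared elsewhere) is a power of two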
5413 if (i->second.hb_back_pingtime.size() == 0) {
5414 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5415 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5416 i->second.hb_back_pingtime.push_back(back_avg);
5417 i->second.hb_back_min.push_back(back_min);
5418 i->second.hb_back_max.push_back(back_max);
5419 i->second.hb_front_pingtime.push_back(front_avg);
5420 i->second.hb_front_min.push_back(front_min);
5421 i->second.hb_front_max.push_back(front_max);
5422 ++i->second.hb_index;
5423 }
5424 } else {
5425 int index = i->second.hb_index & (hb_vector_size - 1);
5426 i->second.hb_back_pingtime[index] = back_avg;
5427 i->second.hb_back_min[index] = back_min;
5428 i->second.hb_back_max[index] = back_max;
5429 i->second.hb_front_pingtime[index] = front_avg;
5430 i->second.hb_front_min[index] = front_min;
5431 i->second.hb_front_max[index] = front_max;
5432 ++i->second.hb_index;
5433 }
5434
5435 {
5436 std::lock_guard l(service.stat_lock);
5437 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5438 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5439
5440 uint32_t total = 0;
5441 uint32_t min = UINT_MAX;
5442 uint32_t max = 0;
5443 uint32_t count = 0;
5444 uint32_t which = 0;
5445 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5446 for (int32_t k = size - 1 ; k >= 0; --k) {
5447 ++count;
5448 int index = (i->second.hb_index + k) % size;
5449 total += i->second.hb_back_pingtime[index];
5450 if (i->second.hb_back_min[index] < min)
5451 min = i->second.hb_back_min[index];
5452 if (i->second.hb_back_max[index] > max)
5453 max = i->second.hb_back_max[index];
5454 if (count == 1 || count == 5 || count == 15) {
5455 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5456 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5457 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5458 which++;
5459 if (count == 15)
5460 break;
5461 }
5462 }
5463
5464 if (i->second.con_front != NULL) {
5465 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5466
5467 total = 0;
5468 min = UINT_MAX;
5469 max = 0;
5470 count = 0;
5471 which = 0;
5472 for (int32_t k = size - 1 ; k >= 0; --k) {
5473 ++count;
5474 int index = (i->second.hb_index + k) % size;
5475 total += i->second.hb_front_pingtime[index];
5476 if (i->second.hb_front_min[index] < min)
5477 min = i->second.hb_front_min[index];
5478 if (i->second.hb_front_max[index] > max)
5479 max = i->second.hb_front_max[index];
5480 if (count == 1 || count == 5 || count == 15) {
5481 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5482 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5483 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5484 which++;
5485 if (count == 15)
5486 break;
5487 }
5488 }
5489 }
5490 }
5491 } else {
5492 std::lock_guard l(service.stat_lock);
5493 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5494 if (i->second.con_front != NULL)
5495 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5496 }
11fdf7f2 5497 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5498 }
5499
11fdf7f2
TL
5500 if (i->second.is_healthy(now)) {
5501 // Cancel false reports
5502 auto failure_queue_entry = failure_queue.find(from);
5503 if (failure_queue_entry != failure_queue.end()) {
5504 dout(10) << "handle_osd_ping canceling queued "
5505 << "failure report for osd." << from << dendl;
5506 failure_queue.erase(failure_queue_entry);
5507 }
5508
5509 auto failure_pending_entry = failure_pending.find(from);
5510 if (failure_pending_entry != failure_pending.end()) {
5511 dout(10) << "handle_osd_ping canceling in-flight "
5512 << "failure report for osd." << from << dendl;
5513 send_still_alive(curmap->get_epoch(),
5514 from,
5515 failure_pending_entry->second.second);
5516 failure_pending.erase(failure_pending_entry);
5517 }
7c673cae 5518 }
11fdf7f2
TL
5519 } else {
5520 // old replies, deprecated by newly sent pings.
9f95a23c 5521 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
11fdf7f2
TL
5522 << ") is found, treat as covered by newly sent pings "
5523 << "and ignore"
5524 << dendl;
7c673cae
FG
5525 }
5526 }
5527
5528 if (m->map_epoch &&
5529 curmap->is_up(from)) {
7c673cae 5530 if (is_active()) {
9f95a23c
TL
5531 ConnectionRef cluster_con = service.get_con_osd_cluster(
5532 from, curmap->get_epoch());
5533 if (cluster_con) {
5534 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5535 }
5536 }
5537 }
9f95a23c
TL
5538
5539 s->stamps->got_ping_reply(
5540 mnow,
5541 m->mono_send_stamp,
5542 m->delta_ub);
5543 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
7c673cae
FG
5544 }
5545 break;
5546
5547 case MOSDPing::YOU_DIED:
5548 dout(10) << "handle_osd_ping " << m->get_source_inst()
5549 << " says i am down in " << m->map_epoch << dendl;
5550 osdmap_subscribe(curmap->get_epoch()+1, false);
5551 break;
5552 }
5553
9f95a23c 5554 heartbeat_lock.unlock();
7c673cae
FG
5555 m->put();
5556}
5557
5558void OSD::heartbeat_entry()
5559{
9f95a23c 5560 std::unique_lock l(heartbeat_lock);
7c673cae
FG
5561 if (is_stopping())
5562 return;
5563 while (!heartbeat_stop) {
5564 heartbeat();
5565
eafe8130
TL
5566 double wait;
5567 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5568 wait = (float)cct->_conf->osd_heartbeat_interval;
5569 } else {
5570 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5571 }
9f95a23c 5572 auto w = ceph::make_timespan(wait);
7c673cae 5573 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
9f95a23c 5574 heartbeat_cond.wait_for(l, w);
7c673cae
FG
5575 if (is_stopping())
5576 return;
5577 dout(30) << "heartbeat_entry woke up" << dendl;
5578 }
5579}
5580
5581void OSD::heartbeat_check()
5582{
9f95a23c 5583 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
7c673cae
FG
5584 utime_t now = ceph_clock_now();
5585
11fdf7f2 5586 // check for incoming heartbeats (move me elsewhere?)
7c673cae
FG
5587 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5588 p != heartbeat_peers.end();
5589 ++p) {
5590
5591 if (p->second.first_tx == utime_t()) {
5592 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5593 << " yet, skipping" << dendl;
7c673cae
FG
5594 continue;
5595 }
5596
5597 dout(25) << "heartbeat_check osd." << p->first
5598 << " first_tx " << p->second.first_tx
5599 << " last_tx " << p->second.last_tx
5600 << " last_rx_back " << p->second.last_rx_back
5601 << " last_rx_front " << p->second.last_rx_front
5602 << dendl;
11fdf7f2
TL
5603 if (p->second.is_unhealthy(now)) {
5604 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5605 if (p->second.last_rx_back == utime_t() ||
5606 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5607 derr << "heartbeat_check: no reply from "
5608 << p->second.con_front->get_peer_addr().get_sockaddr()
5609 << " osd." << p->first
5610 << " ever on either front or back, first ping sent "
5611 << p->second.first_tx
5612 << " (oldest deadline " << oldest_deadline << ")"
5613 << dendl;
7c673cae 5614 // fail
11fdf7f2 5615 failure_queue[p->first] = p->second.first_tx;
7c673cae 5616 } else {
11fdf7f2
TL
5617 derr << "heartbeat_check: no reply from "
5618 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5619 << " osd." << p->first << " since back " << p->second.last_rx_back
5620 << " front " << p->second.last_rx_front
11fdf7f2
TL
5621 << " (oldest deadline " << oldest_deadline << ")"
5622 << dendl;
7c673cae 5623 // fail
11fdf7f2 5624 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5625 }
5626 }
5627 }
5628}
5629
5630void OSD::heartbeat()
5631{
9f95a23c 5632 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
7c673cae
FG
5633 dout(30) << "heartbeat" << dendl;
5634
5635 // get CPU load avg
5636 double loadavgs[1];
11fdf7f2
TL
5637 int hb_interval = cct->_conf->osd_heartbeat_interval;
5638 int n_samples = 86400;
5639 if (hb_interval > 1) {
5640 n_samples /= hb_interval;
5641 if (n_samples < 1)
5642 n_samples = 1;
5643 }
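  // n_samples approximates how many heartbeat intervals fit in one day
  // (86400s), so daily_loadavg below acts as a day-long running average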
5644
7c673cae
FG
5645 if (getloadavg(loadavgs, 1) == 1) {
5646 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5647 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5648 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5649 }
5650
5651 dout(30) << "heartbeat checking stats" << dendl;
5652
11fdf7f2 5653 // refresh peer list and osd stats
7c673cae
FG
5654 vector<int> hb_peers;
5655 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5656 p != heartbeat_peers.end();
5657 ++p)
5658 hb_peers.push_back(p->first);
7c673cae 5659
11fdf7f2
TL
5660 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5661 dout(5) << __func__ << " " << new_stat << dendl;
5662 ceph_assert(new_stat.statfs.total);
5663
5664 float pratio;
5665 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5666
5667 service.check_full_status(ratio, pratio);
7c673cae
FG
5668
5669 utime_t now = ceph_clock_now();
9f95a23c 5670 auto mnow = service.get_mnow();
11fdf7f2
TL
5671 utime_t deadline = now;
5672 deadline += cct->_conf->osd_heartbeat_grace;
7c673cae
FG
5673
5674 // send heartbeats
5675 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5676 i != heartbeat_peers.end();
5677 ++i) {
5678 int peer = i->first;
9f95a23c
TL
5679 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5680
7c673cae
FG
5681 i->second.last_tx = now;
5682 if (i->second.first_tx == utime_t())
5683 i->second.first_tx = now;
11fdf7f2
TL
5684 i->second.ping_history[now] = make_pair(deadline,
5685 HeartbeatInfo::HEARTBEAT_MAX_CONN);
eafe8130
TL
5686 if (i->second.hb_interval_start == utime_t())
5687 i->second.hb_interval_start = now;
9f95a23c
TL
5688
5689 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5690 std::optional<ceph::signedspan> delta_ub;
5691 s->stamps->sent_ping(&delta_ub);
5692
5693 i->second.con_back->send_message(
5694 new MOSDPing(monc->get_fsid(),
5695 service.get_osdmap_epoch(),
5696 MOSDPing::PING,
5697 now,
5698 mnow,
5699 mnow,
5700 service.get_up_epoch(),
5701 cct->_conf->osd_heartbeat_min_size,
5702 delta_ub));
7c673cae
FG
5703
5704 if (i->second.con_front)
9f95a23c
TL
5705 i->second.con_front->send_message(
5706 new MOSDPing(monc->get_fsid(),
5707 service.get_osdmap_epoch(),
5708 MOSDPing::PING,
5709 now,
5710 mnow,
5711 mnow,
5712 service.get_up_epoch(),
5713 cct->_conf->osd_heartbeat_min_size,
5714 delta_ub));
7c673cae
FG
5715 }
5716
5717 logger->set(l_osd_hb_to, heartbeat_peers.size());
5718
5719 // hmm.. am i all alone?
5720 dout(30) << "heartbeat lonely?" << dendl;
5721 if (heartbeat_peers.empty()) {
5722 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5723 last_mon_heartbeat = now;
5724 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
9f95a23c 5725 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
5726 }
5727 }
5728
5729 dout(30) << "heartbeat done" << dendl;
5730}
5731
5732bool OSD::heartbeat_reset(Connection *con)
5733{
11fdf7f2
TL
5734 std::lock_guard l(heartbeat_lock);
5735 auto s = con->get_priv();
9f95a23c 5736 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
11fdf7f2 5737 con->set_priv(nullptr);
7c673cae 5738 if (s) {
7c673cae 5739 if (is_stopping()) {
7c673cae
FG
5740 return true;
5741 }
9f95a23c
TL
5742 auto session = static_cast<Session*>(s.get());
5743 auto p = heartbeat_peers.find(session->peer);
7c673cae
FG
5744 if (p != heartbeat_peers.end() &&
5745 (p->second.con_back == con ||
5746 p->second.con_front == con)) {
5747 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5748 << ", reopening" << dendl;
9f95a23c 5749 p->second.clear_mark_down(con);
7c673cae
FG
5750 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5751 if (newcon.first) {
5752 p->second.con_back = newcon.first.get();
11fdf7f2 5753 p->second.con_back->set_priv(s);
7c673cae
FG
5754 if (newcon.second) {
5755 p->second.con_front = newcon.second.get();
11fdf7f2 5756 p->second.con_front->set_priv(s);
7c673cae 5757 }
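      // replies to pings sent on the old, now-closed connections can no
      // longer be matched up, so drop the pending ping history instead of
      // letting it count as missed acks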
11fdf7f2 5758 p->second.ping_history.clear();
7c673cae
FG
5759 } else {
5760 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5761 << ", raced with osdmap update, closing out peer" << dendl;
5762 heartbeat_peers.erase(p);
5763 }
5764 } else {
5765 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5766 }
7c673cae
FG
5767 }
5768 return true;
5769}
5770
5771
5772
5773// =========================================
5774
5775void OSD::tick()
5776{
9f95a23c 5777 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
5778 dout(10) << "tick" << dendl;
5779
9f95a23c
TL
5780 utime_t now = ceph_clock_now();
5781 // throw out any obsolete markdown log
5782 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5783 while (!osd_markdown_log.empty() &&
5784 osd_markdown_log.front() + grace < now)
5785 osd_markdown_log.pop_front();
5786
7c673cae
FG
5787 if (is_active() || is_waiting_for_healthy()) {
5788 maybe_update_heartbeat_peers();
5789 }
5790
5791 if (is_waiting_for_healthy()) {
5792 start_boot();
494da23a
TL
5793 }
5794
5795 if (is_waiting_for_healthy() || is_booting()) {
5796 std::lock_guard l(heartbeat_lock);
494da23a
TL
5797 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5798 last_mon_heartbeat = now;
5799 dout(1) << __func__ << " checking mon for new map" << dendl;
9f95a23c 5800 osdmap_subscribe(get_osdmap_epoch() + 1, false);
11fdf7f2 5801 }
7c673cae
FG
5802 }
5803
5804 do_waiters();
5805
9f95a23c
TL
 5806 // periodically scrub purged_snaps (scheduled off osd_scrub_min_interval with randomization)
5807 {
5808 const utime_t last = superblock.last_purged_snaps_scrub;
5809 utime_t next = last;
5810 next += cct->_conf->osd_scrub_min_interval;
5811 std::mt19937 rng;
5812 // use a seed that is stable for each scrub interval, but varies
5813 // by OSD to avoid any herds.
5814 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
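    // r is a random fraction in [0, 1) used to spread the next scrub across
    // the randomize window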
 5815 double r = (rng() % 1024) / 1024.0;
5816 next +=
5817 cct->_conf->osd_scrub_min_interval *
5818 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5819 if (next < ceph_clock_now()) {
5820 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5821 << " next " << next << " ... now" << dendl;
5822 scrub_purged_snaps();
5823 } else {
5824 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5825 << " next " << next << dendl;
5826 }
5827 }
5828
91327a77 5829 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
5830}
5831
5832void OSD::tick_without_osd_lock()
5833{
9f95a23c 5834 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
7c673cae
FG
5835 dout(10) << "tick_without_osd_lock" << dendl;
5836
7c673cae
FG
5837 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5838 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5839 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
11fdf7f2
TL
5840
5841 // refresh osd stats
5842 struct store_statfs_t stbuf;
5843 osd_alert_list_t alerts;
5844 int r = store->statfs(&stbuf, &alerts);
5845 ceph_assert(r == 0);
5846 service.set_statfs(stbuf, alerts);
7c673cae
FG
5847
5848 // osd_lock is not being held, which means the OSD state
5849 // might change when doing the monitor report
5850 if (is_active() || is_waiting_for_healthy()) {
9f95a23c
TL
5851 {
5852 std::lock_guard l{heartbeat_lock};
5853 heartbeat_check();
5854 }
5855 map_lock.lock_shared();
11fdf7f2 5856 std::lock_guard l(mon_report_lock);
7c673cae
FG
5857
5858 // mon report?
7c673cae 5859 utime_t now = ceph_clock_now();
11fdf7f2
TL
5860 if (service.need_fullness_update() ||
5861 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
7c673cae 5862 last_mon_report = now;
7c673cae
FG
5863 send_full_update();
5864 send_failures();
7c673cae 5865 }
9f95a23c 5866 map_lock.unlock_shared();
11fdf7f2
TL
5867
5868 epoch_t max_waiting_epoch = 0;
5869 for (auto s : shards) {
5870 max_waiting_epoch = std::max(max_waiting_epoch,
5871 s->get_max_waiting_epoch());
5872 }
5873 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
5874 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
5875 << ", requesting new map" << dendl;
5876 osdmap_subscribe(superblock.newest_map + 1, false);
5877 }
7c673cae
FG
5878 }
5879
5880 if (is_active()) {
5881 if (!scrub_random_backoff()) {
5882 sched_scrub();
5883 }
5884 service.promote_throttle_recalibrate();
3efd9988 5885 resume_creating_pg();
224ce89b
WB
5886 bool need_send_beacon = false;
5887 const auto now = ceph::coarse_mono_clock::now();
5888 {
 5889 // borrow the lec (last_epoch_clean) lock to protect last_sent_beacon from changing
11fdf7f2 5890 std::lock_guard l{min_last_epoch_clean_lock};
224ce89b
WB
5891 const auto elapsed = now - last_sent_beacon;
5892 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5893 cct->_conf->osd_beacon_report_interval) {
5894 need_send_beacon = true;
5895 }
5896 }
5897 if (need_send_beacon) {
5898 send_beacon(now);
5899 }
7c673cae
FG
5900 }
5901
11fdf7f2 5902 mgrc.update_daemon_health(get_health_metrics());
7c673cae 5903 service.kick_recovery_queue();
91327a77
AA
5904 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
5905 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
5906}
5907
7c673cae
FG
5908// Usage:
5909// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5910// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5911// setomapheader <pool-id> [namespace/]<obj-name> <header>
5912// getomap <pool> [namespace/]<obj-name>
5913// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5914// injectmdataerr [namespace/]<obj-name> [shardid]
5915// injectdataerr [namespace/]<obj-name> [shardid]
5916//
5917// set_recovery_delay [utime]
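// These test ops are reached through the OSD admin socket; a hypothetical
// invocation (names and values below are examples only) would look like:
//   ceph daemon osd.0 setomapval 1 myns/myobj mykey myval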
5918void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
5919 std::string_view command,
5920 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
5921{
5922 //Test support
5923 //Support changing the omap on a single osd by using the Admin Socket to
5924 //directly request the osd make a change.
5925 if (command == "setomapval" || command == "rmomapkey" ||
5926 command == "setomapheader" || command == "getomap" ||
5927 command == "truncobj" || command == "injectmdataerr" ||
5928 command == "injectdataerr"
5929 ) {
5930 pg_t rawpg;
5931 int64_t pool;
5932 OSDMapRef curmap = service->get_osdmap();
5933 int r = -1;
5934
5935 string poolstr;
5936
9f95a23c 5937 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
5938 pool = curmap->lookup_pg_pool_name(poolstr);
5939 //If we can't find it by name then maybe id specified
5940 if (pool < 0 && isdigit(poolstr[0]))
5941 pool = atoll(poolstr.c_str());
5942 if (pool < 0) {
b5b8bbf5 5943 ss << "Invalid pool '" << poolstr << "'";
7c673cae
FG
5944 return;
5945 }
5946
5947 string objname, nspace;
9f95a23c 5948 cmd_getval(cmdmap, "objname", objname);
7c673cae
FG
5949 std::size_t found = objname.find_first_of('/');
5950 if (found != string::npos) {
5951 nspace = objname.substr(0, found);
5952 objname = objname.substr(found+1);
5953 }
5954 object_locator_t oloc(pool, nspace);
5955 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5956
5957 if (r < 0) {
5958 ss << "Invalid namespace/objname";
5959 return;
5960 }
5961
5962 int64_t shardid;
9f95a23c 5963 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
7c673cae
FG
5964 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5965 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5966 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5967 if (curmap->pg_is_ec(rawpg)) {
5968 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5969 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5970 return;
5971 }
5972 }
5973
5974 ObjectStore::Transaction t;
5975
5976 if (command == "setomapval") {
5977 map<string, bufferlist> newattrs;
5978 bufferlist val;
5979 string key, valstr;
9f95a23c
TL
5980 cmd_getval(cmdmap, "key", key);
5981 cmd_getval(cmdmap, "val", valstr);
7c673cae
FG
5982
5983 val.append(valstr);
5984 newattrs[key] = val;
5985 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 5986 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5987 if (r < 0)
5988 ss << "error=" << r;
5989 else
5990 ss << "ok";
5991 } else if (command == "rmomapkey") {
5992 string key;
9f95a23c 5993 cmd_getval(cmdmap, "key", key);
7c673cae 5994
9f95a23c 5995 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
11fdf7f2 5996 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
5997 if (r < 0)
5998 ss << "error=" << r;
5999 else
6000 ss << "ok";
6001 } else if (command == "setomapheader") {
6002 bufferlist newheader;
6003 string headerstr;
6004
9f95a23c 6005 cmd_getval(cmdmap, "header", headerstr);
7c673cae
FG
6006 newheader.append(headerstr);
6007 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 6008 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6009 if (r < 0)
6010 ss << "error=" << r;
6011 else
6012 ss << "ok";
6013 } else if (command == "getomap") {
6014 //Debug: Output entire omap
6015 bufferlist hdrbl;
6016 map<string, bufferlist> keyvals;
11fdf7f2
TL
6017 auto ch = store->open_collection(coll_t(pgid));
6018 if (!ch) {
6019 ss << "unable to open collection for " << pgid;
6020 r = -ENOENT;
6021 } else {
6022 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6023 if (r >= 0) {
7c673cae
FG
6024 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6025 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 6026 it != keyvals.end(); ++it)
7c673cae
FG
6027 ss << " key=" << (*it).first << " val="
6028 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 6029 } else {
7c673cae 6030 ss << "error=" << r;
11fdf7f2 6031 }
7c673cae
FG
6032 }
6033 } else if (command == "truncobj") {
6034 int64_t trunclen;
9f95a23c 6035 cmd_getval(cmdmap, "len", trunclen);
7c673cae 6036 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 6037 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6038 if (r < 0)
6039 ss << "error=" << r;
6040 else
6041 ss << "ok";
6042 } else if (command == "injectdataerr") {
6043 store->inject_data_error(gobj);
6044 ss << "ok";
6045 } else if (command == "injectmdataerr") {
6046 store->inject_mdata_error(gobj);
6047 ss << "ok";
6048 }
6049 return;
6050 }
6051 if (command == "set_recovery_delay") {
6052 int64_t delay;
9f95a23c 6053 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
7c673cae
FG
6054 ostringstream oss;
6055 oss << delay;
11fdf7f2 6056 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
6057 oss.str().c_str());
6058 if (r != 0) {
6059 ss << "set_recovery_delay: error setting "
6060 << "osd_recovery_delay_start to '" << delay << "': error "
6061 << r;
6062 return;
6063 }
11fdf7f2 6064 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
6065 ss << "set_recovery_delay: set osd_recovery_delay_start "
6066 << "to " << service->cct->_conf->osd_recovery_delay_start;
6067 return;
6068 }
7c673cae
FG
6069 if (command == "injectfull") {
6070 int64_t count;
6071 string type;
6072 OSDService::s_names state;
9f95a23c
TL
6073 cmd_getval(cmdmap, "type", type, string("full"));
6074 cmd_getval(cmdmap, "count", count, (int64_t)-1);
7c673cae
FG
6075 if (type == "none" || count == 0) {
6076 type = "none";
6077 count = 0;
6078 }
6079 state = service->get_full_state(type);
6080 if (state == OSDService::s_names::INVALID) {
6081 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6082 return;
6083 }
6084 service->set_injectfull(state, count);
6085 return;
6086 }
6087 ss << "Internal error - command=" << command;
6088}
6089
7c673cae
FG
6090// =========================================
6091
6092void OSD::ms_handle_connect(Connection *con)
6093{
6094 dout(10) << __func__ << " con " << con << dendl;
6095 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 6096 std::lock_guard l(osd_lock);
7c673cae
FG
6097 if (is_stopping())
6098 return;
6099 dout(10) << __func__ << " on mon" << dendl;
6100
6101 if (is_preboot()) {
6102 start_boot();
6103 } else if (is_booting()) {
6104 _send_boot(); // resend boot message
6105 } else {
9f95a23c 6106 map_lock.lock_shared();
11fdf7f2 6107 std::lock_guard l2(mon_report_lock);
7c673cae
FG
6108
6109 utime_t now = ceph_clock_now();
6110 last_mon_report = now;
6111
6112 // resend everything, it's a new session
6113 send_full_update();
6114 send_alive();
6115 service.requeue_pg_temp();
11fdf7f2 6116 service.clear_sent_ready_to_merge();
7c673cae 6117 service.send_pg_temp();
11fdf7f2
TL
6118 service.send_ready_to_merge();
6119 service.send_pg_created();
7c673cae
FG
6120 requeue_failures();
6121 send_failures();
7c673cae 6122
9f95a23c 6123 map_lock.unlock_shared();
7c673cae
FG
6124 if (is_active()) {
6125 send_beacon(ceph::coarse_mono_clock::now());
6126 }
6127 }
6128
6129 // full map requests may happen while active or pre-boot
6130 if (requested_full_first) {
6131 rerequest_full_maps();
6132 }
6133 }
6134}
6135
6136void OSD::ms_handle_fast_connect(Connection *con)
6137{
6138 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6139 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6140 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6141 s = ceph::make_ref<Session>(cct, con);
6142 con->set_priv(s);
7c673cae
FG
6143 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6144 << " addr=" << s->con->get_peer_addr() << dendl;
6145 // we don't connect to clients
11fdf7f2 6146 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6147 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6148 }
7c673cae
FG
6149 }
6150}
6151
6152void OSD::ms_handle_fast_accept(Connection *con)
6153{
6154 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6155 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6156 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6157 s = ceph::make_ref<Session>(cct, con);
6158 con->set_priv(s);
7c673cae
FG
 6159 dout(10) << "new session (incoming) " << s << " con=" << con
6160 << " addr=" << con->get_peer_addr()
6161 << " must have raced with connect" << dendl;
11fdf7f2 6162 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6163 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6164 }
7c673cae
FG
6165 }
6166}
6167
6168bool OSD::ms_handle_reset(Connection *con)
6169{
9f95a23c
TL
6170 auto session = ceph::ref_cast<Session>(con->get_priv());
6171 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
7c673cae
FG
6172 if (!session)
6173 return false;
6174 session->wstate.reset(con);
11fdf7f2
TL
6175 session->con->set_priv(nullptr);
6176 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
6177 // note that we break session->con *before* the session_handle_reset
6178 // cleanup below. this avoids a race between us and
6179 // PG::add_backoff, Session::check_backoff, etc.
9f95a23c 6180 session_handle_reset(session);
7c673cae
FG
6181 return true;
6182}
6183
6184bool OSD::ms_handle_refused(Connection *con)
6185{
6186 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6187 return false;
6188
9f95a23c
TL
6189 auto session = ceph::ref_cast<Session>(con->get_priv());
6190 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
7c673cae
FG
6191 if (!session)
6192 return false;
6193 int type = con->get_peer_type();
6194 // handle only OSD failures here
6195 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6196 OSDMapRef osdmap = get_osdmap();
6197 if (osdmap) {
6198 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6199 if (id >= 0 && osdmap->is_up(id)) {
 6200 // I'm cheating the mon heartbeat grace logic, because we know it's not going
 6201 // to respawn on its own. +1 so we won't hit any boundary case.
11fdf7f2
TL
6202 monc->send_mon_message(
6203 new MOSDFailure(
6204 monc->get_fsid(),
6205 id,
6206 osdmap->get_addrs(id),
6207 cct->_conf->osd_heartbeat_grace + 1,
6208 osdmap->get_epoch(),
6209 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6210 ));
7c673cae
FG
6211 }
6212 }
6213 }
7c673cae
FG
6214 return true;
6215}
6216
6217struct C_OSD_GetVersion : public Context {
6218 OSD *osd;
6219 uint64_t oldest, newest;
6220 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6221 void finish(int r) override {
6222 if (r >= 0)
6223 osd->_got_mon_epochs(oldest, newest);
6224 }
6225};
6226
6227void OSD::start_boot()
6228{
6229 if (!_is_healthy()) {
6230 // if we are not healthy, do not mark ourselves up (yet)
6231 dout(1) << "not healthy; waiting to boot" << dendl;
6232 if (!is_waiting_for_healthy())
6233 start_waiting_for_healthy();
6234 // send pings sooner rather than later
6235 heartbeat_kick();
6236 return;
6237 }
6238 dout(1) << __func__ << dendl;
6239 set_state(STATE_PREBOOT);
6240 dout(10) << "start_boot - have maps " << superblock.oldest_map
6241 << ".." << superblock.newest_map << dendl;
6242 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
6243 monc->get_version("osdmap", &c->newest, &c->oldest, c);
6244}
6245
6246void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6247{
11fdf7f2 6248 std::lock_guard l(osd_lock);
7c673cae
FG
6249 if (is_preboot()) {
6250 _preboot(oldest, newest);
6251 }
6252}
6253
6254void OSD::_preboot(epoch_t oldest, epoch_t newest)
6255{
11fdf7f2 6256 ceph_assert(is_preboot());
7c673cae
FG
6257 dout(10) << __func__ << " _preboot mon has osdmaps "
6258 << oldest << ".." << newest << dendl;
6259
6260 // ensure our local fullness awareness is accurate
81eedcae
TL
6261 {
6262 std::lock_guard l(heartbeat_lock);
6263 heartbeat();
6264 }
7c673cae 6265
9f95a23c
TL
6266 const auto& monmap = monc->monmap;
6267 const auto osdmap = get_osdmap();
7c673cae 6268 // if our map is within recent history, try to add ourselves to the osdmap.
31f18b77
FG
6269 if (osdmap->get_epoch() == 0) {
6270 derr << "waiting for initial osdmap" << dendl;
c07f9fc5 6271 } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
6272 derr << "osdmap says I am destroyed" << dendl;
6273 // provide a small margin so we don't livelock seeing if we
6274 // un-destroyed ourselves.
6275 if (osdmap->get_epoch() > newest - 1) {
6276 exit(0);
6277 }
81eedcae 6278 } else if (osdmap->is_noup(whoami)) {
7c673cae
FG
6279 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6280 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6281 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6282 << dendl;
9f95a23c 6283 } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
11fdf7f2 6284 derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
7c673cae 6285 << dendl;
7c673cae
FG
6286 } else if (service.need_fullness_update()) {
6287 derr << "osdmap fullness state needs update" << dendl;
6288 send_full_update();
9f95a23c
TL
6289 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6290 superblock.purged_snaps_last < superblock.current_epoch) {
6291 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6292 << " < newest_map " << superblock.current_epoch << dendl;
6293 _get_purged_snaps();
7c673cae
FG
6294 } else if (osdmap->get_epoch() >= oldest - 1 &&
6295 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
11fdf7f2
TL
6296
6297 // wait for pgs to fully catch up in a different thread, since
6298 // this thread might be required for splitting and merging PGs to
6299 // make progress.
6300 boot_finisher.queue(
9f95a23c 6301 new LambdaContext(
11fdf7f2 6302 [this](int r) {
9f95a23c 6303 std::unique_lock l(osd_lock);
11fdf7f2
TL
6304 if (is_preboot()) {
6305 dout(10) << __func__ << " waiting for peering work to drain"
6306 << dendl;
9f95a23c 6307 l.unlock();
11fdf7f2 6308 for (auto shard : shards) {
9f95a23c 6309 shard->wait_min_pg_epoch(get_osdmap_epoch());
11fdf7f2 6310 }
9f95a23c 6311 l.lock();
11fdf7f2
TL
6312 }
6313 if (is_preboot()) {
6314 _send_boot();
6315 }
6316 }));
6317 return;
7c673cae
FG
6318 }
6319
6320 // get all the latest maps
6321 if (osdmap->get_epoch() + 1 >= oldest)
6322 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6323 else
6324 osdmap_subscribe(oldest - 1, true);
6325}
6326
9f95a23c
TL
6327void OSD::_get_purged_snaps()
6328{
 6329 // NOTE: this is a naive, stateless implementation. It may send multiple
6330 // overlapping requests to the mon, which will be somewhat inefficient, but
6331 // it should be reliable.
6332 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6333 << ", newest_map " << superblock.current_epoch << dendl;
6334 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6335 superblock.purged_snaps_last + 1,
6336 superblock.current_epoch + 1);
6337 monc->send_mon_message(m);
6338}
6339
6340void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6341{
6342 dout(10) << __func__ << " " << *m << dendl;
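  // record the reported purged snaps via the SnapMapper, persist the advanced
  // purged_snaps_last in the superblock, then either fetch the next range or
  // resume boot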
6343 ObjectStore::Transaction t;
6344 if (!is_preboot() ||
6345 m->last < superblock.purged_snaps_last) {
6346 goto out;
6347 }
6348 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6349 make_purged_snaps_oid(), &t,
6350 m->purged_snaps);
6351 superblock.purged_snaps_last = m->last;
6352 write_superblock(t);
6353 store->queue_transaction(
6354 service.meta_ch,
6355 std::move(t));
6356 service.publish_superblock(superblock);
6357 if (m->last < superblock.current_epoch) {
6358 _get_purged_snaps();
6359 } else {
6360 start_boot();
6361 }
6362out:
6363 m->put();
6364}
6365
7c673cae
FG
6366void OSD::send_full_update()
6367{
6368 if (!service.need_fullness_update())
6369 return;
6370 unsigned state = 0;
6371 if (service.is_full()) {
6372 state = CEPH_OSD_FULL;
6373 } else if (service.is_backfillfull()) {
6374 state = CEPH_OSD_BACKFILLFULL;
6375 } else if (service.is_nearfull()) {
6376 state = CEPH_OSD_NEARFULL;
6377 }
6378 set<string> s;
6379 OSDMap::calc_state_set(state, s);
6380 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6381 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6382}
6383
6384void OSD::start_waiting_for_healthy()
6385{
6386 dout(1) << "start_waiting_for_healthy" << dendl;
6387 set_state(STATE_WAITING_FOR_HEALTHY);
6388 last_heartbeat_resample = utime_t();
181888fb
FG
6389
6390 // subscribe to osdmap updates, in case our peers really are known to be dead
9f95a23c 6391 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
6392}
6393
6394bool OSD::_is_healthy()
6395{
6396 if (!cct->get_heartbeat_map()->is_healthy()) {
6397 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6398 return false;
6399 }
6400
6401 if (is_waiting_for_healthy()) {
11fdf7f2 6402 utime_t now = ceph_clock_now();
9f95a23c
TL
6403 if (osd_markdown_log.empty()) {
6404 dout(5) << __func__ << " force returning true since last markdown"
6405 << " was " << cct->_conf->osd_max_markdown_period
6406 << "s ago" << dendl;
11fdf7f2
TL
6407 return true;
6408 }
6409 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6410 int num = 0, up = 0;
6411 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6412 p != heartbeat_peers.end();
6413 ++p) {
11fdf7f2 6414 if (p->second.is_healthy(now))
7c673cae
FG
6415 ++up;
6416 ++num;
6417 }
6418 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6419 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6420 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6421 return false;
6422 }
6423 }
6424
6425 return true;
6426}
6427
6428void OSD::_send_boot()
6429{
6430 dout(10) << "_send_boot" << dendl;
11fdf7f2
TL
6431 Connection *local_connection =
6432 cluster_messenger->get_loopback_connection().get();
6433 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6434 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6435 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6436 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6437
6438 dout(20) << " initial client_addrs " << client_addrs
6439 << ", cluster_addrs " << cluster_addrs
6440 << ", hb_back_addrs " << hb_back_addrs
6441 << ", hb_front_addrs " << hb_front_addrs
6442 << dendl;
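  // messengers that still have unknown (unbound) addresses inherit them from
  // the client/cluster addrs here, so the MOSDBoot below advertises complete
  // address vectors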
6443 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6444 dout(10) << " assuming cluster_addrs match client_addrs "
6445 << client_addrs << dendl;
6446 cluster_addrs = cluster_messenger->get_myaddrs();
6447 }
6448 if (auto session = local_connection->get_priv(); !session) {
6449 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6450 }
6451
7c673cae 6452 local_connection = hb_back_server_messenger->get_loopback_connection().get();
11fdf7f2
TL
6453 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6454 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6455 << cluster_addrs << dendl;
6456 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
7c673cae 6457 }
11fdf7f2
TL
6458 if (auto session = local_connection->get_priv(); !session) {
6459 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6460 }
6461
11fdf7f2
TL
6462 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6463 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6464 dout(10) << " assuming hb_front_addrs match client_addrs "
6465 << client_addrs << dendl;
6466 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6467 }
6468 if (auto session = local_connection->get_priv(); !session) {
6469 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6470 }
6471
6472 // we now know what our front and back addrs will be, and we are
6473 // about to tell the mon what our metadata (including numa bindings)
6474 // are, so now is a good time!
6475 set_numa_affinity();
6476
6477 MOSDBoot *mboot = new MOSDBoot(
6478 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6479 hb_back_addrs, hb_front_addrs, cluster_addrs,
6480 CEPH_FEATURES_ALL);
6481 dout(10) << " final client_addrs " << client_addrs
6482 << ", cluster_addrs " << cluster_addrs
6483 << ", hb_back_addrs " << hb_back_addrs
6484 << ", hb_front_addrs " << hb_front_addrs
7c673cae
FG
6485 << dendl;
6486 _collect_metadata(&mboot->metadata);
6487 monc->send_mon_message(mboot);
6488 set_state(STATE_BOOTING);
6489}
6490
6491void OSD::_collect_metadata(map<string,string> *pm)
6492{
6493 // config info
6494 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6495 if (store->get_type() == "filestore") {
6496 // not applicable for bluestore
6497 (*pm)["osd_journal"] = journal_path;
6498 }
11fdf7f2
TL
6499 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6500 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6501 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6502 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
7c673cae
FG
6503
6504 // backend
6505 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6506 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6507 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6508 (*pm)["default_device_class"] = store->get_default_device_class();
7c673cae
FG
6509 store->collect_metadata(pm);
6510
6511 collect_sys_info(pm, cct);
6512
11fdf7f2
TL
6513 (*pm)["front_iface"] = pick_iface(
6514 cct,
6515 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6516 (*pm)["back_iface"] = pick_iface(
6517 cct,
6518 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6519
6520 // network numa
6521 {
6522 int node = -1;
6523 set<int> nodes;
6524 set<string> unknown;
6525 for (auto nm : { "front_iface", "back_iface" }) {
6526 if (!(*pm)[nm].size()) {
6527 unknown.insert(nm);
6528 continue;
6529 }
6530 int n = -1;
6531 int r = get_iface_numa_node((*pm)[nm], &n);
6532 if (r < 0) {
6533 unknown.insert((*pm)[nm]);
6534 continue;
6535 }
6536 nodes.insert(n);
6537 if (node < 0) {
6538 node = n;
6539 }
6540 }
6541 if (unknown.size()) {
6542 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6543 }
6544 if (!nodes.empty()) {
6545 (*pm)["network_numa_nodes"] = stringify(nodes);
6546 }
6547 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6548 (*pm)["network_numa_node"] = stringify(node);
6549 }
6550 }
6551
6552 if (numa_node >= 0) {
6553 (*pm)["numa_node"] = stringify(numa_node);
6554 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6555 &numa_cpu_set);
6556 }
6557
6558 set<string> devnames;
6559 store->get_devices(&devnames);
9f95a23c
TL
6560 map<string,string> errs;
6561 get_device_metadata(devnames, pm, &errs);
6562 for (auto& i : errs) {
6563 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
11fdf7f2 6564 }
7c673cae
FG
6565 dout(10) << __func__ << " " << *pm << dendl;
6566}
6567
6568void OSD::queue_want_up_thru(epoch_t want)
6569{
9f95a23c
TL
6570 std::shared_lock map_locker{map_lock};
6571 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6572 std::lock_guard report_locker(mon_report_lock);
7c673cae
FG
6573 if (want > up_thru_wanted) {
6574 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6575 << ", currently " << cur
6576 << dendl;
6577 up_thru_wanted = want;
6578 send_alive();
6579 } else {
6580 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6581 << ", currently " << cur
6582 << dendl;
6583 }
7c673cae
FG
6584}
6585
6586void OSD::send_alive()
6587{
9f95a23c
TL
6588 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6589 const auto osdmap = get_osdmap();
7c673cae
FG
6590 if (!osdmap->exists(whoami))
6591 return;
6592 epoch_t up_thru = osdmap->get_up_thru(whoami);
6593 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6594 if (up_thru_wanted > up_thru) {
6595 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6596 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6597 }
6598}
6599
6600void OSD::request_full_map(epoch_t first, epoch_t last)
6601{
6602 dout(10) << __func__ << " " << first << ".." << last
6603 << ", previously requested "
6604 << requested_full_first << ".." << requested_full_last << dendl;
9f95a23c 6605 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
6606 ceph_assert(first > 0 && last > 0);
6607 ceph_assert(first <= last);
6608 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6609 if (requested_full_first == 0) {
6610 // first request
6611 requested_full_first = first;
6612 requested_full_last = last;
6613 } else if (last <= requested_full_last) {
6614 // dup
6615 return;
6616 } else {
6617 // additional request
6618 first = requested_full_last + 1;
6619 requested_full_last = last;
6620 }
6621 MMonGetOSDMap *req = new MMonGetOSDMap;
6622 req->request_full(first, last);
6623 monc->send_mon_message(req);
6624}
6625
6626void OSD::got_full_map(epoch_t e)
6627{
11fdf7f2 6628 ceph_assert(requested_full_first <= requested_full_last);
9f95a23c 6629 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6630 if (requested_full_first == 0) {
6631 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6632 return;
6633 }
6634 if (e < requested_full_first) {
6635 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6636 << ".." << requested_full_last
6637 << ", ignoring" << dendl;
6638 return;
6639 }
6640 if (e >= requested_full_last) {
6641 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6642 << ".." << requested_full_last << ", resetting" << dendl;
6643 requested_full_first = requested_full_last = 0;
6644 return;
6645 }
6646
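  // e.g. (illustrative): with 10..20 outstanding, receiving e=14 advances
  // requested_full_first to 15; receiving e=20 would already have reset the
  // range above, marking the request complete.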
6647 requested_full_first = e + 1;
6648
6649 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6650 << ".." << requested_full_last
6651 << ", still need more" << dendl;
6652}
6653
6654void OSD::requeue_failures()
6655{
11fdf7f2 6656 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6657 unsigned old_queue = failure_queue.size();
6658 unsigned old_pending = failure_pending.size();
11fdf7f2 6659 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6660 failure_queue[p->first] = p->second.first;
6661 failure_pending.erase(p++);
6662 }
6663 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6664 << failure_queue.size() << dendl;
6665}
6666
6667void OSD::send_failures()
6668{
9f95a23c
TL
6669 ceph_assert(ceph_mutex_is_locked(map_lock));
6670 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
11fdf7f2 6671 std::lock_guard l(heartbeat_lock);
7c673cae 6672 utime_t now = ceph_clock_now();
9f95a23c 6673 const auto osdmap = get_osdmap();
7c673cae
FG
6674 while (!failure_queue.empty()) {
6675 int osd = failure_queue.begin()->first;
7c673cae
FG
6676 if (!failure_pending.count(osd)) {
6677 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6678 monc->send_mon_message(
6679 new MOSDFailure(
6680 monc->get_fsid(),
6681 osd,
6682 osdmap->get_addrs(osd),
6683 failed_for,
6684 osdmap->get_epoch()));
6685 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6686 osdmap->get_addrs(osd));
7c673cae
FG
6687 }
6688 failure_queue.erase(osd);
6689 }
6690}
6691
11fdf7f2 6692void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6693{
11fdf7f2
TL
6694 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6695 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6696 monc->send_mon_message(m);
6697}
6698
11fdf7f2 6699void OSD::cancel_pending_failures()
7c673cae 6700{
11fdf7f2
TL
6701 std::lock_guard l(heartbeat_lock);
6702 auto it = failure_pending.begin();
6703 while (it != failure_pending.end()) {
6704 dout(10) << __func__ << " canceling in-flight failure report for osd."
6705 << it->first << dendl;
9f95a23c 6706 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 6707 failure_pending.erase(it++);
7c673cae 6708 }
7c673cae
FG
6709}
6710
6711void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6712{
6713 const auto& monmap = monc->monmap;
 6714 // we may be called right after connecting, before the monmap has been
 6715 // initialized; only send the beacon once we have a usable monmap.
6716 if (monmap.epoch > 0 &&
6717 monmap.get_required_features().contains_all(
6718 ceph::features::mon::FEATURE_LUMINOUS)) {
6719 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6720 MOSDBeacon* beacon = nullptr;
6721 {
11fdf7f2 6722 std::lock_guard l{min_last_epoch_clean_lock};
9f95a23c
TL
6723 beacon = new MOSDBeacon(get_osdmap_epoch(),
6724 min_last_epoch_clean,
6725 superblock.last_purged_snaps_scrub);
494da23a 6726 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 6727 last_sent_beacon = now;
7c673cae
FG
6728 }
6729 monc->send_mon_message(beacon);
6730 } else {
6731 dout(20) << __func__ << " not sending" << dendl;
6732 }
6733}
6734
7c673cae
FG
6735void OSD::handle_command(MCommand *m)
6736{
6737 ConnectionRef con = m->get_connection();
9f95a23c 6738 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 6739 if (!session) {
9f95a23c 6740 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6741 m->put();
6742 return;
6743 }
9f95a23c
TL
6744 if (!session->caps.allow_all()) {
6745 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
6746 m->put();
6747 return;
6748 }
9f95a23c 6749 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
6750 m->put();
6751}
6752
f64942e4
AA
6753namespace {
6754 class unlock_guard {
9f95a23c 6755 ceph::mutex& m;
f64942e4 6756 public:
9f95a23c 6757 explicit unlock_guard(ceph::mutex& mutex)
f64942e4
AA
6758 : m(mutex)
6759 {
11fdf7f2 6760 m.unlock();
f64942e4
AA
6761 }
6762 unlock_guard(unlock_guard&) = delete;
6763 ~unlock_guard() {
11fdf7f2 6764 m.lock();
f64942e4
AA
6765 }
6766 };
6767}
6768
9f95a23c 6769void OSD::scrub_purged_snaps()
7c673cae 6770{
9f95a23c
TL
6771 dout(10) << __func__ << dendl;
6772 ceph_assert(ceph_mutex_is_locked(osd_lock));
6773 SnapMapper::Scrubber s(cct, store, service.meta_ch,
6774 make_snapmapper_oid(),
6775 make_purged_snaps_oid());
6776 clog->debug() << "purged_snaps scrub starts";
6777 osd_lock.unlock();
6778 s.run();
6779 if (s.stray.size()) {
6780 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
6781 } else {
6782 clog->debug() << "purged_snaps scrub ok";
224ce89b 6783 }
9f95a23c
TL
6784 set<pair<spg_t,snapid_t>> queued;
6785 for (auto& [pool, snap, hash, shard] : s.stray) {
6786 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
6787 if (!pi) {
6788 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
6789 continue;
11fdf7f2 6790 }
9f95a23c
TL
6791 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
6792 spg_t spgid(pgid, shard);
6793 pair<spg_t,snapid_t> p(spgid, snap);
6794 if (queued.count(p)) {
6795 dout(20) << __func__ << " pg " << spgid << " snap " << snap
6796 << " already queued" << dendl;
6797 continue;
11fdf7f2 6798 }
9f95a23c
TL
6799 PGRef pg = lookup_lock_pg(spgid);
6800 if (!pg) {
6801 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
6802 continue;
11fdf7f2 6803 }
9f95a23c
TL
6804 queued.insert(p);
6805 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
6806 << snap << dendl;
6807 pg->queue_snap_retrim(snap);
6808 pg->unlock();
7c673cae 6809 }
9f95a23c
TL
6810 osd_lock.lock();
6811 if (is_stopping()) {
6812 return;
6813 }
6814 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
6815 ObjectStore::Transaction t;
6816 superblock.last_purged_snaps_scrub = ceph_clock_now();
6817 write_superblock(t);
6818 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
6819 ceph_assert(tr == 0);
6820 if (is_active()) {
6821 send_beacon(ceph::coarse_mono_clock::now());
6822 }
6823 dout(10) << __func__ << " done" << dendl;
11fdf7f2
TL
6824}
6825
6826void OSD::probe_smart(const string& only_devid, ostream& ss)
6827{
6828 set<string> devnames;
6829 store->get_devices(&devnames);
6830 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6831 "osd_smart_report_timeout");
6832
6833 // == typedef std::map<std::string, mValue> mObject;
6834 json_spirit::mObject json_map;
6835
6836 for (auto dev : devnames) {
6837 // smartctl works only on physical devices; filter out any logical device
6838 if (dev.find("dm-") == 0) {
6839 continue;
6840 }
6841
6842 string err;
6843 string devid = get_device_id(dev, &err);
6844 if (devid.size() == 0) {
6845 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6846 << err << "), skipping" << dendl;
6847 continue;
6848 }
6849 if (only_devid.size() && devid != only_devid) {
6850 continue;
6851 }
6852
6853 json_spirit::mValue smart_json;
6854 if (block_device_get_metrics(dev, smart_timeout,
6855 &smart_json)) {
6856 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6857 continue;
6858 }
6859 json_map[devid] = smart_json;
7c673cae 6860 }
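  // The pretty-printed result is a JSON object keyed by device id, roughly
  // of the form (hypothetical): { "<devid>": { ...smart metrics... } }.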
11fdf7f2 6861 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
6862}
6863
6864bool OSD::heartbeat_dispatch(Message *m)
6865{
6866 dout(30) << "heartbeat_dispatch " << m << dendl;
6867 switch (m->get_type()) {
6868
6869 case CEPH_MSG_PING:
6870 dout(10) << "ping from " << m->get_source_inst() << dendl;
6871 m->put();
6872 break;
6873
6874 case MSG_OSD_PING:
6875 handle_osd_ping(static_cast<MOSDPing*>(m));
6876 break;
6877
6878 default:
6879 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6880 m->put();
6881 }
6882
6883 return true;
6884}
6885
6886bool OSD::ms_dispatch(Message *m)
6887{
6888 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6889 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6890 service.got_stop_ack();
6891 m->put();
6892 return true;
6893 }
6894
6895 // lock!
6896
9f95a23c 6897 osd_lock.lock();
7c673cae 6898 if (is_stopping()) {
9f95a23c 6899 osd_lock.unlock();
7c673cae
FG
6900 m->put();
6901 return true;
6902 }
6903
6904 do_waiters();
6905 _dispatch(m);
6906
9f95a23c 6907 osd_lock.unlock();
7c673cae
FG
6908
6909 return true;
6910}
6911
9f95a23c
TL
6912void OSDService::maybe_share_map(
6913 Connection *con,
6914 const OSDMapRef& osdmap,
6915 epoch_t peer_epoch_lb)
7c673cae 6916{
9f95a23c
TL
 6917 // NOTE: we assume the caller holds something that keeps the Connection itself
6918 // pinned (e.g., an OpRequest's MessageRef).
6919 auto session = ceph::ref_cast<Session>(con->get_priv());
6920 if (!session) {
7c673cae
FG
6921 return;
6922 }
7c673cae 6923
9f95a23c
TL
6924 // assume the peer has the newer of the op's sent_epoch and what
6925 // we think we sent them.
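  // e.g. (illustrative): if the op claims it was sent at epoch 120 but we
  // last recorded sending this peer epoch 115, bump last_sent_epoch to 120
  // so only maps newer than 120 are shared below.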
7c673cae 6926 session->sent_epoch_lock.lock();
9f95a23c
TL
6927 if (peer_epoch_lb > session->last_sent_epoch) {
6928 dout(10) << __func__ << " con " << con
6929 << " " << con->get_peer_addr()
6930 << " map epoch " << session->last_sent_epoch
6931 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
6932 session->last_sent_epoch = peer_epoch_lb;
6933 }
6934 epoch_t last_sent_epoch = session->last_sent_epoch;
7c673cae
FG
6935 session->sent_epoch_lock.unlock();
6936
9f95a23c
TL
6937 if (osdmap->get_epoch() <= last_sent_epoch) {
6938 return;
6939 }
11fdf7f2 6940
9f95a23c
TL
6941 send_incremental_map(last_sent_epoch, con, osdmap);
6942 last_sent_epoch = osdmap->get_epoch();
7c673cae
FG
6943
6944 session->sent_epoch_lock.lock();
6945 if (session->last_sent_epoch < last_sent_epoch) {
9f95a23c
TL
6946 dout(10) << __func__ << " con " << con
6947 << " " << con->get_peer_addr()
6948 << " map epoch " << session->last_sent_epoch
6949 << " -> " << last_sent_epoch << " (shared)" << dendl;
7c673cae
FG
6950 session->last_sent_epoch = last_sent_epoch;
6951 }
6952 session->sent_epoch_lock.unlock();
7c673cae
FG
6953}
6954
9f95a23c 6955void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7c673cae 6956{
9f95a23c 6957 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7c673cae
FG
6958
6959 auto i = session->waiting_on_map.begin();
6960 while (i != session->waiting_on_map.end()) {
6961 OpRequestRef op = &(*i);
11fdf7f2 6962 ceph_assert(ms_can_fast_dispatch(op->get_req()));
9f95a23c 6963 auto m = op->get_req<MOSDFastDispatchOp>();
7c673cae
FG
6964 if (m->get_min_epoch() > osdmap->get_epoch()) {
6965 break;
6966 }
6967 session->waiting_on_map.erase(i++);
6968 op->put();
6969
6970 spg_t pgid;
6971 if (m->get_type() == CEPH_MSG_OSD_OP) {
6972 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6973 static_cast<const MOSDOp*>(m)->get_pg());
6974 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6975 continue;
6976 }
6977 } else {
6978 pgid = m->get_spg();
6979 }
11fdf7f2 6980 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
6981 }
6982
6983 if (session->waiting_on_map.empty()) {
6984 clear_session_waiting_on_map(session);
6985 } else {
6986 register_session_waiting_on_map(session);
6987 }
6988}
6989
6990void OSD::ms_fast_dispatch(Message *m)
6991{
11fdf7f2 6992 FUNCTRACE(cct);
7c673cae
FG
6993 if (service.is_stopping()) {
6994 m->put();
6995 return;
6996 }
11fdf7f2
TL
6997
6998 // peering event?
6999 switch (m->get_type()) {
7000 case CEPH_MSG_PING:
7001 dout(10) << "ping from " << m->get_source() << dendl;
7002 m->put();
7003 return;
11fdf7f2
TL
7004 case MSG_OSD_FORCE_RECOVERY:
7005 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7006 return;
7007 case MSG_OSD_SCRUB2:
7008 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7009 return;
7010
7011 case MSG_OSD_PG_CREATE2:
7012 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7013 case MSG_OSD_PG_QUERY:
7014 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7015 case MSG_OSD_PG_NOTIFY:
7016 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7017 case MSG_OSD_PG_INFO:
7018 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7019 case MSG_OSD_PG_REMOVE:
7020 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7021
7022 // these are single-pg messages that handle themselves
7023 case MSG_OSD_PG_LOG:
7024 case MSG_OSD_PG_TRIM:
9f95a23c
TL
7025 case MSG_OSD_PG_NOTIFY2:
7026 case MSG_OSD_PG_QUERY2:
7027 case MSG_OSD_PG_INFO2:
11fdf7f2
TL
7028 case MSG_OSD_BACKFILL_RESERVE:
7029 case MSG_OSD_RECOVERY_RESERVE:
9f95a23c
TL
7030 case MSG_OSD_PG_LEASE:
7031 case MSG_OSD_PG_LEASE_ACK:
11fdf7f2
TL
7032 {
7033 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7034 if (require_osd_peer(pm)) {
7035 enqueue_peering_evt(
7036 pm->get_spg(),
7037 PGPeeringEventRef(pm->get_event()));
7038 }
7039 pm->put();
7040 return;
7041 }
7042 }
7043
7c673cae
FG
7044 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7045 {
7046#ifdef WITH_LTTNG
7047 osd_reqid_t reqid = op->get_reqid();
7048#endif
7049 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7050 reqid.name._num, reqid.tid, reqid.inc);
7051 }
7052
7053 if (m->trace)
7054 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7055
11fdf7f2 7056 // note sender epoch, min req's epoch
7c673cae
FG
7057 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7058 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
11fdf7f2 7059 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7c673cae
FG
7060
7061 service.maybe_inject_dispatch_delay();
7062
7063 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7064 m->get_type() != CEPH_MSG_OSD_OP) {
7065 // queue it directly
7066 enqueue_op(
7067 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
11fdf7f2 7068 std::move(op),
7c673cae
FG
7069 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7070 } else {
7071 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7072 // message that didn't have an explicit spg_t); we need to map
7073 // them to an spg_t while preserving delivery order.
11fdf7f2
TL
7074 auto priv = m->get_connection()->get_priv();
7075 if (auto session = static_cast<Session*>(priv.get()); session) {
7076 std::lock_guard l{session->session_dispatch_lock};
7077 op->get();
7078 session->waiting_on_map.push_back(*op);
7079 OSDMapRef nextmap = service.get_nextmap_reserved();
7080 dispatch_session_waiting(session, nextmap);
7081 service.release_map(nextmap);
7c673cae
FG
7082 }
7083 }
7084 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7085}
7086
11fdf7f2 7087int OSD::ms_handle_authentication(Connection *con)
7c673cae 7088{
11fdf7f2 7089 int ret = 0;
9f95a23c 7090 auto s = ceph::ref_cast<Session>(con->get_priv());
11fdf7f2 7091 if (!s) {
9f95a23c
TL
7092 s = ceph::make_ref<Session>(cct, con);
7093 con->set_priv(s);
11fdf7f2
TL
7094 s->entity_name = con->get_peer_entity_name();
7095 dout(10) << __func__ << " new session " << s << " con " << s->con
7096 << " entity " << s->entity_name
7097 << " addr " << con->get_peer_addrs() << dendl;
7098 } else {
7099 dout(10) << __func__ << " existing session " << s << " con " << s->con
7100 << " entity " << s->entity_name
7101 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7102 }
7103
11fdf7f2 7104 AuthCapsInfo &caps_info = con->get_peer_caps_info();
9f95a23c 7105 if (caps_info.allow_all) {
11fdf7f2 7106 s->caps.set_allow_all();
9f95a23c 7107 } else if (caps_info.caps.length() > 0) {
11fdf7f2
TL
7108 bufferlist::const_iterator p = caps_info.caps.cbegin();
7109 string str;
7110 try {
7111 decode(str, p);
7112 }
7113 catch (buffer::error& e) {
7114 dout(10) << __func__ << " session " << s << " " << s->entity_name
7115 << " failed to decode caps string" << dendl;
9f95a23c 7116 ret = -EACCES;
11fdf7f2
TL
7117 }
7118 if (!ret) {
7c673cae 7119 bool success = s->caps.parse(str);
11fdf7f2
TL
7120 if (success) {
7121 dout(10) << __func__ << " session " << s
7122 << " " << s->entity_name
7123 << " has caps " << s->caps << " '" << str << "'" << dendl;
7124 ret = 1;
7125 } else {
7126 dout(10) << __func__ << " session " << s << " " << s->entity_name
7127 << " failed to parse caps '" << str << "'" << dendl;
9f95a23c 7128 ret = -EACCES;
11fdf7f2 7129 }
7c673cae 7130 }
7c673cae 7131 }
11fdf7f2 7132 return ret;
7c673cae
FG
7133}
7134
7135void OSD::do_waiters()
7136{
9f95a23c 7137 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7138
7139 dout(10) << "do_waiters -- start" << dendl;
7140 while (!finished.empty()) {
7141 OpRequestRef next = finished.front();
7142 finished.pop_front();
7143 dispatch_op(next);
7144 }
7145 dout(10) << "do_waiters -- finish" << dendl;
7146}
7147
7148void OSD::dispatch_op(OpRequestRef op)
7149{
7150 switch (op->get_req()->get_type()) {
7151
7152 case MSG_OSD_PG_CREATE:
7153 handle_pg_create(op);
7154 break;
7c673cae
FG
7155 }
7156}
7157
7158void OSD::_dispatch(Message *m)
7159{
9f95a23c 7160 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7161 dout(20) << "_dispatch " << m << " " << *m << dendl;
7162
7163 switch (m->get_type()) {
7c673cae
FG
7164 // -- don't need OSDMap --
7165
7166 // map and replication
7167 case CEPH_MSG_OSD_MAP:
7168 handle_osd_map(static_cast<MOSDMap*>(m));
7169 break;
9f95a23c
TL
7170 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7171 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7172 break;
7c673cae
FG
7173
7174 // osd
7c673cae
FG
7175 case MSG_OSD_SCRUB:
7176 handle_scrub(static_cast<MOSDScrub*>(m));
7177 break;
7178
11fdf7f2
TL
7179 case MSG_COMMAND:
7180 handle_command(static_cast<MCommand*>(m));
7181 return;
c07f9fc5 7182
7c673cae
FG
7183 // -- need OSDMap --
7184
7185 case MSG_OSD_PG_CREATE:
7c673cae
FG
7186 {
7187 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7188 if (m->trace)
7189 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7190 // no map? starting up?
9f95a23c 7191 if (!get_osdmap()) {
7c673cae
FG
7192 dout(7) << "no OSDMap, not booted" << dendl;
7193 logger->inc(l_osd_waiting_for_map);
7194 waiting_for_osdmap.push_back(op);
7195 op->mark_delayed("no osdmap");
7196 break;
7197 }
7198
7199 // need OSDMap
7200 dispatch_op(op);
7201 }
7202 }
7203}
7204
11fdf7f2 7205// remove me post-nautilus
7c673cae
FG
7206void OSD::handle_scrub(MOSDScrub *m)
7207{
7208 dout(10) << "handle_scrub " << *m << dendl;
7209 if (!require_mon_or_mgr_peer(m)) {
7210 m->put();
7211 return;
7212 }
7213 if (m->fsid != monc->get_fsid()) {
11fdf7f2
TL
7214 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7215 << dendl;
7c673cae
FG
7216 m->put();
7217 return;
7218 }
7219
11fdf7f2
TL
7220 vector<spg_t> spgs;
7221 _get_pgids(&spgs);
7222
7223 if (!m->scrub_pgs.empty()) {
7224 vector<spg_t> v;
7225 for (auto pgid : m->scrub_pgs) {
7c673cae 7226 spg_t pcand;
9f95a23c 7227 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
11fdf7f2
TL
7228 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7229 v.push_back(pcand);
7c673cae
FG
7230 }
7231 }
11fdf7f2
TL
7232 spgs.swap(v);
7233 }
7234
7235 for (auto pgid : spgs) {
7236 enqueue_peering_evt(
7237 pgid,
7238 PGPeeringEventRef(
7239 std::make_shared<PGPeeringEvent>(
7240 get_osdmap_epoch(),
7241 get_osdmap_epoch(),
9f95a23c 7242 PeeringState::RequestScrub(m->deep, m->repair))));
7c673cae
FG
7243 }
7244
7245 m->put();
7246}
7247
11fdf7f2
TL
7248void OSD::handle_fast_scrub(MOSDScrub2 *m)
7249{
7250 dout(10) << __func__ << " " << *m << dendl;
7251 if (!require_mon_or_mgr_peer(m)) {
7252 m->put();
7253 return;
7254 }
7255 if (m->fsid != monc->get_fsid()) {
7256 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7257 << dendl;
7258 m->put();
7259 return;
7260 }
7261 for (auto pgid : m->scrub_pgs) {
7262 enqueue_peering_evt(
7263 pgid,
7264 PGPeeringEventRef(
7265 std::make_shared<PGPeeringEvent>(
7266 m->epoch,
7267 m->epoch,
9f95a23c 7268 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7269 }
7270 m->put();
7271}
7272
7c673cae
FG
7273bool OSD::scrub_random_backoff()
7274{
7275 bool coin_flip = (rand() / (double)RAND_MAX >=
7276 cct->_conf->osd_scrub_backoff_ratio);
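  // coin_flip is false with probability ~osd_scrub_backoff_ratio, so e.g.
  // a ratio of 0.66 backs off roughly two out of three ticks (illustrative).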
7277 if (!coin_flip) {
7278 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7279 return true;
7280 }
7281 return false;
7282}
7283
7284OSDService::ScrubJob::ScrubJob(CephContext* cct,
7285 const spg_t& pg, const utime_t& timestamp,
7286 double pool_scrub_min_interval,
7287 double pool_scrub_max_interval, bool must)
7288 : cct(cct),
7289 pgid(pg),
7290 sched_time(timestamp),
7291 deadline(timestamp)
7292{
7293 // if not explicitly requested, postpone the scrub with a random delay
7294 if (!must) {
7295 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7296 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7297 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7298 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7299
7300 sched_time += scrub_min_interval;
7301 double r = rand() / (double)RAND_MAX;
7302 sched_time +=
7303 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
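    // Worked example (illustrative, assuming a min interval of 1 day and
    // osd_scrub_interval_randomize_ratio = 0.5): sched_time lands uniformly
    // in [timestamp + 1 day, timestamp + 1.5 days].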
11fdf7f2
TL
7304 if (scrub_max_interval == 0) {
7305 deadline = utime_t();
7306 } else {
7307 deadline += scrub_max_interval;
7308 }
7309
7c673cae
FG
7310 }
7311}
7312
7313bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7314 if (sched_time < rhs.sched_time)
7315 return true;
7316 if (sched_time > rhs.sched_time)
7317 return false;
7318 return pgid < rhs.pgid;
7319}
7320
9f95a23c
TL
7321double OSD::scrub_sleep_time(bool must_scrub)
7322{
7323 if (must_scrub) {
7324 return cct->_conf->osd_scrub_sleep;
7325 }
7326 utime_t now = ceph_clock_now();
7327 if (scrub_time_permit(now)) {
7328 return cct->_conf->osd_scrub_sleep;
7329 }
7330 double normal_sleep = cct->_conf->osd_scrub_sleep;
7331 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7332 return std::max(extended_sleep, normal_sleep);
7333}
7334
7c673cae
FG
7335bool OSD::scrub_time_permit(utime_t now)
7336{
7337 struct tm bdt;
7338 time_t tt = now.sec();
7339 localtime_r(&tt, &bdt);
28e407b8
AA
7340
7341 bool day_permit = false;
7342 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7343 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7344 day_permit = true;
7345 }
7346 } else {
7347 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7348 day_permit = true;
7349 }
7350 }
7351
7352 if (!day_permit) {
7353 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7354 << " - " << cct->_conf->osd_scrub_end_week_day
7355 << " now " << bdt.tm_wday << " = no" << dendl;
7356 return false;
7357 }
7358
7c673cae
FG
7359 bool time_permit = false;
7360 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7361 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7362 time_permit = true;
7363 }
7364 } else {
7365 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7366 time_permit = true;
7367 }
7368 }
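  // Both the week-day and hour ranges wrap around, e.g. (illustrative)
  // osd_scrub_begin_hour=23 with osd_scrub_end_hour=6 permits scrubbing
  // from 23:00 through 05:59.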
7369 if (!time_permit) {
7370 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7371 << " - " << cct->_conf->osd_scrub_end_hour
7372 << " now " << bdt.tm_hour << " = no" << dendl;
7373 } else {
7374 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7375 << " - " << cct->_conf->osd_scrub_end_hour
7376 << " now " << bdt.tm_hour << " = yes" << dendl;
7377 }
7378 return time_permit;
7379}
7380
7381bool OSD::scrub_load_below_threshold()
7382{
7383 double loadavgs[3];
7384 if (getloadavg(loadavgs, 3) != 3) {
7385 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7386 return false;
7387 }
7388
7389 // allow scrub if below configured threshold
91327a77
AA
7390 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7391 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
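  // e.g. (illustrative): a 1-minute loadavg of 3.2 on an 8-CPU host gives
  // 0.4 per CPU, which passes a threshold of 0.5.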
7392 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7393 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7c673cae
FG
7394 << " < max " << cct->_conf->osd_scrub_load_threshold
7395 << " = yes" << dendl;
7396 return true;
7397 }
7398
7399 // allow scrub if below daily avg and currently decreasing
7400 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7401 dout(20) << __func__ << " loadavg " << loadavgs[0]
7402 << " < daily_loadavg " << daily_loadavg
7403 << " and < 15m avg " << loadavgs[2]
7404 << " = yes" << dendl;
7405 return true;
7406 }
7407
7408 dout(20) << __func__ << " loadavg " << loadavgs[0]
7409 << " >= max " << cct->_conf->osd_scrub_load_threshold
7410 << " and ( >= daily_loadavg " << daily_loadavg
7411 << " or >= 15m avg " << loadavgs[2]
7412 << ") = no" << dendl;
7413 return false;
7414}
7415
7416void OSD::sched_scrub()
7417{
7418 // if not permitted, fail fast
eafe8130 7419 if (!service.can_inc_scrubs()) {
7c673cae
FG
7420 return;
7421 }
eafe8130
TL
7422 bool allow_requested_repair_only = false;
7423 if (service.is_recovery_active()) {
7424 if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
7425 dout(10) << __func__
7426 << " will only schedule explicitly requested repair due to active recovery"
7427 << dendl;
7428 allow_requested_repair_only = true;
7429 } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
7430 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7431 return;
7432 }
b5b8bbf5
FG
7433 }
7434
7c673cae
FG
7435 utime_t now = ceph_clock_now();
7436 bool time_permit = scrub_time_permit(now);
7437 bool load_is_low = scrub_load_below_threshold();
7438 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7439
7440 OSDService::ScrubJob scrub;
7441 if (service.first_scrub_stamp(&scrub)) {
7442 do {
7443 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7444
7445 if (scrub.sched_time > now) {
7446 // save ourselves some effort
7447 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7448 << " > " << now << dendl;
7449 break;
7450 }
7451
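      // Reading of the condition below: once a non-zero deadline has passed
      // (deadline < now), the scrub proceeds regardless of the time-of-day
      // and load restrictions.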
11fdf7f2 7452 if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
7c673cae
FG
7453 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7454 << (!time_permit ? "time not permit" : "high load") << dendl;
7455 continue;
7456 }
7457
11fdf7f2 7458 PGRef pg = _lookup_lock_pg(scrub.pgid);
7c673cae
FG
7459 if (!pg)
7460 continue;
494da23a
TL
7461 // This has already started, so go on to the next scrub job
7462 if (pg->scrubber.active) {
7463 pg->unlock();
7464 dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
7465 continue;
7466 }
eafe8130
TL
 7467 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7468 if (allow_requested_repair_only && !pg->scrubber.must_repair) {
7469 pg->unlock();
7470 dout(10) << __func__ << " skip " << scrub.pgid
7471 << " because repairing is not explicitly requested on it"
7472 << dendl;
7473 continue;
7474 }
494da23a 7475 // If it is reserving, let it resolve before going to the next scrub job
eafe8130 7476 if (pg->scrubber.local_reserved && !pg->scrubber.active) {
494da23a
TL
7477 pg->unlock();
7478 dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
7479 break;
7480 }
11fdf7f2
TL
7481 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7482 << (pg->get_must_scrub() ? ", explicitly requested" :
7483 (load_is_low ? ", load_is_low" : " deadline < now"))
7484 << dendl;
7485 if (pg->sched_scrub()) {
7486 pg->unlock();
7487 break;
7c673cae
FG
7488 }
7489 pg->unlock();
7490 } while (service.next_scrub_stamp(scrub, &scrub));
7491 }
7492 dout(20) << "sched_scrub done" << dendl;
7493}
7494
494da23a
TL
7495void OSD::resched_all_scrubs()
7496{
7497 dout(10) << __func__ << ": start" << dendl;
7498 OSDService::ScrubJob scrub;
7499 if (service.first_scrub_stamp(&scrub)) {
7500 do {
7501 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7502
7503 PGRef pg = _lookup_lock_pg(scrub.pgid);
7504 if (!pg)
7505 continue;
7506 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7507 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7508 pg->on_info_history_change();
7509 }
7510 pg->unlock();
7511 } while (service.next_scrub_stamp(scrub, &scrub));
7512 }
7513 dout(10) << __func__ << ": done" << dendl;
7514}
7515
11fdf7f2
TL
7516MPGStats* OSD::collect_pg_stats()
7517{
7518 // This implementation unconditionally sends every is_primary PG's
7519 // stats every time we're called. This has equivalent cost to the
7520 // previous implementation's worst case where all PGs are busy and
7521 // their stats are always enqueued for sending.
9f95a23c 7522 std::shared_lock l{map_lock};
11fdf7f2 7523
11fdf7f2
TL
7524 osd_stat_t cur_stat = service.get_osd_stat();
7525 cur_stat.os_perf_stat = store->get_cur_stats();
7526
9f95a23c 7527 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
11fdf7f2
TL
7528 m->osd_stat = cur_stat;
7529
7530 std::lock_guard lec{min_last_epoch_clean_lock};
9f95a23c 7531 min_last_epoch_clean = get_osdmap_epoch();
11fdf7f2
TL
7532 min_last_epoch_clean_pgs.clear();
7533
7534 std::set<int64_t> pool_set;
7535 vector<PGRef> pgs;
7536 _get_pgs(&pgs);
7537 for (auto& pg : pgs) {
7538 auto pool = pg->pg_id.pgid.pool();
7539 pool_set.emplace((int64_t)pool);
7540 if (!pg->is_primary()) {
7541 continue;
7542 }
7543 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7544 m->pg_stat[pg->pg_id.pgid] = s;
7545 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7546 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7547 });
7548 }
7549 store_statfs_t st;
81eedcae 7550 bool per_pool_stats = false;
9f95a23c 7551 bool per_pool_omap_stats = false;
11fdf7f2 7552 for (auto p : pool_set) {
9f95a23c 7553 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
11fdf7f2
TL
7554 if (r == -ENOTSUP) {
7555 break;
7556 } else {
7557 assert(r >= 0);
7558 m->pool_stat[p] = st;
81eedcae 7559 per_pool_stats = true;
11fdf7f2
TL
7560 }
7561 }
7c673cae 7562
81eedcae
TL
7563 // indicate whether we are reporting per-pool stats
7564 m->osd_stat.num_osds = 1;
7565 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
9f95a23c 7566 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
81eedcae 7567
11fdf7f2
TL
7568 return m;
7569}
7c673cae 7570
11fdf7f2 7571vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7572{
11fdf7f2
TL
7573 vector<DaemonHealthMetric> metrics;
7574 {
7575 utime_t oldest_secs;
7576 const utime_t now = ceph_clock_now();
7577 auto too_old = now;
7578 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7579 int slow = 0;
7580 TrackedOpRef oldest_op;
7581 auto count_slow_ops = [&](TrackedOp& op) {
7582 if (op.get_initiated() < too_old) {
9f95a23c
TL
7583 stringstream ss;
7584 ss << "slow request " << op.get_desc()
7585 << " initiated "
7586 << op.get_initiated()
7587 << " currently "
7588 << op.state_string();
7589 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7590 clog->warn() << ss.str();
11fdf7f2
TL
7591 slow++;
7592 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7593 oldest_op = &op;
7594 }
7595 return true;
7596 } else {
7597 return false;
7598 }
7599 };
7600 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7601 if (slow) {
7602 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7603 << oldest_op->get_desc() << dendl;
7604 }
7605 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7606 } else {
7607 // no news is not good news.
7608 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7609 }
7610 }
7611 {
7612 std::lock_guard l(pending_creates_lock);
7613 auto n_primaries = pending_creates_from_mon;
7614 for (const auto& create : pending_creates_from_osd) {
7615 if (create.second) {
7616 n_primaries++;
7617 }
b32b8144 7618 }
11fdf7f2 7619 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7620 }
b32b8144
FG
7621 return metrics;
7622}
7623
7c673cae
FG
7624// =====================================================
7625// MAP
7626
7627void OSD::wait_for_new_map(OpRequestRef op)
7628{
7629 // ask?
7630 if (waiting_for_osdmap.empty()) {
9f95a23c 7631 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
7632 }
7633
7634 logger->inc(l_osd_waiting_for_map);
7635 waiting_for_osdmap.push_back(op);
7636 op->mark_delayed("wait for new map");
7637}
7638
7639
7640/** update_map
7641 * assimilate new OSDMap(s). scan pgs, etc.
7642 */
7643
7644void OSD::note_down_osd(int peer)
7645{
9f95a23c
TL
7646 ceph_assert(ceph_mutex_is_locked(osd_lock));
7647 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7648
9f95a23c 7649 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7650 failure_queue.erase(peer);
7651 failure_pending.erase(peer);
7652 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7653 if (p != heartbeat_peers.end()) {
9f95a23c 7654 p->second.clear_mark_down();
7c673cae
FG
7655 heartbeat_peers.erase(p);
7656 }
7c673cae
FG
7657}
7658
7659void OSD::note_up_osd(int peer)
7660{
7c673cae
FG
7661 heartbeat_set_peers_need_update();
7662}
7663
7664struct C_OnMapCommit : public Context {
7665 OSD *osd;
7666 epoch_t first, last;
7667 MOSDMap *msg;
7668 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7669 : osd(o), first(f), last(l), msg(m) {}
7670 void finish(int r) override {
7671 osd->_committed_osd_maps(first, last, msg);
7672 msg->put();
7673 }
7674};
7675
7c673cae
FG
7676void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7677{
11fdf7f2 7678 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7679 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7680 return;
7681
11fdf7f2 7682 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7683
7c673cae
FG
7684 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7685 force_request) {
7686 monc->renew_subs();
7687 }
7688}
7689
7690void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7691{
7692 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7693 if (min <= superblock.oldest_map)
7694 return;
7695
7696 int num = 0;
7697 ObjectStore::Transaction t;
7698 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7699 dout(20) << " removing old osdmap epoch " << e << dendl;
7700 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7701 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7702 superblock.oldest_map = e + 1;
7703 num++;
7704 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7705 service.publish_superblock(superblock);
7706 write_superblock(t);
11fdf7f2
TL
7707 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7708 ceph_assert(tr == 0);
7c673cae
FG
7709 num = 0;
7710 if (!skip_maps) {
7711 // skip_maps leaves us with a range of old maps if we fail to remove all
7712 // of them before moving superblock.oldest_map forward to the first map
 7713 // in the incoming MOSDMap msg. so we should continue removing them in
 7714 // this case, even though that may mean a huge series of delete
 7715 // transactions all at once.
7716 break;
7717 }
7718 }
7719 }
7720 if (num > 0) {
7721 service.publish_superblock(superblock);
7722 write_superblock(t);
11fdf7f2
TL
7723 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7724 ceph_assert(tr == 0);
7c673cae
FG
7725 }
7726 // we should not remove the cached maps
11fdf7f2 7727 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7c673cae
FG
7728}
7729
7730void OSD::handle_osd_map(MOSDMap *m)
7731{
11fdf7f2
TL
7732 // wait for pgs to catch up
7733 {
 7734 // we extend the map cache pins to accommodate pgs slow to consume maps
 7735 // for some period, until we hit the max_lag_factor bound, at which point
 7736 // we block here to stop ingesting more maps than they are able to keep
 7737 // up with.
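    // Worked example (illustrative, assuming osd_map_cache_size=50 and a
    // max_lag_factor of 2): max_lag=100, so with our map at epoch 1000 every
    // shard must have consumed at least epoch 900 before we ingest more maps.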
7738 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7739 m_osd_pg_epoch_max_lag_factor;
7740 ceph_assert(max_lag > 0);
7741 epoch_t osd_min = 0;
7742 for (auto shard : shards) {
7743 epoch_t min = shard->get_min_pg_epoch();
7744 if (osd_min == 0 || min < osd_min) {
7745 osd_min = min;
7746 }
7747 }
9f95a23c 7748 epoch_t osdmap_epoch = get_osdmap_epoch();
11fdf7f2 7749 if (osd_min > 0 &&
9f95a23c
TL
7750 osdmap_epoch > max_lag &&
7751 osdmap_epoch - max_lag > osd_min) {
7752 epoch_t need = osdmap_epoch - max_lag;
11fdf7f2
TL
7753 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7754 << " max_lag " << max_lag << ")" << dendl;
7755 for (auto shard : shards) {
7756 epoch_t min = shard->get_min_pg_epoch();
7757 if (need > min) {
7758 dout(10) << __func__ << " waiting for pgs to consume " << need
7759 << " (shard " << shard->shard_id << " min " << min
7760 << ", map cache is " << cct->_conf->osd_map_cache_size
7761 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7762 << ")" << dendl;
7763 unlock_guard unlock{osd_lock};
7764 shard->wait_min_pg_epoch(need);
7765 }
7766 }
7767 }
7768 }
7769
9f95a23c 7770 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
7771 map<epoch_t,OSDMapRef> added_maps;
7772 map<epoch_t,bufferlist> added_maps_bl;
7c673cae
FG
7773 if (m->fsid != monc->get_fsid()) {
7774 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7775 << monc->get_fsid() << dendl;
7776 m->put();
7777 return;
7778 }
7779 if (is_initializing()) {
7780 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7781 m->put();
7782 return;
7783 }
7784
9f95a23c
TL
7785 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7786 if (session && !(session->entity_name.is_mon() ||
7c673cae
FG
7787 session->entity_name.is_osd())) {
7788 //not enough perms!
7789 dout(10) << "got osd map from Session " << session
7790 << " which we can't take maps from (not a mon or osd)" << dendl;
7791 m->put();
7c673cae
FG
7792 return;
7793 }
7c673cae
FG
7794
7795 // share with the objecter
7796 if (!is_preboot())
7797 service.objecter->handle_osd_map(m);
7798
7799 epoch_t first = m->get_first();
7800 epoch_t last = m->get_last();
7801 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7802 << superblock.newest_map
7803 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7804 << dendl;
7805
7806 logger->inc(l_osd_map);
7807 logger->inc(l_osd_mape, last - first + 1);
7808 if (first <= superblock.newest_map)
7809 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7810 if (service.max_oldest_map < m->oldest_map) {
7811 service.max_oldest_map = m->oldest_map;
11fdf7f2 7812 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
7813 }
7814
7815 // make sure there is something new, here, before we bother flushing
7816 // the queues and such
7817 if (last <= superblock.newest_map) {
7818 dout(10) << " no new maps here, dropping" << dendl;
7819 m->put();
7820 return;
7821 }
7822
7823 // missing some?
7824 bool skip_maps = false;
7825 if (first > superblock.newest_map + 1) {
7826 dout(10) << "handle_osd_map message skips epochs "
7827 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7828 if (m->oldest_map <= superblock.newest_map + 1) {
7829 osdmap_subscribe(superblock.newest_map + 1, false);
7830 m->put();
7831 return;
7832 }
7833 // always try to get the full range of maps--as many as we can. this
7834 // 1- is good to have
7835 // 2- is at present the only way to ensure that we get a *full* map as
7836 // the first map!
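    // e.g. (illustrative): we have maps up to 100, the message carries
    // 150..160 and advertises oldest_map=140; we resubscribe at 139 with
    // force so the next MOSDMap begins with a full map we can start from.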
7837 if (m->oldest_map < first) {
7838 osdmap_subscribe(m->oldest_map - 1, true);
7839 m->put();
7840 return;
7841 }
7842 skip_maps = true;
7843 }
7844
7845 ObjectStore::Transaction t;
7846 uint64_t txn_size = 0;
7847
9f95a23c
TL
7848 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
7849
7c673cae 7850 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 7851 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
7852 for (epoch_t e = start; e <= last; e++) {
7853 if (txn_size >= t.get_num_bytes()) {
7854 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 7855 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
7856 }
7857 txn_size = t.get_num_bytes();
7858 map<epoch_t,bufferlist>::iterator p;
7859 p = m->maps.find(e);
7860 if (p != m->maps.end()) {
7861 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7862 OSDMap *o = new OSDMap;
7863 bufferlist& bl = p->second;
7864
7865 o->decode(bl);
7866
9f95a23c
TL
7867 purged_snaps[e] = o->get_new_purged_snaps();
7868
7c673cae
FG
7869 ghobject_t fulloid = get_osdmap_pobject_name(e);
7870 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
7871 added_maps[e] = add_map(o);
7872 added_maps_bl[e] = bl;
7c673cae
FG
7873 got_full_map(e);
7874 continue;
7875 }
7876
7877 p = m->incremental_maps.find(e);
7878 if (p != m->incremental_maps.end()) {
7879 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7880 bufferlist& bl = p->second;
7881 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7882 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
7883
7884 OSDMap *o = new OSDMap;
7885 if (e > 1) {
7886 bufferlist obl;
7887 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
7888 if (!got) {
7889 auto p = added_maps_bl.find(e - 1);
7890 ceph_assert(p != added_maps_bl.end());
7891 obl = p->second;
7892 }
7c673cae
FG
7893 o->decode(obl);
7894 }
7895
7896 OSDMap::Incremental inc;
11fdf7f2 7897 auto p = bl.cbegin();
7c673cae 7898 inc.decode(p);
494da23a 7899
7c673cae 7900 if (o->apply_incremental(inc) < 0) {
9f95a23c 7901 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 7902 ceph_abort_msg("bad fsid");
7c673cae
FG
7903 }
7904
7905 bufferlist fbl;
7906 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7907
7908 bool injected_failure = false;
7909 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7910 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7911 derr << __func__ << " injecting map crc failure" << dendl;
7912 injected_failure = true;
7913 }
7914
7915 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7916 dout(2) << "got incremental " << e
7917 << " but failed to encode full with correct crc; requesting"
7918 << dendl;
7919 clog->warn() << "failed to encode map e" << e << " with expected crc";
7920 dout(20) << "my encoded map was:\n";
7921 fbl.hexdump(*_dout);
7922 *_dout << dendl;
7923 delete o;
7924 request_full_map(e, last);
7925 last = e - 1;
7926 break;
7927 }
7928 got_full_map(e);
9f95a23c 7929 purged_snaps[e] = o->get_new_purged_snaps();
7c673cae
FG
7930
7931 ghobject_t fulloid = get_osdmap_pobject_name(e);
7932 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
7933 added_maps[e] = add_map(o);
7934 added_maps_bl[e] = fbl;
7c673cae
FG
7935 continue;
7936 }
7937
11fdf7f2 7938 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
7939 }
7940
7941 // even if this map isn't from a mon, we may have satisfied our subscription
7942 monc->sub_got("osdmap", last);
7943
7944 if (!m->maps.empty() && requested_full_first) {
7945 dout(10) << __func__ << " still missing full maps " << requested_full_first
7946 << ".." << requested_full_last << dendl;
7947 rerequest_full_maps();
7948 }
7949
7c673cae
FG
7950 if (superblock.oldest_map) {
7951 // make sure we at least keep pace with incoming maps
7952 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 7953 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
7954 }
7955
7956 if (!superblock.oldest_map || skip_maps)
7957 superblock.oldest_map = first;
7958 superblock.newest_map = last;
7959 superblock.current_epoch = last;
7960
7961 // note in the superblock that we were clean thru the prior epoch
7962 epoch_t boot_epoch = service.get_boot_epoch();
7963 if (boot_epoch && boot_epoch >= superblock.mounted) {
7964 superblock.mounted = boot_epoch;
7965 superblock.clean_thru = last;
7966 }
7967
11fdf7f2
TL
7968 // check for pg_num changes and deleted pools
7969 OSDMapRef lastmap;
7970 for (auto& i : added_maps) {
7971 if (!lastmap) {
7972 if (!(lastmap = service.try_get_map(i.first - 1))) {
7973 dout(10) << __func__ << " can't get previous map " << i.first - 1
7974 << " probably first start of this osd" << dendl;
7975 continue;
7976 }
7977 }
7978 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
7979 for (auto& j : lastmap->get_pools()) {
7980 if (!i.second->have_pg_pool(j.first)) {
7981 pg_num_history.log_pool_delete(i.first, j.first);
7982 dout(10) << __func__ << " recording final pg_pool_t for pool "
7983 << j.first << dendl;
 7984 // this information is needed by _make_pg() if we have to restart before
7985 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
7986 ghobject_t obj = make_final_pool_info_oid(j.first);
7987 bufferlist bl;
7988 encode(j.second, bl, CEPH_FEATURES_ALL);
7989 string name = lastmap->get_pool_name(j.first);
7990 encode(name, bl);
7991 map<string,string> profile;
7992 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
7993 profile = lastmap->get_erasure_code_profile(
7994 lastmap->get_pg_pool(j.first)->erasure_code_profile);
7995 }
7996 encode(profile, bl);
7997 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
11fdf7f2
TL
7998 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
7999 new_pg_num != j.second.get_pg_num()) {
8000 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8001 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8002 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8003 }
8004 }
8005 for (auto& j : i.second->get_pools()) {
8006 if (!lastmap->have_pg_pool(j.first)) {
8007 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8008 << j.second.get_pg_num() << dendl;
8009 pg_num_history.log_pg_num_change(i.first, j.first,
8010 j.second.get_pg_num());
8011 }
8012 }
8013 lastmap = i.second;
8014 }
8015 pg_num_history.epoch = last;
8016 {
8017 bufferlist bl;
8018 ::encode(pg_num_history, bl);
8019 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8020 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8021 }
8022
9f95a23c
TL
8023 // record new purged_snaps
8024 if (superblock.purged_snaps_last == start - 1) {
8025 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8026 make_purged_snaps_oid(), &t,
8027 purged_snaps);
8028 superblock.purged_snaps_last = last;
8029 } else {
8030 dout(10) << __func__ << " superblock purged_snaps_last is "
8031 << superblock.purged_snaps_last
8032 << ", not recording new purged_snaps" << dendl;
8033 }
8034
7c673cae
FG
8035 // superblock and commit
8036 write_superblock(t);
11fdf7f2 8037 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8038 store->queue_transaction(
11fdf7f2
TL
8039 service.meta_ch,
8040 std::move(t));
7c673cae
FG
8041 service.publish_superblock(superblock);
8042}
8043
8044void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8045{
8046 dout(10) << __func__ << " " << first << ".." << last << dendl;
8047 if (is_stopping()) {
8048 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8049 return;
8050 }
11fdf7f2 8051 std::lock_guard l(osd_lock);
31f18b77
FG
8052 if (is_stopping()) {
8053 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8054 return;
8055 }
9f95a23c 8056 map_lock.lock();
7c673cae
FG
8057
8058 bool do_shutdown = false;
8059 bool do_restart = false;
8060 bool network_error = false;
9f95a23c 8061 OSDMapRef osdmap;
7c673cae
FG
8062
8063 // advance through the new maps
8064 for (epoch_t cur = first; cur <= last; cur++) {
8065 dout(10) << " advance to epoch " << cur
8066 << " (<= last " << last
8067 << " <= newest_map " << superblock.newest_map
8068 << ")" << dendl;
8069
8070 OSDMapRef newmap = get_map(cur);
11fdf7f2 8071 ceph_assert(newmap); // we just cached it above!
7c673cae
FG
8072
8073 // start blacklisting messages sent to peers that go down.
8074 service.pre_publish_map(newmap);
8075
8076 // kill connections to newly down osds
8077 bool waited_for_reservations = false;
8078 set<int> old;
9f95a23c 8079 osdmap = get_osdmap();
7c673cae
FG
8080 osdmap->get_all_osds(old);
8081 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8082 if (*p != whoami &&
8083 osdmap->is_up(*p) && // in old map
8084 newmap->is_down(*p)) { // but not the new one
8085 if (!waited_for_reservations) {
8086 service.await_reserved_maps();
8087 waited_for_reservations = true;
8088 }
8089 note_down_osd(*p);
8090 } else if (*p != whoami &&
8091 osdmap->is_down(*p) &&
8092 newmap->is_up(*p)) {
8093 note_up_osd(*p);
8094 }
8095 }
8096
81eedcae 8097 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
7c673cae
FG
8098 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8099 << dendl;
8100 if (is_booting()) {
8101 // this captures the case where we sent the boot message while
8102 // NOUP was being set on the mon and our boot request was
8103 // dropped, and then later it is cleared. it imperfectly
8104 // handles the case where our original boot message was not
8105 // dropped and we restart even though we might have booted, but
8106 // that is harmless (boot will just take slightly longer).
8107 do_restart = true;
8108 }
8109 }
8110
9f95a23c
TL
8111 osdmap = std::move(newmap);
8112 set_osdmap(osdmap);
7c673cae
FG
8113 epoch_t up_epoch;
8114 epoch_t boot_epoch;
8115 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8116 if (!up_epoch &&
8117 osdmap->is_up(whoami) &&
11fdf7f2 8118 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
7c673cae
FG
8119 up_epoch = osdmap->get_epoch();
8120 dout(10) << "up_epoch is " << up_epoch << dendl;
8121 if (!boot_epoch) {
8122 boot_epoch = osdmap->get_epoch();
8123 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8124 }
8125 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8126 }
8127 }
8128
7c673cae
FG
8129 epoch_t _bind_epoch = service.get_bind_epoch();
8130 if (osdmap->is_up(whoami) &&
11fdf7f2
TL
8131 osdmap->get_addrs(whoami).legacy_equals(
8132 client_messenger->get_myaddrs()) &&
7c673cae
FG
8133 _bind_epoch < osdmap->get_up_from(whoami)) {
8134
8135 if (is_booting()) {
8136 dout(1) << "state: booting -> active" << dendl;
8137 set_state(STATE_ACTIVE);
11fdf7f2 8138 do_restart = false;
7c673cae
FG
8139
8140 // set incarnation so that osd_reqid_t's we generate for our
8141 // objecter requests are unique across restarts.
8142 service.objecter->set_client_incarnation(osdmap->get_epoch());
11fdf7f2 8143 cancel_pending_failures();
7c673cae
FG
8144 }
8145 }
8146
8147 if (osdmap->get_epoch() > 0 &&
8148 is_active()) {
8149 if (!osdmap->exists(whoami)) {
9f95a23c 8150 derr << "map says i do not exist. shutting down." << dendl;
7c673cae
FG
8151 do_shutdown = true; // don't call shutdown() while we have
8152 // everything paused
9f95a23c
TL
8153 } else if (osdmap->is_stop(whoami)) {
8154 derr << "map says i am stopped by admin. shutting down." << dendl;
8155 do_shutdown = true;
7c673cae 8156 } else if (!osdmap->is_up(whoami) ||
11fdf7f2
TL
8157 !osdmap->get_addrs(whoami).legacy_equals(
8158 client_messenger->get_myaddrs()) ||
8159 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8160 cluster_messenger->get_myaddrs()) ||
8161 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8162 hb_back_server_messenger->get_myaddrs()) ||
8163 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8164 hb_front_server_messenger->get_myaddrs())) {
7c673cae
FG
8165 if (!osdmap->is_up(whoami)) {
8166 if (service.is_preparing_to_stop() || service.is_stopping()) {
8167 service.got_stop_ack();
8168 } else {
c07f9fc5
FG
8169 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8170 "but it is still running";
8171 clog->debug() << "map e" << osdmap->get_epoch()
8172 << " wrongly marked me down at e"
8173 << osdmap->get_down_at(whoami);
7c673cae 8174 }
9f95a23c
TL
8175 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8176 // note that this is best-effort...
8177 monc->send_mon_message(
8178 new MOSDMarkMeDead(
8179 monc->get_fsid(),
8180 whoami,
8181 osdmap->get_epoch()));
8182 }
11fdf7f2
TL
8183 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8184 client_messenger->get_myaddrs())) {
7c673cae 8185 clog->error() << "map e" << osdmap->get_epoch()
11fdf7f2
TL
8186 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8187 << " != my " << client_messenger->get_myaddrs() << ")";
8188 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8189 cluster_messenger->get_myaddrs())) {
7c673cae
FG
8190 clog->error() << "map e" << osdmap->get_epoch()
8191 << " had wrong cluster addr ("
11fdf7f2
TL
8192 << osdmap->get_cluster_addrs(whoami)
8193 << " != my " << cluster_messenger->get_myaddrs() << ")";
8194 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8195 hb_back_server_messenger->get_myaddrs())) {
7c673cae 8196 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8197 << " had wrong heartbeat back addr ("
11fdf7f2
TL
8198 << osdmap->get_hb_back_addrs(whoami)
8199 << " != my " << hb_back_server_messenger->get_myaddrs()
7c673cae 8200 << ")";
11fdf7f2
TL
8201 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8202 hb_front_server_messenger->get_myaddrs())) {
7c673cae 8203 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8204 << " had wrong heartbeat front addr ("
11fdf7f2
TL
8205 << osdmap->get_hb_front_addrs(whoami)
8206 << " != my " << hb_front_server_messenger->get_myaddrs()
7c673cae
FG
8207 << ")";
8208 }
8209
8210 if (!service.is_stopping()) {
8211 epoch_t up_epoch = 0;
8212 epoch_t bind_epoch = osdmap->get_epoch();
8213 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8214 do_restart = true;
8215
8216 //add markdown log
8217 utime_t now = ceph_clock_now();
8218 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8219 osd_markdown_log.push_back(now);
7c673cae 8220 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
9f95a23c
TL
8221 derr << __func__ << " marked down "
8222 << osd_markdown_log.size()
8223 << " > osd_max_markdown_count "
8224 << cct->_conf->osd_max_markdown_count
8225 << " in last " << grace << " seconds, shutting down"
8226 << dendl;
7c673cae
FG
8227 do_restart = false;
8228 do_shutdown = true;
8229 }
8230
8231 start_waiting_for_healthy();
8232
8233 set<int> avoid_ports;
8234#if defined(__FreeBSD__)
 8235 // prevent FreeBSD from grabbing the client_messenger port during
 8236 // rebinding, in which case the cluster_messenger would also connect
 8237 // to the same port
11fdf7f2 8238 client_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae 8239#endif
11fdf7f2 8240 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae
FG
8241
8242 int r = cluster_messenger->rebind(avoid_ports);
8243 if (r != 0) {
8244 do_shutdown = true; // FIXME: do_restart?
8245 network_error = true;
9f95a23c
TL
8246 derr << __func__ << " marked down:"
8247 << " rebind cluster_messenger failed" << dendl;
7c673cae
FG
8248 }
8249
9f95a23c
TL
8250 hb_back_server_messenger->mark_down_all();
8251 hb_front_server_messenger->mark_down_all();
7c673cae
FG
8252 hb_front_client_messenger->mark_down_all();
8253 hb_back_client_messenger->mark_down_all();
8254
494da23a 8255 reset_heartbeat_peers(true);
7c673cae
FG
8256 }
8257 }
8258 }
8259
9f95a23c 8260 map_lock.unlock();
7c673cae 8261
11fdf7f2 8262 check_osdmap_features();
7c673cae
FG
8263
8264 // yay!
8265 consume_map();
8266
8267 if (is_active() || is_waiting_for_healthy())
8268 maybe_update_heartbeat_peers();
8269
11fdf7f2 8270 if (is_active()) {
7c673cae
FG
8271 activate_map();
8272 }
8273
31f18b77 8274 if (do_shutdown) {
7c673cae 8275 if (network_error) {
11fdf7f2 8276 cancel_pending_failures();
7c673cae
FG
8277 }
8278 // trigger shutdown in a different thread
8279 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8280 queue_async_signal(SIGINT);
8281 }
31f18b77
FG
8282 else if (m->newest_map && m->newest_map > last) {
8283 dout(10) << " msg says newest map is " << m->newest_map
8284 << ", requesting more" << dendl;
8285 osdmap_subscribe(osdmap->get_epoch()+1, false);
8286 }
7c673cae
FG
8287 else if (is_preboot()) {
8288 if (m->get_source().is_mon())
8289 _preboot(m->oldest_map, m->newest_map);
8290 else
8291 start_boot();
8292 }
8293 else if (do_restart)
8294 start_boot();
8295
8296}
8297
11fdf7f2 8298void OSD::check_osdmap_features()
7c673cae
FG
8299{
8300 // adjust required feature bits?
8301
8302 // we have to be a bit careful here, because we are accessing the
8303 // Policy structures without taking any lock. in particular, only
8304 // modify integer values that can safely be read by a racing CPU.
8305 // since we are only accessing existing Policy structures at their
8306 // current memory location, and setting or clearing bits in integer
8307 // fields, and we are the only writer, this is not a problem.
8308
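  // Illustration (hypothetical values, not from the map): with mask = 0x0f
  // and features = 0x05, each update below computes
  //   p.features_required = (p.features_required & ~0x0f) | 0x05;
  // i.e. only the bits selected by the mask are replaced; every other
  // required feature bit is left untouched.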
9f95a23c 8309 const auto osdmap = get_osdmap();
7c673cae
FG
8310 {
8311 Messenger::Policy p = client_messenger->get_default_policy();
8312 uint64_t mask;
8313 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8314 if ((p.features_required & mask) != features) {
8315 dout(0) << "crush map has features " << features
8316 << ", adjusting msgr requires for clients" << dendl;
8317 p.features_required = (p.features_required & ~mask) | features;
8318 client_messenger->set_default_policy(p);
8319 }
8320 }
8321 {
8322 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8323 uint64_t mask;
8324 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8325 if ((p.features_required & mask) != features) {
8326 dout(0) << "crush map has features " << features
8327 << " was " << p.features_required
8328 << ", adjusting msgr requires for mons" << dendl;
8329 p.features_required = (p.features_required & ~mask) | features;
8330 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8331 }
8332 }
8333 {
8334 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8335 uint64_t mask;
8336 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8337
8338 if ((p.features_required & mask) != features) {
8339 dout(0) << "crush map has features " << features
8340 << ", adjusting msgr requires for osds" << dendl;
8341 p.features_required = (p.features_required & ~mask) | features;
8342 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8343 }
8344
11fdf7f2 8345 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7c673cae
FG
8346 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8347 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8348 ObjectStore::Transaction t;
8349 write_superblock(t);
11fdf7f2
TL
8350 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8351 ceph_assert(err == 0);
7c673cae
FG
8352 }
8353 }
11fdf7f2 8354
9f95a23c
TL
8355 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8356 hb_front_server_messenger->set_require_authorizer(false);
8357 hb_back_server_messenger->set_require_authorizer(false);
8358 } else {
8359 hb_front_server_messenger->set_require_authorizer(true);
8360 hb_back_server_messenger->set_require_authorizer(true);
11fdf7f2
TL
8361 }
8362
8363 if (osdmap->require_osd_release != last_require_osd_release) {
8364 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8365 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8366 store->write_meta("require_osd_release",
8367 stringify((int)osdmap->require_osd_release));
8368 last_require_osd_release = osdmap->require_osd_release;
8369 }
7c673cae
FG
8370}
8371
11fdf7f2
TL
8372struct C_FinishSplits : public Context {
8373 OSD *osd;
8374 set<PGRef> pgs;
8375 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8376 : osd(osd), pgs(in) {}
8377 void finish(int r) override {
8378 osd->_finish_splits(pgs);
8379 }
8380};
8381
8382void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8383{
11fdf7f2
TL
8384 dout(10) << __func__ << " " << pgs << dendl;
8385 if (is_stopping())
8386 return;
11fdf7f2
TL
8387 for (set<PGRef>::iterator i = pgs.begin();
8388 i != pgs.end();
8389 ++i) {
8390 PG *pg = i->get();
7c673cae 8391
9f95a23c 8392 PeeringCtx rctx = create_context();
11fdf7f2
TL
8393 pg->lock();
8394 dout(10) << __func__ << " " << *pg << dendl;
8395 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8396 pg->handle_initialize(rctx);
11fdf7f2 8397 pg->queue_null(e, e);
9f95a23c 8398 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8399 pg->unlock();
7c673cae 8400
11fdf7f2
TL
8401 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8402 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8403 }
11fdf7f2
TL
8404};
8405
8406bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8407 unsigned need)
8408{
8409 std::lock_guard l(merge_lock);
8410 auto& p = merge_waiters[nextmap->get_epoch()][target];
8411 p[src->pg_id] = src;
8412 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8413 << " for " << target << ", have " << p.size() << "/" << need
8414 << dendl;
8415 return p.size() == need;
8416}
8417
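// Merge rendezvous (sketch of the flow implemented in advance_pg() below):
// each merge source parks itself in merge_waiters via add_merge_waiter()
// above after detaching from its shard; once the last expected source has
// arrived, the merge target is woken with a null peering event, pulls all
// sources out of merge_waiters, and runs merge_from() on them.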
8418bool OSD::advance_pg(
8419 epoch_t osd_epoch,
8420 PG *pg,
8421 ThreadPool::TPHandle &handle,
9f95a23c 8422 PeeringCtx &rctx)
11fdf7f2
TL
8423{
8424 if (osd_epoch <= pg->get_osdmap_epoch()) {
8425 return true;
8426 }
8427 ceph_assert(pg->is_locked());
8428 OSDMapRef lastmap = pg->get_osdmap();
8429 ceph_assert(lastmap->get_epoch() < osd_epoch);
8430 set<PGRef> new_pgs; // any split children
8431 bool ret = true;
8432
8433 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8434 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8435 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8436 next_epoch <= osd_epoch;
7c673cae
FG
8437 ++next_epoch) {
8438 OSDMapRef nextmap = service.try_get_map(next_epoch);
8439 if (!nextmap) {
8440 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7c673cae
FG
8441 continue;
8442 }
8443
11fdf7f2
TL
8444 unsigned new_pg_num =
8445 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8446 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8447 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8448 // check for merge
8449 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8450 spg_t parent;
8451 if (pg->pg_id.is_merge_source(
8452 old_pg_num,
8453 new_pg_num,
8454 &parent)) {
8455 // we are merge source
8456 PGRef spg = pg; // carry a ref
8457 dout(1) << __func__ << " " << pg->pg_id
8458 << " is merge source, target is " << parent
8459 << dendl;
8460 pg->write_if_dirty(rctx);
9f95a23c
TL
8461 if (!new_pgs.empty()) {
8462 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8463 new_pgs));
8464 new_pgs.clear();
8465 }
8466 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2 8467 pg->ch->flush();
eafe8130
TL
8468 // release backoffs explicitly, since the on_shutdown path
8469 // aggressively tears down backoff state.
8470 if (pg->is_primary()) {
8471 pg->release_pg_backoffs();
8472 }
11fdf7f2
TL
8473 pg->on_shutdown();
8474 OSDShard *sdata = pg->osd_shard;
8475 {
8476 std::lock_guard l(sdata->shard_lock);
8477 if (pg->pg_slot) {
8478 sdata->_detach_pg(pg->pg_slot);
8479 // update pg count now since we might not get an osdmap
8480 // any time soon.
8481 if (pg->is_primary())
8482 logger->dec(l_osd_pg_primary);
9f95a23c
TL
8483 else if (pg->is_nonprimary())
8484 logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
8485 else
8486 logger->dec(l_osd_pg_stray);
8487 }
8488 }
8489 pg->unlock();
8490
8491 set<spg_t> children;
8492 parent.is_split(new_pg_num, old_pg_num, &children);
8493 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8494 enqueue_peering_evt(
8495 parent,
8496 PGPeeringEventRef(
8497 std::make_shared<PGPeeringEvent>(
8498 nextmap->get_epoch(),
8499 nextmap->get_epoch(),
8500 NullEvt())));
8501 }
8502 ret = false;
8503 goto out;
8504 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8505 // we are merge target
8506 set<spg_t> children;
8507 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8508 dout(20) << __func__ << " " << pg->pg_id
8509 << " is merge target, sources are " << children
8510 << dendl;
8511 map<spg_t,PGRef> sources;
8512 {
8513 std::lock_guard l(merge_lock);
8514 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8515 unsigned need = children.size();
8516 dout(20) << __func__ << " have " << s.size() << "/"
8517 << need << dendl;
8518 if (s.size() == need) {
8519 sources.swap(s);
8520 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8521 if (merge_waiters[nextmap->get_epoch()].empty()) {
8522 merge_waiters.erase(nextmap->get_epoch());
8523 }
8524 }
8525 }
8526 if (!sources.empty()) {
8527 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8528 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8529 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8530 pg->merge_from(
8531 sources, rctx, split_bits,
8532 nextmap->get_pg_pool(
8533 pg->pg_id.pool())->last_pg_merge_meta);
8534 pg->pg_slot->waiting_for_merge_epoch = 0;
8535 } else {
8536 dout(20) << __func__ << " not ready to merge yet" << dendl;
8537 pg->write_if_dirty(rctx);
9f95a23c
TL
8538 if (!new_pgs.empty()) {
8539 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8540 new_pgs));
8541 new_pgs.clear();
8542 }
8543 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2
TL
8544 pg->unlock();
8545 // kick source(s) to get them ready
8546 for (auto& i : children) {
8547 dout(20) << __func__ << " kicking source " << i << dendl;
8548 enqueue_peering_evt(
8549 i,
8550 PGPeeringEventRef(
8551 std::make_shared<PGPeeringEvent>(
8552 nextmap->get_epoch(),
8553 nextmap->get_epoch(),
8554 NullEvt())));
8555 }
8556 ret = false;
8557 goto out;
8558 }
8559 }
8560 }
8561 }
8562
7c673cae
FG
8563 vector<int> newup, newacting;
8564 int up_primary, acting_primary;
8565 nextmap->pg_to_up_acting_osds(
11fdf7f2 8566 pg->pg_id.pgid,
7c673cae
FG
8567 &newup, &up_primary,
8568 &newacting, &acting_primary);
8569 pg->handle_advance_map(
8570 nextmap, lastmap, newup, up_primary,
8571 newacting, acting_primary, rctx);
8572
494da23a
TL
8573 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8574 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8575 if (oldpool != lastmap->get_pools().end()
8576 && newpool != nextmap->get_pools().end()) {
8577 dout(20) << __func__
8578 << " new pool opts " << newpool->second.opts
8579 << " old pool opts " << oldpool->second.opts
8580 << dendl;
8581
8582 double old_min_interval = 0, new_min_interval = 0;
8583 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8584 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8585
8586 double old_max_interval = 0, new_max_interval = 0;
8587 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8588 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8589
8590 // Assume that if an interval changes from set to unset or vice versa, the actual config
8591 // is different. Keep it simple even if it is possible to call resched_all_scrubs()
8592 // unnecessarily.
8593 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8594 pg->on_info_history_change();
8595 }
8596 }
8597
11fdf7f2
TL
8598 if (new_pg_num && old_pg_num != new_pg_num) {
8599 // check for split
8600 set<spg_t> children;
8601 if (pg->pg_id.is_split(
8602 old_pg_num,
8603 new_pg_num,
8604 &children)) {
8605 split_pgs(
8606 pg, children, &new_pgs, lastmap, nextmap,
8607 rctx);
8608 }
7c673cae
FG
8609 }
8610
8611 lastmap = nextmap;
11fdf7f2 8612 old_pg_num = new_pg_num;
7c673cae
FG
8613 handle.reset_tp_timeout();
8614 }
7c673cae 8615 pg->handle_activate_map(rctx);
11fdf7f2
TL
8616
8617 ret = true;
8618 out:
8619 if (!new_pgs.empty()) {
9f95a23c 8620 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
7c673cae 8621 }
11fdf7f2 8622 return ret;
7c673cae
FG
8623}
8624
8625void OSD::consume_map()
8626{
9f95a23c
TL
8627 ceph_assert(ceph_mutex_is_locked(osd_lock));
8628 auto osdmap = get_osdmap();
7c673cae
FG
8629 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8630
3efd9988
FG
8631 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8632 * speak the older sorting version any more. Be careful not to force
8633 * a shutdown if we are merely processing old maps, though.
8634 */
8635 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8636 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8637 ceph_abort();
8638 }
8639
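  // Rough sequence for the rest of this function: publish the new map to the
  // OSDService, prime pending splits and merges in each shard, let every
  // shard consume the map (collecting reserved pushes to release), update
  // the PG counters, and finally queue a null peering event per PG so each
  // PG advances to the new epoch.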
11fdf7f2
TL
8640 service.pre_publish_map(osdmap);
8641 service.await_reserved_maps();
8642 service.publish_map(osdmap);
7c673cae 8643
11fdf7f2
TL
8644 // prime splits and merges
8645 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8646 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8647 for (auto& shard : shards) {
8648 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8649 }
8650 if (!newly_split.empty()) {
8651 for (auto& shard : shards) {
8652 shard->prime_splits(osdmap, &newly_split);
8653 }
8654 ceph_assert(newly_split.empty());
8655 }
7c673cae 8656
11fdf7f2
TL
8657 // prune sent_ready_to_merge
8658 service.prune_sent_ready_to_merge(osdmap);
7c673cae 8659
11fdf7f2
TL
8660 // FIXME, maybe: We could race against an incoming peering message
8661 // that instantiates a merge PG after identify_merges() below and
8662 // never set up its peer to complete the merge. An OSD restart
8663 // would clear it up. This is a hard race to resolve,
8664 // extraordinarily rare (we only merge PGs that are stable and
8665 // clean, so it'd have to be an imported PG to an OSD with a
8666 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8667 // replace all of this with seastar-based code soon anyway.
8668 if (!merge_pgs.empty()) {
8669 // mark the pgs we already have, or create new and empty merge
8670 // participants for those we are missing. do this all under the
8671 // shard lock so we don't have to worry about racing pg creates
8672 // via _process.
8673 for (auto& shard : shards) {
8674 shard->prime_merges(osdmap, &merge_pgs);
7c673cae 8675 }
11fdf7f2
TL
8676 ceph_assert(merge_pgs.empty());
8677 }
8678
8679 service.prune_pg_created();
8680
8681 unsigned pushes_to_free = 0;
8682 for (auto& shard : shards) {
8683 shard->consume_map(osdmap, &pushes_to_free);
8684 }
8685
8686 vector<spg_t> pgids;
8687 _get_pgids(&pgids);
8688
8689 // count (FIXME, probably during seastar rewrite)
8690 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8691 vector<PGRef> pgs;
8692 _get_pgs(&pgs);
8693 for (auto& pg : pgs) {
8694 // FIXME (probably during seastar rewrite): this is lockless and
8695 // racy, but we don't want to take pg lock here.
8696 if (pg->is_primary())
8697 num_pg_primary++;
9f95a23c
TL
8698 else if (pg->is_nonprimary())
8699 num_pg_replica++; // misnomer
11fdf7f2
TL
8700 else
8701 num_pg_stray++;
8702 }
3efd9988 8703
11fdf7f2
TL
8704 {
8705 // FIXME (as part of seastar rewrite): move to OSDShard
8706 std::lock_guard l(pending_creates_lock);
8707 for (auto pg = pending_creates_from_osd.begin();
8708 pg != pending_creates_from_osd.end();) {
9f95a23c 8709 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
11fdf7f2
TL
8710 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8711 << "discarding pending_create_from_osd" << dendl;
3efd9988
FG
8712 pg = pending_creates_from_osd.erase(pg);
8713 } else {
8714 ++pg;
8715 }
8716 }
7c673cae
FG
8717 }
8718
7c673cae
FG
8719 service.maybe_inject_dispatch_delay();
8720
8721 dispatch_sessions_waiting_on_map();
8722
8723 service.maybe_inject_dispatch_delay();
8724
11fdf7f2 8725 service.release_reserved_pushes(pushes_to_free);
7c673cae 8726
11fdf7f2
TL
8727 // queue null events to push maps down to individual PGs
8728 for (auto pgid : pgids) {
8729 enqueue_peering_evt(
8730 pgid,
8731 PGPeeringEventRef(
8732 std::make_shared<PGPeeringEvent>(
8733 osdmap->get_epoch(),
8734 osdmap->get_epoch(),
8735 NullEvt())));
7c673cae 8736 }
11fdf7f2 8737 logger->set(l_osd_pg, pgids.size());
7c673cae
FG
8738 logger->set(l_osd_pg_primary, num_pg_primary);
8739 logger->set(l_osd_pg_replica, num_pg_replica);
8740 logger->set(l_osd_pg_stray, num_pg_stray);
8741}
8742
8743void OSD::activate_map()
8744{
9f95a23c
TL
8745 ceph_assert(ceph_mutex_is_locked(osd_lock));
8746 auto osdmap = get_osdmap();
7c673cae
FG
8747
8748 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8749
7c673cae
FG
8750 // norecover?
8751 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8752 if (!service.recovery_is_paused()) {
8753 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8754 service.pause_recovery();
8755 }
8756 } else {
8757 if (service.recovery_is_paused()) {
8758 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8759 service.unpause_recovery();
8760 }
8761 }
8762
8763 service.activate_map();
8764
8765 // process waiters
8766 take_waiters(waiting_for_osdmap);
8767}
8768
8769bool OSD::require_mon_peer(const Message *m)
8770{
8771 if (!m->get_connection()->peer_is_mon()) {
8772 dout(0) << "require_mon_peer received from non-mon "
8773 << m->get_connection()->get_peer_addr()
8774 << " " << *m << dendl;
8775 return false;
8776 }
8777 return true;
8778}
8779
8780bool OSD::require_mon_or_mgr_peer(const Message *m)
8781{
8782 if (!m->get_connection()->peer_is_mon() &&
8783 !m->get_connection()->peer_is_mgr()) {
8784 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8785 << m->get_connection()->get_peer_addr()
8786 << " " << *m << dendl;
8787 return false;
8788 }
8789 return true;
8790}
8791
8792bool OSD::require_osd_peer(const Message *m)
8793{
8794 if (!m->get_connection()->peer_is_osd()) {
8795 dout(0) << "require_osd_peer received from non-osd "
8796 << m->get_connection()->get_peer_addr()
8797 << " " << *m << dendl;
8798 return false;
8799 }
8800 return true;
8801}
8802
8803bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8804{
8805 epoch_t up_epoch = service.get_up_epoch();
8806 if (epoch < up_epoch) {
8807 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8808 return false;
8809 }
8810
8811 if (!is_active()) {
8812 dout(7) << "still in boot state, dropping message " << *m << dendl;
8813 return false;
8814 }
8815
8816 return true;
8817}
8818
9f95a23c 8819bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
7c673cae
FG
8820 bool is_fast_dispatch)
8821{
8822 int from = m->get_source().num();
8823
8824 if (map->is_down(from) ||
11fdf7f2 8825 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
8826 dout(5) << "from dead osd." << from << ", marking down, "
8827 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
8828 << " expected "
8829 << (map->is_up(from) ?
8830 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
8831 << dendl;
8832 ConnectionRef con = m->get_connection();
8833 con->mark_down();
9f95a23c 8834 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
7c673cae 8835 if (!is_fast_dispatch)
9f95a23c 8836 s->session_dispatch_lock.lock();
7c673cae 8837 clear_session_waiting_on_map(s);
11fdf7f2
TL
8838 con->set_priv(nullptr); // break ref <-> session cycle, if any
8839 s->con.reset();
7c673cae 8840 if (!is_fast_dispatch)
9f95a23c 8841 s->session_dispatch_lock.unlock();
7c673cae
FG
8842 }
8843 return false;
8844 }
8845 return true;
8846}
8847
8848
8849/*
8850 * require that we have same (or newer) map, and that
8851 * the source is the pg primary.
8852 */
8853bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8854 bool is_fast_dispatch)
8855{
8856 const Message *m = op->get_req();
9f95a23c 8857 const auto osdmap = get_osdmap();
7c673cae
FG
8858 dout(15) << "require_same_or_newer_map " << epoch
8859 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8860
9f95a23c 8861 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
8862
8863 // do they have a newer map?
8864 if (epoch > osdmap->get_epoch()) {
8865 dout(7) << "waiting for newer map epoch " << epoch
8866 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8867 wait_for_new_map(op);
8868 return false;
8869 }
8870
8871 if (!require_self_aliveness(op->get_req(), epoch)) {
8872 return false;
8873 }
8874
8875 // ok, our map is same or newer.. do they still exist?
8876 if (m->get_connection()->get_messenger() == cluster_messenger &&
8877 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8878 return false;
8879 }
8880
8881 return true;
8882}
8883
8884
8885
8886
8887
8888// ----------------------------------------
8889// pg creation
8890
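// split_pgs() below is a summary of the PG split path: for each child it
// builds a new PG object and collection, splits the parent's collection and
// in-memory state into it according to the split bits, and distributes the
// parent's stats across parent and children via start_split_stats() /
// finish_split_stats().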
8891void OSD::split_pgs(
8892 PG *parent,
31f18b77 8893 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8894 OSDMapRef curmap,
8895 OSDMapRef nextmap,
9f95a23c 8896 PeeringCtx &rctx)
7c673cae 8897{
11fdf7f2
TL
8898 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
8899 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 8900
11fdf7f2
TL
8901 vector<object_stat_sum_t> updated_stats;
8902 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
8903
8904 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8905 for (set<spg_t>::const_iterator i = childpgids.begin();
8906 i != childpgids.end();
8907 ++i, ++stat_iter) {
11fdf7f2
TL
8908 ceph_assert(stat_iter != updated_stats.end());
8909 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
8910 PG* child = _make_pg(nextmap, *i);
8911 child->lock(true);
8912 out_pgs->insert(child);
11fdf7f2 8913 child->ch = store->create_new_collection(child->coll);
7c673cae 8914
11fdf7f2
TL
8915 {
8916 uint32_t shard_index = i->hash_to_shard(shards.size());
8917 assert(NULL != shards[shard_index]);
8918 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
8919 }
7c673cae 8920
11fdf7f2
TL
8921 unsigned split_bits = i->get_split_bits(pg_num);
8922 dout(10) << " pg_num is " << pg_num
8923 << ", m_seed " << i->ps()
8924 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
8925 parent->split_colls(
8926 *i,
8927 split_bits,
8928 i->ps(),
11fdf7f2 8929 &child->get_pool().info,
9f95a23c 8930 rctx.transaction);
7c673cae
FG
8931 parent->split_into(
8932 i->pgid,
8933 child,
8934 split_bits);
7c673cae 8935
92f5a8d4
TL
8936 child->init_collection_pool_opts();
8937
9f95a23c 8938 child->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
8939 child->unlock();
8940 }
11fdf7f2 8941 ceph_assert(stat_iter != updated_stats.end());
9f95a23c 8942 parent->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
8943}
8944
8945/*
8946 * holding osd_lock
8947 */
8948void OSD::handle_pg_create(OpRequestRef op)
8949{
9f95a23c
TL
8950 // NOTE: this can be removed in P release (mimic is the last version to
8951 // send MOSDPGCreate messages).
8952
8953 auto m = op->get_req<MOSDPGCreate>();
11fdf7f2 8954 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
8955
8956 dout(10) << "handle_pg_create " << *m << dendl;
8957
8958 if (!require_mon_peer(op->get_req())) {
8959 return;
8960 }
8961
8962 if (!require_same_or_newer_map(op, m->epoch, false))
8963 return;
8964
8965 op->mark_started();
8966
9f95a23c 8967 const auto osdmap = get_osdmap();
7c673cae
FG
8968 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8969 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8970 p != m->mkpg.end();
8971 ++p, ++ci) {
11fdf7f2 8972 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
8973 epoch_t created = p->second.created;
8974 if (p->second.split_bits) // Skip split pgs
8975 continue;
8976 pg_t on = p->first;
8977
7c673cae
FG
8978 if (!osdmap->have_pg_pool(on.pool())) {
8979 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8980 continue;
8981 }
8982
8983 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8984
9f95a23c
TL
8985 spg_t pgid;
8986 bool mapped = osdmap->get_primary_shard(on, &pgid);
8987 ceph_assert(mapped);
8988
7c673cae
FG
8989 // is it still ours?
8990 vector<int> up, acting;
8991 int up_primary = -1;
8992 int acting_primary = -1;
8993 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9f95a23c 8994 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae
FG
8995
8996 if (acting_primary != whoami) {
8997 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8998 << "), my role=" << role << ", skipping" << dendl;
8999 continue;
9000 }
9001
7c673cae 9002
11fdf7f2 9003 PastIntervals pi;
7c673cae
FG
9004 pg_history_t history;
9005 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9006
11fdf7f2
TL
9007 // The mon won't resend unless the primary changed, so we ignore
9008 // same_interval_since. We'll pass this history with the current
9009 // epoch as the event.
7c673cae
FG
9010 if (history.same_primary_since > m->epoch) {
9011 dout(10) << __func__ << ": got obsolete pg create on pgid "
9012 << pgid << " from epoch " << m->epoch
9013 << ", primary changed in " << history.same_primary_since
9014 << dendl;
9015 continue;
9016 }
11fdf7f2
TL
9017 enqueue_peering_evt(
9018 pgid,
9019 PGPeeringEventRef(
9020 std::make_shared<PGPeeringEvent>(
9021 osdmap->get_epoch(),
9022 osdmap->get_epoch(),
9023 NullEvt(),
9024 true,
9025 new PGCreateInfo(
9026 pgid,
9027 osdmap->get_epoch(),
9028 history,
9029 pi,
9030 true)
9031 )));
7c673cae 9032 }
7c673cae 9033
3efd9988 9034 {
11fdf7f2 9035 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9036 if (pending_creates_from_mon == 0) {
9037 last_pg_create_epoch = m->epoch;
9038 }
9039 }
11fdf7f2 9040
7c673cae
FG
9041 maybe_update_heartbeat_peers();
9042}
9043
9044
9045// ----------------------------------------
9046// peering and recovery
9047
9f95a23c 9048PeeringCtx OSD::create_context()
7c673cae 9049{
9f95a23c 9050 return PeeringCtx(get_osdmap()->require_osd_release);
7c673cae
FG
9051}
9052
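// dispatch_context() below flushes a PeeringCtx: it sends any queued peering
// messages to peer OSDs that are still up in curmap, then (when a PG is
// supplied) queues the accumulated transaction on that PG's collection.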
9f95a23c 9053void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
7c673cae
FG
9054 ThreadPool::TPHandle *handle)
9055{
11fdf7f2
TL
9056 if (!service.get_osdmap()->is_up(whoami)) {
9057 dout(20) << __func__ << " not up in osdmap" << dendl;
9058 } else if (!is_active()) {
9059 dout(20) << __func__ << " not active" << dendl;
9060 } else {
9f95a23c
TL
9061 for (auto& [osd, ls] : ctx.message_map) {
9062 if (!curmap->is_up(osd)) {
9063 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9064 continue;
9065 }
9066 ConnectionRef con = service.get_con_osd_cluster(
9067 osd, curmap->get_epoch());
9068 if (!con) {
9069 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9070 << dendl;
9071 continue;
9072 }
9073 service.maybe_share_map(con.get(), curmap);
9074 for (auto m : ls) {
9075 con->send_message2(m);
9076 }
9077 ls.clear();
9078 }
7c673cae 9079 }
9f95a23c 9080 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
7c673cae 9081 int tr = store->queue_transaction(
11fdf7f2 9082 pg->ch,
9f95a23c 9083 std::move(ctx.transaction), TrackedOpRef(),
7c673cae 9084 handle);
11fdf7f2 9085 ceph_assert(tr == 0);
7c673cae 9086 }
7c673cae
FG
9087}
9088
11fdf7f2 9089void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9090{
11fdf7f2
TL
9091 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9092 if (!require_mon_peer(m)) {
9093 m->put();
7c673cae 9094 return;
7c673cae 9095 }
11fdf7f2
TL
9096 for (auto& p : m->pgs) {
9097 spg_t pgid = p.first;
9098 epoch_t created = p.second.first;
9099 utime_t created_stamp = p.second.second;
9f95a23c
TL
9100 auto q = m->pg_extra.find(pgid);
9101 if (q == m->pg_extra.end()) {
9102 dout(20) << __func__ << " " << pgid << " e" << created
9103 << "@" << created_stamp
9104 << " (no history or past_intervals)" << dendl;
9105 // pre-octopus ... no pg history. this can be removed in Q release.
9106 enqueue_peering_evt(
9107 pgid,
9108 PGPeeringEventRef(
9109 std::make_shared<PGPeeringEvent>(
9110 m->epoch,
9111 m->epoch,
9112 NullEvt(),
9113 true,
9114 new PGCreateInfo(
9115 pgid,
9116 created,
9117 pg_history_t(created, created_stamp),
9118 PastIntervals(),
9119 true)
9120 )));
9121 } else {
9122 dout(20) << __func__ << " " << pgid << " e" << created
9123 << "@" << created_stamp
9124 << " history " << q->second.first
9125 << " pi " << q->second.second << dendl;
9126 if (!q->second.second.empty() &&
9127 m->epoch < q->second.second.get_bounds().second) {
9128 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9129 << " and unmatched past_intervals " << q->second.second
9130 << " (history " << q->second.first << ")";
9131 } else {
9132 enqueue_peering_evt(
9133 pgid,
9134 PGPeeringEventRef(
9135 std::make_shared<PGPeeringEvent>(
9136 m->epoch,
9137 m->epoch,
9138 NullEvt(),
9139 true,
9140 new PGCreateInfo(
9141 pgid,
9142 m->epoch,
9143 q->second.first,
9144 q->second.second,
9145 true)
9146 )));
9147 }
9148 }
11fdf7f2 9149 }
7c673cae 9150
11fdf7f2
TL
9151 {
9152 std::lock_guard l(pending_creates_lock);
9153 if (pending_creates_from_mon == 0) {
9154 last_pg_create_epoch = m->epoch;
9155 }
7c673cae
FG
9156 }
9157
11fdf7f2 9158 m->put();
7c673cae
FG
9159}
9160
11fdf7f2 9161void OSD::handle_fast_pg_query(MOSDPGQuery *m)
7c673cae 9162{
11fdf7f2
TL
9163 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9164 if (!require_osd_peer(m)) {
9165 m->put();
7c673cae 9166 return;
11fdf7f2 9167 }
7c673cae 9168 int from = m->get_source().num();
11fdf7f2
TL
9169 for (auto& p : m->pg_list) {
9170 enqueue_peering_evt(
9171 p.first,
9172 PGPeeringEventRef(
9173 std::make_shared<PGPeeringEvent>(
9174 p.second.epoch_sent, p.second.epoch_sent,
9175 MQuery(
9176 p.first,
9177 pg_shard_t(from, p.second.from),
9178 p.second,
9179 p.second.epoch_sent),
9180 false))
7c673cae
FG
9181 );
9182 }
11fdf7f2 9183 m->put();
7c673cae
FG
9184}
9185
11fdf7f2 9186void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9187{
11fdf7f2
TL
9188 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9189 if (!require_osd_peer(m)) {
9190 m->put();
7c673cae
FG
9191 return;
9192 }
11fdf7f2
TL
9193 int from = m->get_source().num();
9194 for (auto& p : m->get_pg_list()) {
9f95a23c 9195 spg_t pgid(p.info.pgid.pgid, p.to);
11fdf7f2
TL
9196 enqueue_peering_evt(
9197 pgid,
9198 PGPeeringEventRef(
9199 std::make_shared<PGPeeringEvent>(
9f95a23c
TL
9200 p.epoch_sent,
9201 p.query_epoch,
11fdf7f2 9202 MNotifyRec(
9f95a23c
TL
9203 pgid, pg_shard_t(from, p.from),
9204 p,
9205 m->get_connection()->get_features()),
11fdf7f2
TL
9206 true,
9207 new PGCreateInfo(
9208 pgid,
9f95a23c
TL
9209 p.query_epoch,
9210 p.info.history,
9211 p.past_intervals,
11fdf7f2
TL
9212 false)
9213 )));
7c673cae 9214 }
11fdf7f2 9215 m->put();
7c673cae
FG
9216}
9217
11fdf7f2 9218void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9219{
11fdf7f2
TL
9220 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9221 if (!require_osd_peer(m)) {
9222 m->put();
7c673cae
FG
9223 return;
9224 }
11fdf7f2
TL
9225 int from = m->get_source().num();
9226 for (auto& p : m->pg_list) {
9227 enqueue_peering_evt(
9f95a23c 9228 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2
TL
9229 PGPeeringEventRef(
9230 std::make_shared<PGPeeringEvent>(
9f95a23c 9231 p.epoch_sent, p.query_epoch,
11fdf7f2 9232 MInfoRec(
9f95a23c
TL
9233 pg_shard_t(from, p.from),
9234 p.info,
9235 p.epoch_sent)))
11fdf7f2 9236 );
7c673cae 9237 }
11fdf7f2 9238 m->put();
7c673cae
FG
9239}
9240
11fdf7f2 9241void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9242{
11fdf7f2
TL
9243 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9244 if (!require_osd_peer(m)) {
9245 m->put();
7c673cae
FG
9246 return;
9247 }
11fdf7f2
TL
9248 for (auto& pgid : m->pg_list) {
9249 enqueue_peering_evt(
9250 pgid,
9251 PGPeeringEventRef(
9252 std::make_shared<PGPeeringEvent>(
9253 m->get_epoch(), m->get_epoch(),
9f95a23c 9254 PeeringState::DeleteStart())));
7c673cae 9255 }
11fdf7f2 9256 m->put();
7c673cae
FG
9257}
9258
11fdf7f2 9259void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9260{
11fdf7f2
TL
9261 dout(10) << __func__ << " " << *m << dendl;
9262 if (!require_mon_or_mgr_peer(m)) {
9263 m->put();
9264 return;
9265 }
9266 epoch_t epoch = get_osdmap_epoch();
9267 for (auto pgid : m->forced_pgs) {
9268 if (m->options & OFR_BACKFILL) {
9269 if (m->options & OFR_CANCEL) {
9270 enqueue_peering_evt(
9271 pgid,
9272 PGPeeringEventRef(
9273 std::make_shared<PGPeeringEvent>(
9274 epoch, epoch,
9f95a23c 9275 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9276 } else {
9277 enqueue_peering_evt(
9278 pgid,
9279 PGPeeringEventRef(
9280 std::make_shared<PGPeeringEvent>(
9281 epoch, epoch,
9f95a23c 9282 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9283 }
9284 } else if (m->options & OFR_RECOVERY) {
9285 if (m->options & OFR_CANCEL) {
9286 enqueue_peering_evt(
9287 pgid,
9288 PGPeeringEventRef(
9289 std::make_shared<PGPeeringEvent>(
9290 epoch, epoch,
9f95a23c 9291 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9292 } else {
9293 enqueue_peering_evt(
9294 pgid,
9295 PGPeeringEventRef(
9296 std::make_shared<PGPeeringEvent>(
9297 epoch, epoch,
9f95a23c 9298 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9299 }
9300 }
9301 }
11fdf7f2 9302 m->put();
c07f9fc5 9303}
7c673cae 9304
11fdf7f2 9305void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9306{
11fdf7f2
TL
9307 spg_t pgid = q.pgid;
9308 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9309
11fdf7f2
TL
9310 OSDMapRef osdmap = get_osdmap();
9311 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9312 return;
9313
11fdf7f2
TL
9314 dout(10) << " pg " << pgid << " dne" << dendl;
9315 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9316 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9317 if (con) {
9318 Message *m;
9319 if (q.query.type == pg_query_t::LOG ||
9320 q.query.type == pg_query_t::FULLLOG) {
9321 m = new MOSDPGLog(
9322 q.query.from, q.query.to,
9323 osdmap->get_epoch(), empty,
9324 q.query.epoch_sent);
7c673cae 9325 } else {
9f95a23c 9326 vector<pg_notify_t> ls;
11fdf7f2 9327 ls.push_back(
9f95a23c
TL
9328 pg_notify_t(
9329 q.query.from, q.query.to,
9330 q.query.epoch_sent,
9331 osdmap->get_epoch(),
9332 empty,
11fdf7f2 9333 PastIntervals()));
9f95a23c 9334 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
7c673cae 9335 }
9f95a23c 9336 service.maybe_share_map(con.get(), osdmap);
11fdf7f2 9337 con->send_message(m);
7c673cae
FG
9338 }
9339}
9340
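// queue_check_readable() below either queues a CheckReadable peering event
// immediately (zero delay) or arms a one-shot mono_timer event that
// re-invokes queue_check_readable() once the requested delay has elapsed.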
9f95a23c
TL
9341void OSDService::queue_check_readable(spg_t spgid,
9342 epoch_t lpr,
9343 ceph::signedspan delay)
9344{
9345 if (delay == ceph::signedspan::zero()) {
9346 osd->enqueue_peering_evt(
9347 spgid,
9348 PGPeeringEventRef(
9349 std::make_shared<PGPeeringEvent>(
9350 lpr, lpr,
9351 PeeringState::CheckReadable())));
9352 } else {
9353 mono_timer.add_event(
9354 delay,
9355 [this, spgid, lpr]() {
9356 queue_check_readable(spgid, lpr);
9357 });
9358 }
9359}
9360
7c673cae 9361
7c673cae
FG
9362// =========================================================
9363// RECOVERY
9364
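// Recovery throttling sketch (summary of the helpers below): queued recovery
// work reserves pushes up front; _recover_now() only admits more work while
// recovery_ops_active + recovery_ops_reserved stays below the configured
// maximum, and release_reserved_pushes() later returns that capacity and
// re-runs _maybe_queue_recovery().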
9365void OSDService::_maybe_queue_recovery() {
9f95a23c 9366 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
7c673cae
FG
9367 uint64_t available_pushes;
9368 while (!awaiting_throttle.empty() &&
9369 _recover_now(&available_pushes)) {
11fdf7f2 9370 uint64_t to_start = std::min(
7c673cae
FG
9371 available_pushes,
9372 cct->_conf->osd_recovery_max_single_start);
9373 _queue_for_recovery(awaiting_throttle.front(), to_start);
9374 awaiting_throttle.pop_front();
11fdf7f2
TL
9375 dout(10) << __func__ << " starting " << to_start
9376 << ", recovery_ops_reserved " << recovery_ops_reserved
9377 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9378 recovery_ops_reserved += to_start;
9379 }
9380}
9381
9382bool OSDService::_recover_now(uint64_t *available_pushes)
9383{
9384 if (available_pushes)
9385 *available_pushes = 0;
9386
9387 if (ceph_clock_now() < defer_recovery_until) {
9388 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9389 return false;
9390 }
9391
9392 if (recovery_paused) {
9393 dout(15) << __func__ << " paused" << dendl;
9394 return false;
9395 }
9396
9f95a23c 9397 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9398 if (max <= recovery_ops_active + recovery_ops_reserved) {
9399 dout(15) << __func__ << " active " << recovery_ops_active
9400 << " + reserved " << recovery_ops_reserved
9401 << " >= max " << max << dendl;
9402 return false;
9403 }
9404
9405 if (available_pushes)
9406 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9407
9408 return true;
9409}
9410
9f95a23c
TL
9411unsigned OSDService::get_target_pg_log_entries() const
9412{
9413 auto num_pgs = osd->get_num_pgs();
9414 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9415 if (num_pgs > 0 && target > 0) {
9416 // target an even spread of our budgeted log entries across all
9417 // PGs. note that while we only get to control the entry count
9418 // for primary PGs, we'll normally be responsible for a mix of
9419 // primary and replica PGs (for the same pool(s) even), so this
9420 // will work out.
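    // For example (illustrative numbers only): with
    // osd_target_pg_log_entries_per_osd = 300000 and 1000 PGs on this OSD,
    // each PG gets a budget of 300 entries, which the clamp below keeps
    // within [osd_min_pg_log_entries, osd_max_pg_log_entries].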
9421 return std::max<unsigned>(
9422 std::min<unsigned>(target / num_pgs,
9423 cct->_conf->osd_max_pg_log_entries),
9424 cct->_conf->osd_min_pg_log_entries);
9425 } else {
9426 // fall back to a per-pg value.
9427 return cct->_conf->osd_min_pg_log_entries;
9428 }
9429}
9430
7c673cae
FG
9431void OSD::do_recovery(
9432 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9433 ThreadPool::TPHandle &handle)
9434{
9435 uint64_t started = 0;
31f18b77
FG
9436
9437 /*
9438 * When osd_recovery_sleep is set to a value greater than zero, recovery
9439 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9440 * recovery event's schedule time. This is done by adding a
9441 * recovery_requeue_callback event, which re-queues the recovery op using
9442 * queue_recovery_after_sleep.
9443 */
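  // Illustrative timeline (hypothetical values): with osd_recovery_sleep =
  // 0.1s and the previous event scheduled at time T, the next recovery op is
  // queued for T + 0.1s; if T already lies in the past, the 0.1s sleep is
  // instead counted from "now" (see the schedule-time adjustment below).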
c07f9fc5 9444 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9445 {
11fdf7f2 9446 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9447 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9448 PGRef pgref(pg);
9f95a23c 9449 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
b32b8144
FG
9450 dout(20) << "do_recovery wake up at "
9451 << ceph_clock_now()
9452 << ", re-queuing recovery" << dendl;
11fdf7f2 9453 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9454 service.recovery_needs_sleep = false;
9455 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9456 });
9457
9458 // This condition holds for the first recovery op and whenever the previous
9459 // recovery op was scheduled in the past; in that case the next recovery op
9460 // is scheduled after completing the sleep, counted from now.
9f95a23c
TL
9461
9462 if (auto now = ceph::real_clock::now();
9463 service.recovery_schedule_time < now) {
9464 service.recovery_schedule_time = now;
b32b8144 9465 }
9f95a23c 9466 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
11fdf7f2 9467 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9f95a23c 9468 recovery_requeue_callback);
b32b8144
FG
9469 dout(20) << "Recovery event scheduled at "
9470 << service.recovery_schedule_time << dendl;
9471 return;
9472 }
7c673cae
FG
9473 }
9474
9475 {
b32b8144 9476 {
11fdf7f2 9477 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9478 service.recovery_needs_sleep = true;
9479 }
9480
7c673cae
FG
9481 if (pg->pg_has_reset_since(queued)) {
9482 goto out;
9483 }
9484
7c673cae
FG
9485 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9486#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9487 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9488#endif
9489
11fdf7f2 9490 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
7c673cae
FG
9491 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9492 << " on " << *pg << dendl;
9493
11fdf7f2 9494 if (do_unfound) {
9f95a23c 9495 PeeringCtx rctx = create_context();
11fdf7f2 9496 rctx.handle = &handle;
9f95a23c 9497 pg->find_unfound(queued, rctx);
11fdf7f2 9498 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9499 }
7c673cae
FG
9500 }
9501
9502 out:
11fdf7f2 9503 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9504 service.release_reserved_pushes(reserved_pushes);
9505}
9506
9507void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9508{
11fdf7f2 9509 std::lock_guard l(recovery_lock);
7c673cae
FG
9510 dout(10) << "start_recovery_op " << *pg << " " << soid
9511 << " (" << recovery_ops_active << "/"
9f95a23c 9512 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9513 << dendl;
9514 recovery_ops_active++;
9515
9516#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9517 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9518 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9519 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9520#endif
9521}
9522
9523void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9524{
11fdf7f2 9525 std::lock_guard l(recovery_lock);
7c673cae
FG
9526 dout(10) << "finish_recovery_op " << *pg << " " << soid
9527 << " dequeue=" << dequeue
9f95a23c
TL
9528 << " (" << recovery_ops_active << "/"
9529 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9530 << dendl;
9531
9532 // adjust count
11fdf7f2 9533 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9534 recovery_ops_active--;
9535
9536#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9537 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9538 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9539 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9540#endif
9541
9542 _maybe_queue_recovery();
9543}
9544
9545bool OSDService::is_recovery_active()
9546{
eafe8130
TL
9547 if (cct->_conf->osd_debug_pretend_recovery_active) {
9548 return true;
9549 }
b5b8bbf5 9550 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9551}
9552
11fdf7f2
TL
9553void OSDService::release_reserved_pushes(uint64_t pushes)
9554{
9555 std::lock_guard l(recovery_lock);
9556 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9557 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9558 << dendl;
9559 ceph_assert(recovery_ops_reserved >= pushes);
9560 recovery_ops_reserved -= pushes;
9561 _maybe_queue_recovery();
9562}
9563
7c673cae
FG
9564// =========================================================
9565// OPS
9566
9567bool OSD::op_is_discardable(const MOSDOp *op)
9568{
9569 // drop the client request if the client is no longer connected and can't
9570 // get the reply anyway.
9571 if (!op->get_connection()->is_connected()) {
9572 return true;
9573 }
9574 return false;
9575}
9576
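// enqueue_op() and enqueue_peering_evt() below both feed the sharded op
// queue (op_shardedwq) with OpSchedulerItems; the priority, cost, receive
// stamp, owner and epoch captured here are what the scheduler uses to order
// the items (a rough summary; see OpSchedulerItem for the details).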
11fdf7f2 9577void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9578{
11fdf7f2
TL
9579 const utime_t stamp = op->get_req()->get_recv_stamp();
9580 const utime_t latency = ceph_clock_now() - stamp;
9581 const unsigned priority = op->get_req()->get_priority();
9582 const int cost = op->get_req()->get_cost();
9583 const uint64_t owner = op->get_req()->get_source().num();
9584
9585 dout(15) << "enqueue_op " << op << " prio " << priority
9586 << " cost " << cost
7c673cae
FG
9587 << " latency " << latency
9588 << " epoch " << epoch
9589 << " " << *(op->get_req()) << dendl;
9590 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9591 op->osd_trace.keyval("priority", priority);
9592 op->osd_trace.keyval("cost", cost);
7c673cae 9593 op->mark_queued_for_pg();
224ce89b 9594 logger->tinc(l_osd_op_before_queue_op_lat, latency);
11fdf7f2 9595 op_shardedwq.queue(
9f95a23c
TL
9596 OpSchedulerItem(
9597 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
11fdf7f2 9598 cost, priority, stamp, owner, epoch));
7c673cae
FG
9599}
9600
11fdf7f2
TL
9601void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9602{
9603 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9604 op_shardedwq.queue(
9f95a23c
TL
9605 OpSchedulerItem(
9606 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
11fdf7f2
TL
9607 10,
9608 cct->_conf->osd_peering_op_priority,
9609 utime_t(),
9610 0,
9611 evt->get_epoch_sent()));
9612}
7c673cae
FG
9613
9614/*
9615 * NOTE: dequeue called in worker thread, with pg lock
9616 */
9617void OSD::dequeue_op(
9618 PGRef pg, OpRequestRef op,
9619 ThreadPool::TPHandle &handle)
9620{
9f95a23c
TL
9621 const Message *m = op->get_req();
9622
11fdf7f2 9623 FUNCTRACE(cct);
9f95a23c 9624 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
7c673cae
FG
9625
9626 utime_t now = ceph_clock_now();
9627 op->set_dequeued_time(now);
9f95a23c
TL
9628
9629 utime_t latency = now - m->get_recv_stamp();
9630 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9631 << " cost " << m->get_cost()
7c673cae 9632 << " latency " << latency
9f95a23c 9633 << " " << *m
7c673cae
FG
9634 << " pg " << *pg << dendl;
9635
224ce89b
WB
9636 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9637
9f95a23c
TL
9638 service.maybe_share_map(m->get_connection().get(),
9639 pg->get_osdmap(),
9640 op->sent_epoch);
7c673cae 9641
11fdf7f2 9642 if (pg->is_deleting())
7c673cae
FG
9643 return;
9644
9645 op->mark_reached_pg();
9646 op->osd_trace.event("dequeue_op");
9647
9648 pg->do_request(op, handle);
9649
9650 // finish
9651 dout(10) << "dequeue_op " << op << " finish" << dendl;
9f95a23c 9652 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
7c673cae
FG
9653}
9654
9655
11fdf7f2
TL
9656void OSD::dequeue_peering_evt(
9657 OSDShard *sdata,
9658 PG *pg,
9659 PGPeeringEventRef evt,
9660 ThreadPool::TPHandle& handle)
7c673cae 9661{
9f95a23c 9662 PeeringCtx rctx = create_context();
11fdf7f2 9663 auto curmap = sdata->get_osdmap();
9f95a23c
TL
9664 bool need_up_thru = false;
9665 epoch_t same_interval_since = 0;
11fdf7f2
TL
9666 if (!pg) {
9667 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9668 handle_pg_query_nopg(*q);
7c673cae 9669 } else {
11fdf7f2
TL
9670 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9671 ceph_abort();
9672 }
9f95a23c
TL
9673 } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9674 pg->do_peering_event(evt, rctx);
11fdf7f2 9675 if (pg->is_deleted()) {
11fdf7f2
TL
9676 pg->unlock();
9677 return;
7c673cae 9678 }
9f95a23c 9679 dispatch_context(rctx, pg, curmap, &handle);
11fdf7f2
TL
9680 need_up_thru = pg->get_need_up_thru();
9681 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9682 pg->unlock();
9683 }
11fdf7f2
TL
9684
9685 if (need_up_thru) {
7c673cae 9686 queue_want_up_thru(same_interval_since);
11fdf7f2 9687 }
7c673cae
FG
9688
9689 service.send_pg_temp();
9690}
9691
11fdf7f2
TL
9692void OSD::dequeue_delete(
9693 OSDShard *sdata,
9694 PG *pg,
9695 epoch_t e,
9696 ThreadPool::TPHandle& handle)
9697{
9698 dequeue_peering_evt(
9699 sdata,
9700 pg,
9701 PGPeeringEventRef(
9702 std::make_shared<PGPeeringEvent>(
9703 e, e,
9f95a23c 9704 PeeringState::DeleteSome())),
11fdf7f2
TL
9705 handle);
9706}
9707
9708
9709
7c673cae
FG
9710// --------------------------------
9711
9712const char** OSD::get_tracked_conf_keys() const
9713{
9714 static const char* KEYS[] = {
9715 "osd_max_backfills",
9716 "osd_min_recovery_priority",
224ce89b
WB
9717 "osd_max_trimming_pgs",
9718 "osd_op_complaint_time",
9719 "osd_op_log_threshold",
9720 "osd_op_history_size",
9721 "osd_op_history_duration",
9722 "osd_op_history_slow_op_size",
9723 "osd_op_history_slow_op_threshold",
7c673cae
FG
9724 "osd_enable_op_tracker",
9725 "osd_map_cache_size",
11fdf7f2 9726 "osd_pg_epoch_max_lag_factor",
7c673cae 9727 "osd_pg_epoch_persisted_max_stale",
7c673cae
FG
9728 // clog & admin clog
9729 "clog_to_monitors",
9730 "clog_to_syslog",
9731 "clog_to_syslog_facility",
9732 "clog_to_syslog_level",
9733 "osd_objectstore_fuse",
9734 "clog_to_graylog",
9735 "clog_to_graylog_host",
9736 "clog_to_graylog_port",
9737 "host",
9738 "fsid",
9739 "osd_recovery_delay_start",
9740 "osd_client_message_size_cap",
9741 "osd_client_message_cap",
31f18b77
FG
9742 "osd_heartbeat_min_size",
9743 "osd_heartbeat_interval",
9f95a23c 9744 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9745 "osd_scrub_min_interval",
9746 "osd_scrub_max_interval",
7c673cae
FG
9747 NULL
9748 };
9749 return KEYS;
9750}
9751
11fdf7f2 9752void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9753 const std::set <std::string> &changed)
9754{
9f95a23c 9755 std::lock_guard l{osd_lock};
7c673cae
FG
9756 if (changed.count("osd_max_backfills")) {
9757 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9758 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9759 }
9760 if (changed.count("osd_min_recovery_priority")) {
9761 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9762 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9763 }
9764 if (changed.count("osd_max_trimming_pgs")) {
9765 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9766 }
9767 if (changed.count("osd_op_complaint_time") ||
9768 changed.count("osd_op_log_threshold")) {
9769 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9770 cct->_conf->osd_op_log_threshold);
9771 }
9772 if (changed.count("osd_op_history_size") ||
9773 changed.count("osd_op_history_duration")) {
9774 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9775 cct->_conf->osd_op_history_duration);
9776 }
9777 if (changed.count("osd_op_history_slow_op_size") ||
9778 changed.count("osd_op_history_slow_op_threshold")) {
9779 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9780 cct->_conf->osd_op_history_slow_op_threshold);
9781 }
9782 if (changed.count("osd_enable_op_tracker")) {
9783 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9784 }
7c673cae
FG
9785 if (changed.count("osd_map_cache_size")) {
9786 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9787 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9788 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9789 }
9790 if (changed.count("clog_to_monitors") ||
9791 changed.count("clog_to_syslog") ||
9792 changed.count("clog_to_syslog_level") ||
9793 changed.count("clog_to_syslog_facility") ||
9794 changed.count("clog_to_graylog") ||
9795 changed.count("clog_to_graylog_host") ||
9796 changed.count("clog_to_graylog_port") ||
9797 changed.count("host") ||
9798 changed.count("fsid")) {
9799 update_log_config();
9800 }
11fdf7f2
TL
9801 if (changed.count("osd_pg_epoch_max_lag_factor")) {
9802 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
9803 "osd_pg_epoch_max_lag_factor");
9804 }
7c673cae
FG
9805
9806#ifdef HAVE_LIBFUSE
9807 if (changed.count("osd_objectstore_fuse")) {
9808 if (store) {
9809 enable_disable_fuse(false);
9810 }
9811 }
9812#endif
9813
9814 if (changed.count("osd_recovery_delay_start")) {
9815 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9816 service.kick_recovery_queue();
9817 }
9818
9819 if (changed.count("osd_client_message_cap")) {
9820 uint64_t newval = cct->_conf->osd_client_message_cap;
9821 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9822 if (pol.throttler_messages && newval > 0) {
9823 pol.throttler_messages->reset_max(newval);
9824 }
9825 }
9826 if (changed.count("osd_client_message_size_cap")) {
9827 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9828 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9829 if (pol.throttler_bytes && newval > 0) {
9830 pol.throttler_bytes->reset_max(newval);
9831 }
9832 }
9f95a23c
TL
9833 if (changed.count("osd_object_clean_region_max_num_intervals")) {
9834 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
9835 }
7c673cae 9836
494da23a
TL
9837 if (changed.count("osd_scrub_min_interval") ||
9838 changed.count("osd_scrub_max_interval")) {
9839 resched_all_scrubs();
9840 dout(0) << __func__ << ": scrub interval change" << dendl;
9841 }
7c673cae
FG
9842 check_config();
9843}
9844
9845void OSD::update_log_config()
9846{
9847 map<string,string> log_to_monitors;
9848 map<string,string> log_to_syslog;
9849 map<string,string> log_channel;
9850 map<string,string> log_prio;
9851 map<string,string> log_to_graylog;
9852 map<string,string> log_to_graylog_host;
9853 map<string,string> log_to_graylog_port;
9854 uuid_d fsid;
9855 string host;
9856
9857 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9858 log_channel, log_prio, log_to_graylog,
9859 log_to_graylog_host, log_to_graylog_port,
9860 fsid, host) == 0)
9861 clog->update_config(log_to_monitors, log_to_syslog,
9862 log_channel, log_prio, log_to_graylog,
9863 log_to_graylog_host, log_to_graylog_port,
9864 fsid, host);
9865 derr << "log_to_monitors " << log_to_monitors << dendl;
9866}
9867
9868void OSD::check_config()
9869{
9870 // some sanity checks
7c673cae
FG
9871 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9872 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9873 << " is not > osd_pg_epoch_persisted_max_stale ("
9874 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9875 }
9f95a23c
TL
9876 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
9877 clog->warn() << "osd_object_clean_region_max_num_intervals ("
9878 << cct->_conf->osd_object_clean_region_max_num_intervals
9879 << ") is < 0";
9880 }
7c673cae
FG
9881}
9882
7c673cae
FG
9883// --------------------------------
9884
9885void OSD::get_latest_osdmap()
9886{
9887 dout(10) << __func__ << " -- start" << dendl;
9888
9889 C_SaferCond cond;
9890 service.objecter->wait_for_latest_osdmap(&cond);
9891 cond.wait();
9892
9893 dout(10) << __func__ << " -- finish" << dendl;
9894}
9895
9896// --------------------------------
9897
9f95a23c
TL
9898void OSD::set_perf_queries(const ConfigPayload &config_payload) {
9899 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
9900 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
11fdf7f2
TL
9901 dout(10) << "setting " << queries.size() << " queries" << dendl;
9902
9903 std::list<OSDPerfMetricQuery> supported_queries;
9904 for (auto &it : queries) {
9905 auto &query = it.first;
9906 if (!query.key_descriptor.empty()) {
9907 supported_queries.push_back(query);
9908 }
9909 }
9910 if (supported_queries.size() < queries.size()) {
9911 dout(1) << queries.size() - supported_queries.size()
9912 << " unsupported queries" << dendl;
9913 }
11fdf7f2 9914 {
9f95a23c 9915 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
9916 m_perf_queries = supported_queries;
9917 m_perf_limits = queries;
9918 }
11fdf7f2
TL
9919 std::vector<PGRef> pgs;
9920 _get_pgs(&pgs);
9921 for (auto& pg : pgs) {
9f95a23c 9922 std::scoped_lock l{*pg};
eafe8130 9923 pg->set_dynamic_perf_stats_queries(supported_queries);
7c673cae 9924 }
7c673cae
FG
9925}
9926
9f95a23c
TL
9927MetricPayload OSD::get_perf_reports() {
9928 OSDMetricPayload payload;
9929 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
9930
11fdf7f2
TL
9931 std::vector<PGRef> pgs;
9932 _get_pgs(&pgs);
9933 DynamicPerfStats dps;
9934 for (auto& pg : pgs) {
eafe8130
TL
9935 // m_perf_queries can be modified only in set_perf_queries by mgr client
9936 // request, and it is protected by the mgr client's lock, which is held
9937 // when set_perf_queries/get_perf_reports are called, so we do not need
9938 // to hold m_perf_queries_lock here.
9939 DynamicPerfStats pg_dps(m_perf_queries);
9940 pg->lock();
9941 pg->get_dynamic_perf_stats(&pg_dps);
9942 pg->unlock();
9943 dps.merge(pg_dps);
11fdf7f2 9944 }
9f95a23c
TL
9945 dps.add_to_reports(m_perf_limits, &reports);
9946 dout(20) << "reports for " << reports.size() << " queries" << dendl;
9947
9948 return payload;
11fdf7f2 9949}
224ce89b 9950
7c673cae
FG
9951// =============================================================
9952
9953#undef dout_context
11fdf7f2 9954#define dout_context cct
7c673cae 9955#undef dout_prefix
11fdf7f2 9956#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 9957
11fdf7f2 9958void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 9959{
11fdf7f2
TL
9960 dout(10) << pg->pg_id << " " << pg << dendl;
9961 slot->pg = pg;
9962 pg->osd_shard = this;
9963 pg->pg_slot = slot;
9964 osd->inc_num_pgs();
9965
9966 slot->epoch = pg->get_osdmap_epoch();
9967 pg_slots_by_epoch.insert(*slot);
9968}
9969
9970void OSDShard::_detach_pg(OSDShardPGSlot *slot)
9971{
9972 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
9973 slot->pg->osd_shard = nullptr;
9974 slot->pg->pg_slot = nullptr;
9975 slot->pg = nullptr;
9976 osd->dec_num_pgs();
9977
9978 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
9979 slot->epoch = 0;
9980 if (waiting_for_min_pg_epoch) {
9981 min_pg_epoch_cond.notify_all();
9982 }
9983}
9984
9985void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
9986{
9987 std::lock_guard l(shard_lock);
9988 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
9989 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
9990 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
9991 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
9992 slot->epoch = e;
9993 pg_slots_by_epoch.insert(*slot);
9994 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
9995 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
9996 if (waiting_for_min_pg_epoch) {
9997 min_pg_epoch_cond.notify_all();
9998 }
9999}
10000
10001epoch_t OSDShard::get_min_pg_epoch()
10002{
10003 std::lock_guard l(shard_lock);
10004 auto p = pg_slots_by_epoch.begin();
10005 if (p == pg_slots_by_epoch.end()) {
10006 return 0;
10007 }
10008 return p->epoch;
10009}
10010
10011void OSDShard::wait_min_pg_epoch(epoch_t need)
10012{
10013 std::unique_lock l{shard_lock};
10014 ++waiting_for_min_pg_epoch;
10015 min_pg_epoch_cond.wait(l, [need, this] {
10016 if (pg_slots_by_epoch.empty()) {
10017 return true;
10018 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10019 return true;
10020 } else {
10021 dout(10) << need << " waiting on "
10022 << pg_slots_by_epoch.begin()->epoch << dendl;
10023 return false;
10024 }
10025 });
10026 --waiting_for_min_pg_epoch;
10027}
10028
10029epoch_t OSDShard::get_max_waiting_epoch()
10030{
10031 std::lock_guard l(shard_lock);
10032 epoch_t r = 0;
10033 for (auto& i : pg_slots) {
10034 if (!i.second->waiting_peering.empty()) {
10035 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10036 }
10037 }
10038 return r;
10039}
10040
10041void OSDShard::consume_map(
9f95a23c 10042 const OSDMapRef& new_osdmap,
11fdf7f2
TL
10043 unsigned *pushes_to_free)
10044{
10045 std::lock_guard l(shard_lock);
10046 OSDMapRef old_osdmap;
7c673cae 10047 {
11fdf7f2
TL
10048 std::lock_guard l(osdmap_lock);
10049 old_osdmap = std::move(shard_osdmap);
10050 shard_osdmap = new_osdmap;
10051 }
10052 dout(10) << new_osdmap->get_epoch()
10053 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10054 << dendl;
10055 bool queued = false;
10056
10057 // check slots
10058 auto p = pg_slots.begin();
10059 while (p != pg_slots.end()) {
10060 OSDShardPGSlot *slot = p->second.get();
10061 const spg_t& pgid = p->first;
10062 dout(20) << __func__ << " " << pgid << dendl;
10063 if (!slot->waiting_for_split.empty()) {
10064 dout(20) << __func__ << " " << pgid
10065 << " waiting for split " << slot->waiting_for_split << dendl;
10066 ++p;
10067 continue;
10068 }
10069 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10070 dout(20) << __func__ << " " << pgid
10071 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10072 << dendl;
10073 ++p;
10074 continue;
10075 }
10076 if (!slot->waiting_peering.empty()) {
10077 epoch_t first = slot->waiting_peering.begin()->first;
10078 if (first <= new_osdmap->get_epoch()) {
10079 dout(20) << __func__ << " " << pgid
10080 << " pending_peering first epoch " << first
10081 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10082 _wake_pg_slot(pgid, slot);
10083 queued = true;
10084 }
10085 ++p;
10086 continue;
10087 }
10088 if (!slot->waiting.empty()) {
10089 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10090 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10091 << dendl;
10092 ++p;
10093 continue;
7c673cae 10094 }
11fdf7f2
TL
10095 while (!slot->waiting.empty() &&
10096 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10097 auto& qi = slot->waiting.front();
10098 dout(20) << __func__ << " " << pgid
10099 << " waiting item " << qi
10100 << " epoch " << qi.get_map_epoch()
10101 << " <= " << new_osdmap->get_epoch()
10102 << ", "
10103 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10104 "misdirected")
10105 << ", dropping" << dendl;
10106 *pushes_to_free += qi.get_reserved_pushes();
10107 slot->waiting.pop_front();
10108 }
10109 }
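    // when the pg no longer maps to this OSD, the loop above has dropped
    // every waiting item addressed to an epoch at or below the new map:
    // "stale" if it named an older epoch, "misdirected" if it named this
    // epoch exactly; the reserved pushes of dropped items are handed back to
    // the caller via *pushes_to_free.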
10110 if (slot->waiting.empty() &&
10111 slot->num_running == 0 &&
10112 slot->waiting_for_split.empty() &&
10113 !slot->pg) {
10114 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10115 p = pg_slots.erase(p);
10116 continue;
7c673cae 10117 }
11fdf7f2
TL
10118
10119 ++p;
7c673cae 10120 }
7c673cae 10121 if (queued) {
11fdf7f2
TL
10122 std::lock_guard l{sdata_wait_lock};
10123 sdata_cond.notify_one();
7c673cae
FG
10124 }
10125}
10126
11fdf7f2
TL
10127void OSDShard::_wake_pg_slot(
10128 spg_t pgid,
10129 OSDShardPGSlot *slot)
10130{
10131 dout(20) << __func__ << " " << pgid
10132 << " to_process " << slot->to_process
10133 << " waiting " << slot->waiting
10134 << " waiting_peering " << slot->waiting_peering << dendl;
10135 for (auto i = slot->to_process.rbegin();
10136 i != slot->to_process.rend();
10137 ++i) {
9f95a23c 10138 scheduler->enqueue_front(std::move(*i));
11fdf7f2
TL
10139 }
10140 slot->to_process.clear();
10141 for (auto i = slot->waiting.rbegin();
10142 i != slot->waiting.rend();
10143 ++i) {
9f95a23c 10144 scheduler->enqueue_front(std::move(*i));
11fdf7f2
TL
10145 }
10146 slot->waiting.clear();
10147 for (auto i = slot->waiting_peering.rbegin();
10148 i != slot->waiting_peering.rend();
10149 ++i) {
10150 // this is overkill; we requeue everything, even if some of these
10151 // items are waiting for maps we don't have yet. FIXME, maybe,
10152 // someday, if we decide this inefficiency matters
10153 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
9f95a23c 10154 scheduler->enqueue_front(std::move(*j));
11fdf7f2
TL
10155 }
10156 }
10157 slot->waiting_peering.clear();
10158 ++slot->requeue_seq;
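  // bumping requeue_seq lets a concurrent _process(), which sampled the old
  // value before dropping shard_lock to take the pg lock, notice that the
  // slot's contents were requeued and bail out instead of acting on a stale
  // to_process list (see the requeue_seq check in ShardedOpWQ::_process).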
10159}
10160
10161void OSDShard::identify_splits_and_merges(
10162 const OSDMapRef& as_of_osdmap,
10163 set<pair<spg_t,epoch_t>> *split_pgs,
10164 set<pair<spg_t,epoch_t>> *merge_pgs)
10165{
10166 std::lock_guard l(shard_lock);
10167 if (shard_osdmap) {
10168 for (auto& i : pg_slots) {
10169 const spg_t& pgid = i.first;
10170 auto *slot = i.second.get();
10171 if (slot->pg) {
10172 osd->service.identify_splits_and_merges(
10173 shard_osdmap, as_of_osdmap, pgid,
10174 split_pgs, merge_pgs);
10175 } else if (!slot->waiting_for_split.empty()) {
10176 osd->service.identify_splits_and_merges(
10177 shard_osdmap, as_of_osdmap, pgid,
10178 split_pgs, nullptr);
10179 } else {
10180 dout(20) << __func__ << " slot " << pgid
9f95a23c 10181 << " has no pg and waiting_for_split " << dendl;
7c673cae 10182 }
11fdf7f2
TL
10183 }
10184 }
10185}
10186
10187void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10188 set<pair<spg_t,epoch_t>> *pgids)
10189{
10190 std::lock_guard l(shard_lock);
10191 _prime_splits(pgids);
10192 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10193 set<pair<spg_t,epoch_t>> newer_children;
10194 for (auto i : *pgids) {
10195 osd->service.identify_splits_and_merges(
10196 as_of_osdmap, shard_osdmap, i.first,
10197 &newer_children, nullptr);
10198 }
10199 newer_children.insert(pgids->begin(), pgids->end());
10200 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10201 << shard_osdmap->get_epoch() << ", new children " << newer_children
10202 << dendl;
10203 _prime_splits(&newer_children);
 10204 // note: we don't care what is left over here for other shards.
 10205 // if this shard's map is ahead of the caller's as_of_osdmap (e.g., one
 10206 // thread is calling into prime_splits via _process due to a newly
 10207 // created pg, while this shard picked up a newer map from a racing
 10208 // consume_map), then any grandchildren left in the set will be
 10209 // identified (or were already identified) when the slower shards'
 10210 // osdmaps are advanced.  _prime_splits() will tolerate the case where
 10211 // the pgid is already primed.
10212 }
10213}
10214
10215void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10216{
10217 dout(10) << *pgids << dendl;
10218 auto p = pgids->begin();
10219 while (p != pgids->end()) {
10220 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10221 if (shard_index == shard_id) {
10222 auto r = pg_slots.emplace(p->first, nullptr);
10223 if (r.second) {
10224 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10225 r.first->second = make_unique<OSDShardPGSlot>();
10226 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10227 } else {
11fdf7f2
TL
10228 auto q = r.first;
10229 ceph_assert(q != pg_slots.end());
10230 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10231 << dendl;
10232 q->second->waiting_for_split.insert(p->second);
7c673cae 10233 }
11fdf7f2
TL
10234 p = pgids->erase(p);
10235 } else {
10236 ++p;
7c673cae
FG
10237 }
10238 }
11fdf7f2
TL
10239}
10240
10241void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10242 set<pair<spg_t,epoch_t>> *merge_pgs)
10243{
10244 std::lock_guard l(shard_lock);
10245 dout(20) << __func__ << " checking shard " << shard_id
10246 << " for remaining merge pgs " << merge_pgs << dendl;
10247 auto p = merge_pgs->begin();
10248 while (p != merge_pgs->end()) {
10249 spg_t pgid = p->first;
10250 epoch_t epoch = p->second;
10251 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10252 if (shard_index != shard_id) {
10253 ++p;
10254 continue;
10255 }
10256 OSDShardPGSlot *slot;
10257 auto r = pg_slots.emplace(pgid, nullptr);
10258 if (r.second) {
10259 r.first->second = make_unique<OSDShardPGSlot>();
10260 }
10261 slot = r.first->second.get();
10262 if (slot->pg) {
10263 // already have pg
10264 dout(20) << __func__ << " have merge participant pg " << pgid
10265 << " " << slot->pg << dendl;
10266 } else if (!slot->waiting_for_split.empty() &&
10267 *slot->waiting_for_split.begin() < epoch) {
10268 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10269 << " " << slot->waiting_for_split << dendl;
10270 } else {
10271 dout(20) << __func__ << " creating empty merge participant " << pgid
10272 << " for merge in " << epoch << dendl;
10273 // leave history zeroed; PG::merge_from() will fill it in.
10274 pg_history_t history;
10275 PGCreateInfo cinfo(pgid, epoch - 1,
10276 history, PastIntervals(), false);
10277 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10278 _attach_pg(r.first->second.get(), pg.get());
10279 _wake_pg_slot(pgid, slot);
10280 pg->unlock();
10281 }
10282 // mark slot for merge
10283 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10284 slot->waiting_for_merge_epoch = epoch;
10285 p = merge_pgs->erase(p);
7c673cae
FG
10286 }
10287}
10288
11fdf7f2 10289void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10290{
11fdf7f2
TL
10291 epoch_t epoch;
10292 {
10293 std::lock_guard l(shard_lock);
10294 dout(10) << pg->pg_id << " " << pg << dendl;
10295 auto p = pg_slots.find(pg->pg_id);
10296 ceph_assert(p != pg_slots.end());
10297 auto *slot = p->second.get();
10298 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10299 << dendl;
10300 ceph_assert(!slot->pg);
10301 ceph_assert(!slot->waiting_for_split.empty());
10302 _attach_pg(slot, pg);
10303
10304 epoch = pg->get_osdmap_epoch();
10305 ceph_assert(slot->waiting_for_split.count(epoch));
10306 slot->waiting_for_split.erase(epoch);
10307 if (slot->waiting_for_split.empty()) {
10308 _wake_pg_slot(pg->pg_id, slot);
10309 } else {
10310 dout(10) << __func__ << " still waiting for split on "
10311 << slot->waiting_for_split << dendl;
10312 }
7c673cae 10313 }
11fdf7f2
TL
10314
10315 // kick child to ensure it pulls up to the latest osdmap
10316 osd->enqueue_peering_evt(
10317 pg->pg_id,
10318 PGPeeringEventRef(
10319 std::make_shared<PGPeeringEvent>(
10320 epoch,
10321 epoch,
10322 NullEvt())));
10323
10324 std::lock_guard l{sdata_wait_lock};
10325 sdata_cond.notify_one();
7c673cae
FG
10326}
10327
11fdf7f2 10328void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10329{
11fdf7f2
TL
10330 std::lock_guard l(shard_lock);
10331 vector<spg_t> to_delete;
10332 for (auto& i : pg_slots) {
10333 if (i.first != parent &&
10334 i.first.get_ancestor(old_pg_num) == parent) {
10335 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10336 << dendl;
10337 _wake_pg_slot(i.first, i.second.get());
10338 to_delete.push_back(i.first);
10339 }
10340 }
10341 for (auto pgid : to_delete) {
10342 pg_slots.erase(pgid);
10343 }
10344}
10345
9f95a23c
TL
10346OSDShard::OSDShard(
10347 int id,
10348 CephContext *cct,
10349 OSD *osd)
10350 : shard_id(id),
10351 cct(cct),
10352 osd(osd),
10353 shard_name(string("OSDShard.") + stringify(id)),
10354 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10355 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10356 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10357 shard_lock_name(shard_name + "::shard_lock"),
10358 shard_lock{make_mutex(shard_lock_name)},
10359 scheduler(ceph::osd::scheduler::make_scheduler(cct)),
10360 context_queue(sdata_wait_lock, sdata_cond)
10361{
10362 dout(0) << "using op scheduler " << *scheduler << dendl;
10363}
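// make_scheduler() is expected to pick the op scheduler implementation from
// configuration (e.g. the osd_op_queue option), which is why the chosen
// scheduler is logged at level 0 above.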
10364
11fdf7f2
TL
10365
10366// =============================================================
10367
10368#undef dout_context
10369#define dout_context osd->cct
10370#undef dout_prefix
10371#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10372
10373void OSD::ShardedOpWQ::_add_slot_waiter(
10374 spg_t pgid,
10375 OSDShardPGSlot *slot,
9f95a23c 10376 OpSchedulerItem&& qi)
11fdf7f2
TL
10377{
10378 if (qi.is_peering()) {
10379 dout(20) << __func__ << " " << pgid
10380 << " peering, item epoch is "
10381 << qi.get_map_epoch()
10382 << ", will wait on " << qi << dendl;
10383 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10384 } else {
10385 dout(20) << __func__ << " " << pgid
10386 << " item epoch is "
10387 << qi.get_map_epoch()
10388 << ", will wait on " << qi << dendl;
10389 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10390 }
10391}
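// peering items are bucketed in waiting_peering by the epoch they need, so
// consume_map() can tell from the oldest key whether a newly consumed map is
// enough to wake the slot; everything else is appended to the plain waiting
// list in arrival order.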
10392
10393#undef dout_prefix
10394#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10395
10396void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10397{
11fdf7f2
TL
10398 uint32_t shard_index = thread_index % osd->num_shards;
10399 auto& sdata = osd->shards[shard_index];
10400 ceph_assert(sdata);
10401
 10402 // If every thread of a shard drained oncommit contexts, they could
 10403 // complete out of order. So only the thread with the smallest
 10404 // thread_index for its shard (i.e. thread_index < num_shards) runs the
 10405 // oncommit callbacks.
10406 bool is_smallest_thread_index = thread_index < osd->num_shards;
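  // e.g. (hypothetical sizing): with num_shards = 8 and two worker threads
  // per shard there are 16 threads; thread_index 0..7 land on shards 0..7
  // and are each the "smallest" thread of their shard, so they also drain
  // the context_queue, while thread_index 8..15 map onto shards 0..7 again
  // (modulo) but never handle oncommits.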
7c673cae
FG
10407
10408 // peek at spg_t
11fdf7f2 10409 sdata->shard_lock.lock();
9f95a23c 10410 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10411 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10412 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10413 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10414 // we raced with a context_queue addition, don't wait
10415 wait_lock.unlock();
10416 } else if (!sdata->stop_waiting) {
10417 dout(20) << __func__ << " empty q, waiting" << dendl;
10418 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10419 sdata->shard_lock.unlock();
10420 sdata->sdata_cond.wait(wait_lock);
10421 wait_lock.unlock();
10422 sdata->shard_lock.lock();
9f95a23c 10423 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10424 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10425 sdata->shard_lock.unlock();
10426 return;
10427 }
10428 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10429 osd->cct->_conf->threadpool_default_timeout, 0);
10430 } else {
10431 dout(20) << __func__ << " need return immediately" << dendl;
10432 wait_lock.unlock();
10433 sdata->shard_lock.unlock();
7c673cae
FG
10434 return;
10435 }
10436 }
11fdf7f2
TL
10437
10438 list<Context *> oncommits;
9f95a23c
TL
10439 if (is_smallest_thread_index) {
10440 sdata->context_queue.move_to(oncommits);
7c673cae 10441 }
11fdf7f2 10442
9f95a23c 10443 if (sdata->scheduler->empty()) {
11fdf7f2
TL
10444 if (osd->is_stopping()) {
10445 sdata->shard_lock.unlock();
10446 for (auto c : oncommits) {
10447 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10448 delete c;
10449 }
10450 return; // OSD shutdown, discard.
7c673cae 10451 }
11fdf7f2
TL
10452 sdata->shard_lock.unlock();
10453 handle_oncommits(oncommits);
10454 return;
7c673cae 10455 }
7c673cae 10456
9f95a23c 10457 OpSchedulerItem item = sdata->scheduler->dequeue();
11fdf7f2
TL
10458 if (osd->is_stopping()) {
10459 sdata->shard_lock.unlock();
10460 for (auto c : oncommits) {
10461 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10462 delete c;
10463 }
10464 return; // OSD shutdown, discard.
10465 }
7c673cae 10466
11fdf7f2
TL
10467 const auto token = item.get_ordering_token();
10468 auto r = sdata->pg_slots.emplace(token, nullptr);
10469 if (r.second) {
10470 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10471 }
11fdf7f2
TL
10472 OSDShardPGSlot *slot = r.first->second.get();
10473 dout(20) << __func__ << " " << token
10474 << (r.second ? " (new)" : "")
10475 << " to_process " << slot->to_process
10476 << " waiting " << slot->waiting
10477 << " waiting_peering " << slot->waiting_peering
10478 << dendl;
10479 slot->to_process.push_back(std::move(item));
10480 dout(20) << __func__ << " " << slot->to_process.back()
10481 << " queued" << dendl;
7c673cae 10482
11fdf7f2
TL
10483 retry_pg:
10484 PGRef pg = slot->pg;
7c673cae 10485
11fdf7f2
TL
10486 // lock pg (if we have it)
10487 if (pg) {
10488 // note the requeue seq now...
10489 uint64_t requeue_seq = slot->requeue_seq;
10490 ++slot->num_running;
7c673cae 10491
11fdf7f2
TL
10492 sdata->shard_lock.unlock();
10493 osd->service.maybe_inject_dispatch_delay();
10494 pg->lock();
10495 osd->service.maybe_inject_dispatch_delay();
10496 sdata->shard_lock.lock();
7c673cae 10497
11fdf7f2
TL
10498 auto q = sdata->pg_slots.find(token);
10499 if (q == sdata->pg_slots.end()) {
10500 // this can happen if we race with pg removal.
10501 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10502 pg->unlock();
10503 sdata->shard_lock.unlock();
10504 handle_oncommits(oncommits);
10505 return;
10506 }
10507 slot = q->second.get();
10508 --slot->num_running;
7c673cae 10509
11fdf7f2
TL
10510 if (slot->to_process.empty()) {
10511 // raced with _wake_pg_slot or consume_map
10512 dout(20) << __func__ << " " << token
10513 << " nothing queued" << dendl;
7c673cae 10514 pg->unlock();
11fdf7f2
TL
10515 sdata->shard_lock.unlock();
10516 handle_oncommits(oncommits);
10517 return;
7c673cae 10518 }
11fdf7f2
TL
10519 if (requeue_seq != slot->requeue_seq) {
10520 dout(20) << __func__ << " " << token
10521 << " requeue_seq " << slot->requeue_seq << " > our "
10522 << requeue_seq << ", we raced with _wake_pg_slot"
10523 << dendl;
7c673cae 10524 pg->unlock();
11fdf7f2
TL
10525 sdata->shard_lock.unlock();
10526 handle_oncommits(oncommits);
10527 return;
7c673cae 10528 }
11fdf7f2
TL
10529 if (slot->pg != pg) {
10530 // this can happen if we race with pg removal.
10531 dout(20) << __func__ << " slot " << token << " no longer attached to "
10532 << pg << dendl;
7c673cae 10533 pg->unlock();
11fdf7f2 10534 goto retry_pg;
7c673cae 10535 }
7c673cae
FG
10536 }
10537
11fdf7f2
TL
10538 dout(20) << __func__ << " " << token
10539 << " to_process " << slot->to_process
10540 << " waiting " << slot->waiting
10541 << " waiting_peering " << slot->waiting_peering << dendl;
10542
10543 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10544 suicide_interval);
10545
7c673cae 10546 // take next item
11fdf7f2
TL
10547 auto qi = std::move(slot->to_process.front());
10548 slot->to_process.pop_front();
10549 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10550 set<pair<spg_t,epoch_t>> new_children;
10551 OSDMapRef osdmap;
7c673cae 10552
11fdf7f2 10553 while (!pg) {
7c673cae 10554 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10555 osdmap = sdata->shard_osdmap;
10556 const PGCreateInfo *create_info = qi.creates_pg();
10557 if (!slot->waiting_for_split.empty()) {
10558 dout(20) << __func__ << " " << token
10559 << " splitting " << slot->waiting_for_split << dendl;
10560 _add_slot_waiter(token, slot, std::move(qi));
10561 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10562 dout(20) << __func__ << " " << token
10563 << " map " << qi.get_map_epoch() << " > "
10564 << osdmap->get_epoch() << dendl;
10565 _add_slot_waiter(token, slot, std::move(qi));
10566 } else if (qi.is_peering()) {
10567 if (!qi.peering_requires_pg()) {
10568 // for pg-less events, we run them under the ordering lock, since
10569 // we don't have the pg lock to keep them ordered.
10570 qi.run(osd, sdata, pg, tp_handle);
10571 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10572 if (create_info) {
10573 if (create_info->by_mon &&
10574 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10575 dout(20) << __func__ << " " << token
10576 << " no pg, no longer primary, ignoring mon create on "
10577 << qi << dendl;
10578 } else {
10579 dout(20) << __func__ << " " << token
10580 << " no pg, should create on " << qi << dendl;
10581 pg = osd->handle_pg_create_info(osdmap, create_info);
10582 if (pg) {
10583 // we created the pg! drop out and continue "normally"!
10584 sdata->_attach_pg(slot, pg.get());
10585 sdata->_wake_pg_slot(token, slot);
10586
10587 // identify split children between create epoch and shard epoch.
10588 osd->service.identify_splits_and_merges(
10589 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10590 sdata->_prime_splits(&new_children);
10591 // distribute remaining split children to other shards below!
10592 break;
10593 }
10594 dout(20) << __func__ << " ignored create on " << qi << dendl;
10595 }
10596 } else {
10597 dout(20) << __func__ << " " << token
10598 << " no pg, peering, !create, discarding " << qi << dendl;
10599 }
10600 } else {
10601 dout(20) << __func__ << " " << token
10602 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
10603 << ", discarding " << qi
10604 << dendl;
10605 }
10606 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10607 dout(20) << __func__ << " " << token
10608 << " no pg, should exist e" << osdmap->get_epoch()
10609 << ", will wait on " << qi << dendl;
10610 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 10611 } else {
11fdf7f2
TL
10612 dout(20) << __func__ << " " << token
10613 << " no pg, shouldn't exist e" << osdmap->get_epoch()
10614 << ", dropping " << qi << dendl;
7c673cae 10615 // share map with client?
9f95a23c
TL
10616 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
10617 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
10618 sdata->shard_osdmap,
10619 (*_op)->sent_epoch);
7c673cae 10620 }
11fdf7f2 10621 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 10622 if (pushes_to_free > 0) {
11fdf7f2 10623 sdata->shard_lock.unlock();
7c673cae 10624 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 10625 handle_oncommits(oncommits);
7c673cae
FG
10626 return;
10627 }
10628 }
11fdf7f2
TL
10629 sdata->shard_lock.unlock();
10630 handle_oncommits(oncommits);
7c673cae
FG
10631 return;
10632 }
11fdf7f2
TL
10633 if (qi.is_peering()) {
10634 OSDMapRef osdmap = sdata->shard_osdmap;
10635 if (qi.get_map_epoch() > osdmap->get_epoch()) {
10636 _add_slot_waiter(token, slot, std::move(qi));
10637 sdata->shard_lock.unlock();
10638 pg->unlock();
10639 handle_oncommits(oncommits);
10640 return;
10641 }
10642 }
10643 sdata->shard_lock.unlock();
7c673cae 10644
11fdf7f2
TL
10645 if (!new_children.empty()) {
10646 for (auto shard : osd->shards) {
10647 shard->prime_splits(osdmap, &new_children);
10648 }
10649 ceph_assert(new_children.empty());
10650 }
7c673cae
FG
10651
10652 // osd_opwq_process marks the point at which an operation has been dequeued
10653 // and will begin to be handled by a worker thread.
10654 {
10655#ifdef WITH_LTTNG
10656 osd_reqid_t reqid;
9f95a23c 10657 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10658 reqid = (*_op)->get_reqid();
10659 }
10660#endif
10661 tracepoint(osd, opwq_process_start, reqid.name._type,
10662 reqid.name._num, reqid.tid, reqid.inc);
10663 }
10664
10665 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10666 Formatter *f = Formatter::create("json");
10667 f->open_object_section("q");
10668 dump(f);
10669 f->close_section();
10670 f->flush(*_dout);
10671 delete f;
10672 *_dout << dendl;
10673
11fdf7f2 10674 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
10675
10676 {
10677#ifdef WITH_LTTNG
10678 osd_reqid_t reqid;
9f95a23c 10679 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
10680 reqid = (*_op)->get_reqid();
10681 }
10682#endif
10683 tracepoint(osd, opwq_process_finish, reqid.name._type,
10684 reqid.name._num, reqid.tid, reqid.inc);
10685 }
10686
11fdf7f2 10687 handle_oncommits(oncommits);
7c673cae
FG
10688}
10689
9f95a23c 10690void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
7c673cae 10691 uint32_t shard_index =
11fdf7f2 10692 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 10693
9f95a23c
TL
10694 dout(20) << __func__ << " " << item << dendl;
10695
11fdf7f2 10696 OSDShard* sdata = osd->shards[shard_index];
7c673cae 10697 assert (NULL != sdata);
7c673cae 10698
9f95a23c
TL
10699 bool empty = true;
10700 {
10701 std::lock_guard l{sdata->shard_lock};
10702 empty = sdata->scheduler->empty();
10703 sdata->scheduler->enqueue(std::move(item));
10704 }
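  // only wake a worker when the scheduler transitioned from empty to
  // non-empty; if it already held items, a worker is assumed to be awake (or
  // already signalled) and will pick this item up on its next pass.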
7c673cae 10705
9f95a23c
TL
10706 if (empty) {
10707 std::lock_guard l{sdata->sdata_wait_lock};
10708 sdata->sdata_cond.notify_one();
10709 }
7c673cae
FG
10710}
10711
9f95a23c 10712void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
7c673cae 10713{
11fdf7f2
TL
10714 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
10715 auto& sdata = osd->shards[shard_index];
10716 ceph_assert(sdata);
10717 sdata->shard_lock.lock();
10718 auto p = sdata->pg_slots.find(item.get_ordering_token());
10719 if (p != sdata->pg_slots.end() &&
10720 !p->second->to_process.empty()) {
7c673cae 10721 // we may be racing with _process, which has dequeued a new item
9f95a23c 10722 // from scheduler, put it on to_process, and is now busy taking the
7c673cae
FG
10723 // pg lock. ensure this old requeued item is ordered before any
10724 // such newer item in to_process.
11fdf7f2
TL
10725 p->second->to_process.push_front(std::move(item));
10726 item = std::move(p->second->to_process.back());
10727 p->second->to_process.pop_back();
10728 dout(20) << __func__
10729 << " " << p->second->to_process.front()
10730 << " shuffled w/ " << item << dendl;
7c673cae 10731 } else {
11fdf7f2 10732 dout(20) << __func__ << " " << item << dendl;
7c673cae 10733 }
9f95a23c 10734 sdata->scheduler->enqueue_front(std::move(item));
11fdf7f2
TL
10735 sdata->shard_lock.unlock();
10736 std::lock_guard l{sdata->sdata_wait_lock};
10737 sdata->sdata_cond.notify_one();
7c673cae
FG
10738}
10739
10740namespace ceph {
10741namespace osd_cmds {
10742
11fdf7f2
TL
10743int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10744 std::ostream& os)
7c673cae
FG
10745{
10746 if (!ceph_using_tcmalloc()) {
10747 os << "could not issue heap profiler command -- not using tcmalloc!";
10748 return -EOPNOTSUPP;
10749 }
10750
10751 string cmd;
9f95a23c 10752 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
7c673cae
FG
10753 os << "unable to get value for command \"" << cmd << "\"";
10754 return -EINVAL;
11fdf7f2 10755 }
7c673cae
FG
10756
10757 std::vector<std::string> cmd_vec;
10758 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
10759
10760 string val;
9f95a23c 10761 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
10762 cmd_vec.push_back(val);
10763 }
7c673cae
FG
10764
10765 ceph_heap_profiler_handle_command(cmd_vec, os);
10766
10767 return 0;
10768}
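// typically reached via the admin interface, e.g. something like
// "ceph tell osd.0 heap stats" or "ceph tell osd.0 heap start_profiler"
// (the exact invocation shown here is illustrative).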
10769
10770}} // namespace ceph::osd_cmds