// ceph/src/osd/OSD.cc  (Ceph "quincy" 17.2.0)
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
17
18#include <cctype>
19#include <fstream>
20#include <iostream>
21#include <iterator>
22
23#include <unistd.h>
24#include <sys/stat.h>
25#include <signal.h>
eafe8130 26#include <time.h>
eafe8130 27#include <boost/range/adaptor/reversed.hpp>
28
29#ifdef HAVE_SYS_PARAM_H
30#include <sys/param.h>
31#endif
32
33#ifdef HAVE_SYS_MOUNT_H
34#include <sys/mount.h>
35#endif
36
37#include "osd/PG.h"
38#include "osd/scrubber/scrub_machine.h"
39#include "osd/scrubber/pg_scrubber.h"
40
41#include "include/types.h"
42#include "include/compat.h"
11fdf7f2 43#include "include/random.h"
20effc67 44#include "include/scope_guard.h"
45
46#include "OSD.h"
47#include "OSDMap.h"
48#include "Watch.h"
49#include "osdc/Objecter.h"
50
51#include "common/errno.h"
52#include "common/ceph_argparse.h"
9f95a23c 53#include "common/ceph_releases.h"
224ce89b 54#include "common/ceph_time.h"
7c673cae 55#include "common/version.h"
f67539c2 56#include "common/async/blocked_completion.h"
b5b8bbf5 57#include "common/pick_address.h"
58#include "common/blkdev.h"
59#include "common/numa.h"
60
61#include "os/ObjectStore.h"
62#ifdef HAVE_LIBFUSE
63#include "os/FuseStore.h"
64#endif
65
66#include "PrimaryLogPG.h"
67
68#include "msg/Messenger.h"
69#include "msg/Message.h"
70
71#include "mon/MonClient.h"
72
73#include "messages/MLog.h"
74
75#include "messages/MGenericMessage.h"
76#include "messages/MOSDPing.h"
77#include "messages/MOSDFailure.h"
78#include "messages/MOSDMarkMeDown.h"
9f95a23c 79#include "messages/MOSDMarkMeDead.h"
80#include "messages/MOSDFull.h"
81#include "messages/MOSDOp.h"
82#include "messages/MOSDOpReply.h"
83#include "messages/MOSDBackoff.h"
84#include "messages/MOSDBeacon.h"
85#include "messages/MOSDRepOp.h"
86#include "messages/MOSDRepOpReply.h"
87#include "messages/MOSDBoot.h"
88#include "messages/MOSDPGTemp.h"
11fdf7f2 89#include "messages/MOSDPGReadyToMerge.h"
90
91#include "messages/MOSDMap.h"
92#include "messages/MMonGetOSDMap.h"
93#include "messages/MOSDPGNotify.h"
9f95a23c 94#include "messages/MOSDPGNotify2.h"
9f95a23c 95#include "messages/MOSDPGQuery2.h"
96#include "messages/MOSDPGLog.h"
97#include "messages/MOSDPGRemove.h"
98#include "messages/MOSDPGInfo.h"
9f95a23c 99#include "messages/MOSDPGInfo2.h"
7c673cae 100#include "messages/MOSDPGCreate.h"
11fdf7f2 101#include "messages/MOSDPGCreate2.h"
102#include "messages/MBackfillReserve.h"
103#include "messages/MRecoveryReserve.h"
c07f9fc5 104#include "messages/MOSDForceRecovery.h"
105#include "messages/MOSDECSubOpWrite.h"
106#include "messages/MOSDECSubOpWriteReply.h"
107#include "messages/MOSDECSubOpRead.h"
108#include "messages/MOSDECSubOpReadReply.h"
109#include "messages/MOSDPGCreated.h"
110#include "messages/MOSDPGUpdateLogMissing.h"
111#include "messages/MOSDPGUpdateLogMissingReply.h"
112
113#include "messages/MOSDPeeringOp.h"
114
115#include "messages/MOSDAlive.h"
116
117#include "messages/MOSDScrub.h"
11fdf7f2 118#include "messages/MOSDScrub2.h"
7c673cae 119
120#include "messages/MCommand.h"
121#include "messages/MCommandReply.h"
122
123#include "messages/MPGStats.h"
7c673cae 124
125#include "messages/MMonGetPurgedSnaps.h"
126#include "messages/MMonGetPurgedSnapsReply.h"
127
128#include "common/perf_counters.h"
129#include "common/Timer.h"
130#include "common/LogClient.h"
131#include "common/AsyncReserver.h"
132#include "common/HeartbeatMap.h"
133#include "common/admin_socket.h"
134#include "common/ceph_context.h"
135
136#include "global/signal_handler.h"
137#include "global/pidfile.h"
138
139#include "include/color.h"
140#include "perfglue/cpu_profiler.h"
141#include "perfglue/heap_profiler.h"
142
f67539c2 143#include "osd/ClassHandler.h"
144#include "osd/OpRequest.h"
145
146#include "auth/AuthAuthorizeHandler.h"
147#include "auth/RotatingKeyRing.h"
148
149#include "objclass/objclass.h"
150
151#include "common/cmdparse.h"
152#include "include/str_list.h"
153#include "include/util.h"
154
11fdf7f2 155#include "include/ceph_assert.h"
156#include "common/config.h"
157#include "common/EventTrace.h"
158
159#include "json_spirit/json_spirit_reader.h"
160#include "json_spirit/json_spirit_writer.h"
161
162#ifdef WITH_LTTNG
163#define TRACEPOINT_DEFINE
164#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
165#include "tracing/osd.h"
166#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
167#undef TRACEPOINT_DEFINE
168#else
169#define tracepoint(...)
170#endif
171
172#include "osd_tracer.h"
173
174
175#define dout_context cct
176#define dout_subsys ceph_subsys_osd
177#undef dout_prefix
178#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
179
180using std::deque;
181using std::list;
182using std::lock_guard;
183using std::make_pair;
184using std::make_tuple;
185using std::make_unique;
186using std::map;
187using std::ostream;
188using std::ostringstream;
189using std::pair;
190using std::set;
191using std::string;
192using std::stringstream;
193using std::to_string;
194using std::unique_ptr;
195using std::vector;
196
197using ceph::bufferlist;
198using ceph::bufferptr;
199using ceph::decode;
200using ceph::encode;
201using ceph::fixed_u_to_string;
202using ceph::Formatter;
203using ceph::heartbeat_handle_d;
204using ceph::make_mutex;
205
206using namespace ceph::osd::scheduler;
207using TOPNSPC::common::cmd_getval;
20effc67 208using TOPNSPC::common::cmd_getval_or;
224ce89b 209
210static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
211 return *_dout << "osd." << whoami << " " << epoch << " ";
212}
213
20effc67 214
215//Initial features in new superblock.
216//Features here are also automatically upgraded
217CompatSet OSD::get_osd_initial_compat_set() {
218 CompatSet::FeatureSet ceph_osd_feature_compat;
219 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
220 CompatSet::FeatureSet ceph_osd_feature_incompat;
221 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
222 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
223 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
224 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
225 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
226 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
227 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
228 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
229 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
230 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
231 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
232 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
233 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
234 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 235 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
9f95a23c 236 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
237 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
238 ceph_osd_feature_incompat);
239}
240
241//Features are added here that this OSD supports.
242CompatSet OSD::get_osd_compat_set() {
243 CompatSet compat = get_osd_initial_compat_set();
244 //Any features here can be set in code, but not in initial superblock
245 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
246 return compat;
247}
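// Note (added for clarity, not in the original source): the only delta between
// the two sets above is CEPH_OSD_FEATURE_INCOMPAT_SHARDS, which the running
// OSD supports but never writes into a freshly created superblock.  A rough
// sketch of how that delta is consumed at startup (illustrative only):
//
//   CompatSet ondisk  = superblock.compat_features;   // persisted in the store
//   CompatSet runtime = OSD::get_osd_compat_set();    // what this binary supports
//   // during startup the OSD may add features like SHARDS to 'ondisk' and
//   // rewrite the superblock ("automatically upgraded" per the comment above).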
248
f67539c2 249OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
250 osd(osd),
251 cct(osd->cct),
20effc67 252 whoami(osd->whoami), store(osd->store.get()),
253 log_client(osd->log_client), clog(osd->clog),
254 pg_recovery_stats(osd->pg_recovery_stats),
255 cluster_messenger(osd->cluster_messenger),
256 client_messenger(osd->client_messenger),
257 logger(osd->logger),
258 recoverystate_perf(osd->recoverystate_perf),
259 monc(osd->monc),
260 osd_max_object_size(cct->_conf, "osd_max_object_size"),
261 osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
262 publish_lock{ceph::make_mutex("OSDService::publish_lock")},
263 pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
7c673cae 264 max_oldest_map(0),
20effc67 265 m_scrub_queue{cct, *this},
266 agent_valid_iterator(false),
267 agent_ops(0),
268 flush_mode_high_count(0),
269 agent_active(true),
270 agent_thread(this),
271 agent_stop_flag(false),
272 agent_timer(osd->client_messenger->cct, agent_timer_lock),
273 last_recalibrate(ceph_clock_now()),
274 promote_max_objects(0),
275 promote_max_bytes(0),
f67539c2 276 poolctx(poolctx),
277 objecter(make_unique<Objecter>(osd->client_messenger->cct,
278 osd->objecter_messenger,
f67539c2 279 osd->monc, poolctx)),
11fdf7f2 280 m_objecter_finishers(cct->_conf->osd_objecter_finishers),
281 watch_timer(osd->client_messenger->cct, watch_lock),
282 next_notif_id(0),
7c673cae 283 recovery_request_timer(cct, recovery_request_lock, false),
11fdf7f2 284 sleep_timer(cct, sleep_lock, false),
7c673cae 285 reserver_finisher(cct),
3efd9988 286 local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 287 cct->_conf->osd_min_recovery_priority),
3efd9988 288 remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
7c673cae 289 cct->_conf->osd_min_recovery_priority),
3efd9988 290 snap_reserver(cct, &reserver_finisher,
7c673cae 291 cct->_conf->osd_max_trimming_pgs),
292 recovery_ops_active(0),
293 recovery_ops_reserved(0),
294 recovery_paused(false),
295 map_cache(cct, cct->_conf->osd_map_cache_size),
296 map_bl_cache(cct->_conf->osd_map_cache_size),
297 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
7c673cae 298 cur_state(NONE),
11fdf7f2 299 cur_ratio(0), physical_ratio(0),
9f95a23c 300 boot_epoch(0), up_epoch(0), bind_epoch(0)
301{
302 objecter->init();
303
304 for (int i = 0; i < m_objecter_finishers; i++) {
305 ostringstream str;
306 str << "objecter-finisher-" << i;
307 auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
308 objecter_finishers.push_back(std::move(fin));
11fdf7f2 309 }
310}
311
31f18b77 312#ifdef PG_DEBUG_REFS
f67539c2 313void OSDService::add_pgid(spg_t pgid, PG *pg) {
11fdf7f2 314 std::lock_guard l(pgid_lock);
315 if (!pgid_tracker.count(pgid)) {
316 live_pgs[pgid] = pg;
317 }
318 pgid_tracker[pgid]++;
319}
320void OSDService::remove_pgid(spg_t pgid, PG *pg)
321{
322 std::lock_guard l(pgid_lock);
323 ceph_assert(pgid_tracker.count(pgid));
324 ceph_assert(pgid_tracker[pgid] > 0);
325 pgid_tracker[pgid]--;
326 if (pgid_tracker[pgid] == 0) {
327 pgid_tracker.erase(pgid);
328 live_pgs.erase(pgid);
329 }
330}
331void OSDService::dump_live_pgids()
332{
11fdf7f2 333 std::lock_guard l(pgid_lock);
334 derr << "live pgids:" << dendl;
335 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
336 i != pgid_tracker.cend();
337 ++i) {
338 derr << "\t" << *i << dendl;
339 live_pgs[i->first]->dump_live_ids();
340 }
341}
342#endif
343
344
345ceph::signedspan OSDService::get_mnow()
346{
347 return ceph::mono_clock::now() - osd->startup_time;
348}
7c673cae 349
350void OSDService::identify_splits_and_merges(
351 OSDMapRef old_map,
352 OSDMapRef new_map,
353 spg_t pgid,
354 set<pair<spg_t,epoch_t>> *split_children,
355 set<pair<spg_t,epoch_t>> *merge_pgs)
7c673cae 356{
11fdf7f2 357 if (!old_map->have_pg_pool(pgid.pool())) {
7c673cae 358 return;
7c673cae 359 }
7c673cae 360 int old_pgnum = old_map->get_pg_num(pgid.pool());
361 auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
362 if (p == osd->pg_num_history.pg_nums.end()) {
363 return;
364 }
365 dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
366 << " to e" << new_map->get_epoch()
367 << " pg_nums " << p->second << dendl;
368 deque<spg_t> queue;
369 queue.push_back(pgid);
eafe8130 370 set<spg_t> did;
371 while (!queue.empty()) {
372 auto cur = queue.front();
373 queue.pop_front();
eafe8130 374 did.insert(cur);
375 unsigned pgnum = old_pgnum;
376 for (auto q = p->second.lower_bound(old_map->get_epoch());
377 q != p->second.end() &&
378 q->first <= new_map->get_epoch();
379 ++q) {
380 if (pgnum < q->second) {
381 // split?
382 if (cur.ps() < pgnum) {
383 set<spg_t> children;
384 if (cur.is_split(pgnum, q->second, &children)) {
385 dout(20) << __func__ << " " << cur << " e" << q->first
386 << " pg_num " << pgnum << " -> " << q->second
387 << " children " << children << dendl;
388 for (auto i : children) {
389 split_children->insert(make_pair(i, q->first));
390 if (!did.count(i))
391 queue.push_back(i);
392 }
393 }
394 } else if (cur.ps() < q->second) {
395 dout(20) << __func__ << " " << cur << " e" << q->first
396 << " pg_num " << pgnum << " -> " << q->second
397 << " is a child" << dendl;
398 // normally we'd capture this from the parent, but it's
399 // possible the parent doesn't exist yet (it will be
400 // fabricated to allow an intervening merge). note this PG
401 // as a split child here to be sure we catch it.
402 split_children->insert(make_pair(cur, q->first));
403 } else {
404 dout(20) << __func__ << " " << cur << " e" << q->first
405 << " pg_num " << pgnum << " -> " << q->second
406 << " is post-split, skipping" << dendl;
407 }
408 } else if (merge_pgs) {
409 // merge?
410 if (cur.ps() >= q->second) {
411 if (cur.ps() < pgnum) {
412 spg_t parent;
413 if (cur.is_merge_source(pgnum, q->second, &parent)) {
414 set<spg_t> children;
415 parent.is_split(q->second, pgnum, &children);
416 dout(20) << __func__ << " " << cur << " e" << q->first
417 << " pg_num " << pgnum << " -> " << q->second
418 << " is merge source, target " << parent
419 << ", source(s) " << children << dendl;
420 merge_pgs->insert(make_pair(parent, q->first));
421 if (!did.count(parent)) {
422 // queue (and re-scan) parent in case it might not exist yet
423 // and there are some future splits pending on it
424 queue.push_back(parent);
425 }
426 for (auto c : children) {
427 merge_pgs->insert(make_pair(c, q->first));
428 if (!did.count(c))
429 queue.push_back(c);
430 }
431 }
432 } else {
433 dout(20) << __func__ << " " << cur << " e" << q->first
434 << " pg_num " << pgnum << " -> " << q->second
435 << " is beyond old pgnum, skipping" << dendl;
436 }
437 } else {
438 set<spg_t> children;
439 if (cur.is_split(q->second, pgnum, &children)) {
440 dout(20) << __func__ << " " << cur << " e" << q->first
441 << " pg_num " << pgnum << " -> " << q->second
442 << " is merge target, source " << children << dendl;
443 for (auto c : children) {
444 merge_pgs->insert(make_pair(c, q->first));
445 if (!did.count(c))
446 queue.push_back(c);
447 }
448 merge_pgs->insert(make_pair(cur, q->first));
449 }
450 }
451 }
11fdf7f2 452 pgnum = q->second;
453 }
454 }
455}
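// Worked example (illustrative, power-of-two pg_num changes only):
//   pool 3 goes from pg_num 4 -> 8 at some epoch e in (old_map, new_map].
//   For pgid 3.2 (ps 2 < old pg_num 4), cur.is_split(4, 8, &children) yields
//   child 3.6 (2 + 4), so {3.6, e} is recorded in *split_children and 3.6 is
//   re-queued in case a later pg_num change affects it as well.
//   Going the other way, pg_num 8 -> 4 makes 3.6 a merge source:
//   is_merge_source(8, 4, &parent) gives parent 3.2, and both the target 3.2
//   and the source 3.6 are added to *merge_pgs so both sides of the merge get
//   prepared.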
456
457void OSDService::need_heartbeat_peer_update()
458{
459 osd->need_heartbeat_peer_update();
460}
461
462HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
463{
464 std::lock_guard l(hb_stamp_lock);
465 if (peer >= hb_stamps.size()) {
466 hb_stamps.resize(peer + 1);
467 }
468 if (!hb_stamps[peer]) {
469 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
470 }
471 return hb_stamps[peer];
472}
473
474void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
475{
476 osd->enqueue_peering_evt(
477 spgid,
478 PGPeeringEventRef(
479 std::make_shared<PGPeeringEvent>(
480 epoch, epoch,
481 RenewLease())));
482}
483
484void OSDService::start_shutdown()
485{
486 {
11fdf7f2 487 std::lock_guard l(agent_timer_lock);
488 agent_timer.shutdown();
489 }
490
491 {
492 std::lock_guard l(sleep_lock);
493 sleep_timer.shutdown();
31f18b77 494 }
495
496 {
497 std::lock_guard l(recovery_request_lock);
498 recovery_request_timer.shutdown();
499 }
500}
501
31f18b77 502void OSDService::shutdown_reserver()
503{
504 reserver_finisher.wait_for_empty();
505 reserver_finisher.stop();
506}
507
508void OSDService::shutdown()
509{
510 mono_timer.suspend();
511
7c673cae 512 {
11fdf7f2 513 std::lock_guard l(watch_lock);
514 watch_timer.shutdown();
515 }
516
517 objecter->shutdown();
9f95a23c 518 for (auto& f : objecter_finishers) {
519 f->wait_for_empty();
520 f->stop();
521 }
522
11fdf7f2 523 publish_map(OSDMapRef());
524 next_osdmap = OSDMapRef();
525}
526
527void OSDService::init()
528{
529 reserver_finisher.start();
9f95a23c 530 for (auto& f : objecter_finishers) {
531 f->start();
532 }
533 objecter->set_client_incarnation(0);
534
535 // deprioritize objecter in daemonperf output
536 objecter->get_logger()->set_prio_adjust(-3);
537
538 watch_timer.init();
539 agent_timer.init();
9f95a23c 540 mono_timer.resume();
541
542 agent_thread.create("osd_srv_agent");
543
544 if (cct->_conf->osd_recovery_delay_start)
545 defer_recovery(cct->_conf->osd_recovery_delay_start);
546}
547
548void OSDService::final_init()
549{
550 objecter->start(osdmap.get());
551}
552
553void OSDService::activate_map()
554{
555 // wake/unwake the tiering agent
9f95a23c 556 std::lock_guard l{agent_lock};
557 agent_active =
558 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
559 osd->is_active();
9f95a23c 560 agent_cond.notify_all();
561}
562
563void OSDService::request_osdmap_update(epoch_t e)
564{
565 osd->osdmap_subscribe(e, false);
566}
567
9f95a23c 568
569class AgentTimeoutCB : public Context {
570 PGRef pg;
571public:
572 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
573 void finish(int) override {
574 pg->agent_choose_mode_restart();
575 }
576};
577
578void OSDService::agent_entry()
579{
580 dout(10) << __func__ << " start" << dendl;
9f95a23c 581 std::unique_lock agent_locker{agent_lock};
582
583 while (!agent_stop_flag) {
584 if (agent_queue.empty()) {
585 dout(20) << __func__ << " empty queue" << dendl;
9f95a23c 586 agent_cond.wait(agent_locker);
587 continue;
588 }
589 uint64_t level = agent_queue.rbegin()->first;
590 set<PGRef>& top = agent_queue.rbegin()->second;
591 dout(10) << __func__
592 << " tiers " << agent_queue.size()
593 << ", top is " << level
594 << " with pgs " << top.size()
595 << ", ops " << agent_ops << "/"
596 << cct->_conf->osd_agent_max_ops
597 << (agent_active ? " active" : " NOT ACTIVE")
598 << dendl;
599 dout(20) << __func__ << " oids " << agent_oids << dendl;
600 int max = cct->_conf->osd_agent_max_ops - agent_ops;
601 int agent_flush_quota = max;
602 if (!flush_mode_high_count)
603 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
604 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
9f95a23c 605 agent_cond.wait(agent_locker);
606 continue;
607 }
608
609 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
610 agent_queue_pos = top.begin();
611 agent_valid_iterator = true;
612 }
613 PGRef pg = *agent_queue_pos;
614 dout(10) << "high_count " << flush_mode_high_count
615 << " agent_ops " << agent_ops
616 << " flush_quota " << agent_flush_quota << dendl;
9f95a23c 617 agent_locker.unlock();
7c673cae 618 if (!pg->agent_work(max, agent_flush_quota)) {
11fdf7f2 619 dout(10) << __func__ << " " << pg->pg_id
620 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
621 << " seconds" << dendl;
622
f67539c2 623 logger->inc(l_osd_tier_delay);
7c673cae 624 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
9f95a23c 625 std::lock_guard timer_locker{agent_timer_lock};
626 Context *cb = new AgentTimeoutCB(pg);
627 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
7c673cae 628 }
9f95a23c 629 agent_locker.lock();
7c673cae 630 }
631 dout(10) << __func__ << " finish" << dendl;
632}
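// Worked example of the flush quota logic above (values assumed, not defaults):
//   osd_agent_max_ops = 4, osd_agent_max_low_ops = 2, agent_ops = 1
//   max               = 4 - 1 = 3     // overall budget passed to agent_work()
//   agent_flush_quota = 3             // if some PG is in high-speed flush mode
//                     = 2 - 1 = 1     // otherwise (low-speed limit applies)
// When the quota reaches 0 (or the queue empties, or the agent is inactive),
// the thread simply waits on agent_cond until ops complete or the state changes.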
633
634void OSDService::agent_stop()
635{
636 {
11fdf7f2 637 std::lock_guard l(agent_lock);
638
639 // By this time all ops should be cancelled
11fdf7f2 640 ceph_assert(agent_ops == 0);
641 // By this time all PGs are shutdown and dequeued
642 if (!agent_queue.empty()) {
643 set<PGRef>& top = agent_queue.rbegin()->second;
644 derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
645 ceph_abort_msg("agent queue not empty");
646 }
647
648 agent_stop_flag = true;
9f95a23c 649 agent_cond.notify_all();
650 }
651 agent_thread.join();
652}
653
654// -------------------------------------
655
656void OSDService::promote_throttle_recalibrate()
657{
658 utime_t now = ceph_clock_now();
659 double dur = now - last_recalibrate;
660 last_recalibrate = now;
661 unsigned prob = promote_probability_millis;
662
663 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
664 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
665
666 unsigned min_prob = 1;
667
668 uint64_t attempts, obj, bytes;
669 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
670 dout(10) << __func__ << " " << attempts << " attempts, promoted "
1adf2230 671 << obj << " objects and " << byte_u_t(bytes) << "; target "
7c673cae 672 << target_obj_sec << " obj/sec or "
1adf2230 673 << byte_u_t(target_bytes_sec) << "/sec"
674 << dendl;
675
676 // calculate what the probability *should* be, given the targets
677 unsigned new_prob;
678 if (attempts && dur > 0) {
679 uint64_t avg_size = 1;
680 if (obj)
11fdf7f2 681 avg_size = std::max<uint64_t>(bytes / obj, 1);
682 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
683 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
684 / (double)attempts;
685 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
686 << avg_size << dendl;
687 if (target_obj_sec && target_bytes_sec)
11fdf7f2 688 new_prob = std::min(po, pb);
689 else if (target_obj_sec)
690 new_prob = po;
691 else if (target_bytes_sec)
692 new_prob = pb;
693 else
694 new_prob = 1000;
695 } else {
696 new_prob = 1000;
697 }
698 dout(20) << __func__ << " new_prob " << new_prob << dendl;
699
700 // correct for persistent skew between target rate and actual rate, adjust
701 double ratio = 1.0;
702 unsigned actual = 0;
703 if (attempts && obj) {
704 actual = obj * 1000 / attempts;
705 ratio = (double)actual / (double)prob;
706 new_prob = (double)new_prob / ratio;
707 }
708 new_prob = std::max(new_prob, min_prob);
709 new_prob = std::min(new_prob, 1000u);
710
711 // adjust
712 prob = (prob + new_prob) / 2;
713 prob = std::max(prob, min_prob);
714 prob = std::min(prob, 1000u);
715 dout(10) << __func__ << " actual " << actual
716 << ", actual/prob ratio " << ratio
717 << ", adjusted new_prob " << new_prob
718 << ", prob " << promote_probability_millis << " -> " << prob
719 << dendl;
720 promote_probability_millis = prob;
721
722 // set hard limits for this interval to mitigate stampedes
723 promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
724 promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
725}
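// Worked example of the recalibration above (numbers illustrative):
//   target_obj_sec = 25 obj/s, target_bytes_sec = 5 MiB/s, dur = 10 s,
//   attempts = 5000, promoted obj = 400, bytes = 400 MiB (avg_size = 1 MiB)
//   po       = 25 * 10 * 1000 / 5000              = 50   (per-mille)
//   pb       = (5 MiB / 1 MiB) * 10 * 1000 / 5000 = 10
//   new_prob = min(po, pb)                        = 10
//   actual   = 400 * 1000 / 5000 = 80; with prob = 100, ratio = 0.8,
//   so new_prob is skew-corrected to 10 / 0.8 = 12 and clamped to [1, 1000].
//   prob     = (100 + 12) / 2 = 56, i.e. roughly 5.6% of eligible reads will
//   promote until the next recalibration tick.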
726
727// -------------------------------------
728
729float OSDService::get_failsafe_full_ratio()
730{
731 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
732 if (full_ratio > 1.0) full_ratio /= 100.0;
733 return full_ratio;
734}
735
11fdf7f2 736OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
7c673cae 737{
738 // The OSDMap ratios take precedence. So if the failsafe is .95 and
739 // the admin sets the cluster full to .96, the failsafe moves up to .96
740 // too. (Not that having failsafe == full is ideal, but it's better than
741 // dropping writes before the cluster appears full.)
742 OSDMapRef osdmap = get_osdmap();
743 if (!osdmap || osdmap->get_epoch() == 0) {
11fdf7f2 744 return NONE;
745 }
746 float nearfull_ratio = osdmap->get_nearfull_ratio();
747 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
748 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
749 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
750
9f95a23c 751 if (osdmap->require_osd_release < ceph_release_t::luminous) {
752 // use the failsafe for nearfull and full; the mon isn't using the
753 // flags anyway because we're mid-upgrade.
754 full_ratio = failsafe_ratio;
755 backfillfull_ratio = failsafe_ratio;
756 nearfull_ratio = failsafe_ratio;
757 } else if (full_ratio <= 0 ||
758 backfillfull_ratio <= 0 ||
759 nearfull_ratio <= 0) {
760 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
761 // use failsafe flag. ick. the monitor did something wrong or the user
762 // did something stupid.
763 full_ratio = failsafe_ratio;
764 backfillfull_ratio = failsafe_ratio;
765 nearfull_ratio = failsafe_ratio;
766 }
767
7c673cae 768 if (injectfull_state > NONE && injectfull) {
7c673cae 769 inject = "(Injected)";
11fdf7f2
TL
770 return injectfull_state;
771 } else if (pratio > failsafe_ratio) {
772 return FAILSAFE;
7c673cae 773 } else if (ratio > full_ratio) {
11fdf7f2 774 return FULL;
7c673cae 775 } else if (ratio > backfillfull_ratio) {
11fdf7f2 776 return BACKFILLFULL;
92f5a8d4 777 } else if (pratio > nearfull_ratio) {
11fdf7f2 778 return NEARFULL;
7c673cae 779 }
780 return NONE;
781}
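// Worked example (ratios illustrative): with nearfull .85, backfillfull .90,
// full .95 from the OSDMap and osd_failsafe_full_ratio .97, the clamping above
// keeps nearfull <= backfillfull <= full <= failsafe.  For ratio = 0.92
// (usage adjusted for pending backfill) and pratio = 0.93 (raw physical usage):
//   pratio 0.93 <= failsafe 0.97   -> not FAILSAFE
//   ratio  0.92 <= full     0.95   -> not FULL
//   ratio  0.92 >  backfill 0.90   -> BACKFILLFULL is returned
// Note that FAILSAFE and NEARFULL are judged on the physical ratio, while
// FULL and BACKFILLFULL use the adjusted ratio.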
782
783void OSDService::check_full_status(float ratio, float pratio)
784{
785 std::lock_guard l(full_status_lock);
786
787 cur_ratio = ratio;
788 physical_ratio = pratio;
789
790 string inject;
791 s_names new_state;
792 new_state = recalc_full_state(ratio, pratio, inject);
793
7c673cae 794 dout(20) << __func__ << " cur ratio " << ratio
11fdf7f2 795 << ", physical ratio " << pratio
796 << ", new state " << get_full_state_name(new_state)
797 << " " << inject
798 << dendl;
799
800 // warn
801 if (cur_state != new_state) {
802 dout(10) << __func__ << " " << get_full_state_name(cur_state)
803 << " -> " << get_full_state_name(new_state) << dendl;
804 if (new_state == FAILSAFE) {
c07f9fc5 805 clog->error() << "full status failsafe engaged, dropping updates, now "
806 << (int)roundf(ratio * 100) << "% full";
807 } else if (cur_state == FAILSAFE) {
808 clog->error() << "full status failsafe disengaged, no longer dropping "
809 << "updates, now " << (int)roundf(ratio * 100) << "% full";
810 }
811 cur_state = new_state;
812 }
813}
814
815bool OSDService::need_fullness_update()
816{
817 OSDMapRef osdmap = get_osdmap();
818 s_names cur = NONE;
819 if (osdmap->exists(whoami)) {
820 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
821 cur = FULL;
822 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
823 cur = BACKFILLFULL;
824 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
825 cur = NEARFULL;
826 }
827 }
828 s_names want = NONE;
829 if (is_full())
830 want = FULL;
831 else if (is_backfillfull())
832 want = BACKFILLFULL;
833 else if (is_nearfull())
834 want = NEARFULL;
835 return want != cur;
836}
837
11fdf7f2 838bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
7c673cae 839{
840 if (injectfull && injectfull_state >= type) {
841 // injectfull is either a count of how many more times to report the injected
842 // full state, or -1 to report it indefinitely
843 if (injectfull > 0)
844 --injectfull;
845 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
846 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
847 << dendl;
848 return true;
849 }
850 return false;
851}
852
853bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
854{
855 std::lock_guard l(full_status_lock);
856
857 if (_check_inject_full(dpp, type))
858 return true;
859
860 if (cur_state >= type)
861 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
862 << " physical " << physical_ratio << dendl;
7c673cae 863
864 return cur_state >= type;
865}
866
867bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
868{
869 ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
870 {
871 std::lock_guard l(full_status_lock);
872 if (_check_inject_full(dpp, type)) {
873 return true;
874 }
875 }
876
877 float pratio;
878 float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
879
880 string notused;
881 s_names tentative_state = recalc_full_state(ratio, pratio, notused);
882
883 if (tentative_state >= type)
884 ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
885
886 return tentative_state >= type;
887}
888
889bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
890{
891 return _check_full(dpp, FAILSAFE);
892}
893
894bool OSDService::check_full(DoutPrefixProvider *dpp) const
7c673cae 895{
11fdf7f2 896 return _check_full(dpp, FULL);
897}
898
11fdf7f2 899bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
7c673cae 900{
11fdf7f2 901 return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
902}
903
11fdf7f2 904bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
7c673cae 905{
11fdf7f2 906 return _check_full(dpp, BACKFILLFULL);
907}
908
11fdf7f2 909bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
7c673cae 910{
11fdf7f2 911 return _check_full(dpp, NEARFULL);
912}
913
914bool OSDService::is_failsafe_full() const
915{
11fdf7f2 916 std::lock_guard l(full_status_lock);
917 return cur_state == FAILSAFE;
918}
919
920bool OSDService::is_full() const
921{
11fdf7f2 922 std::lock_guard l(full_status_lock);
923 return cur_state >= FULL;
924}
925
926bool OSDService::is_backfillfull() const
927{
11fdf7f2 928 std::lock_guard l(full_status_lock);
929 return cur_state >= BACKFILLFULL;
930}
931
932bool OSDService::is_nearfull() const
933{
11fdf7f2 934 std::lock_guard l(full_status_lock);
935 return cur_state >= NEARFULL;
936}
937
938void OSDService::set_injectfull(s_names type, int64_t count)
939{
11fdf7f2 940 std::lock_guard l(full_status_lock);
941 injectfull_state = type;
942 injectfull = count;
943}
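// Usage sketch (illustrative only): tests can force full handling without
// actually filling the disk, e.g.
//   service.set_injectfull(OSDService::FULL, 3);      // next 3 checks see FULL
//   service.set_injectfull(OSDService::FAILSAFE, -1); // FAILSAFE until cleared
// _check_inject_full() then returns true for the injected state and any less
// severe check, decrementing the count while it is positive; a count of -1
// never runs out.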
944
945void OSDService::set_statfs(const struct store_statfs_t &stbuf,
946 osd_alert_list_t& alerts)
7c673cae 947{
224ce89b 948 uint64_t bytes = stbuf.total;
224ce89b 949 uint64_t avail = stbuf.available;
950 uint64_t used = stbuf.get_used_raw();
951
952 // For testing, fake statfs values so it doesn't matter if all
953 // OSDs are using the same partition.
954 if (cct->_conf->fake_statfs_for_testing) {
955 uint64_t total_num_bytes = 0;
956 vector<PGRef> pgs;
957 osd->_get_pgs(&pgs);
958 for (auto p : pgs) {
959 total_num_bytes += p->get_stats_num_bytes();
960 }
961 bytes = cct->_conf->fake_statfs_for_testing;
962 if (total_num_bytes < bytes)
963 avail = bytes - total_num_bytes;
964 else
965 avail = 0;
966 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
967 << " adjust available " << avail
968 << dendl;
969 used = bytes - avail;
970 }
7c673cae 971
972 logger->set(l_osd_stat_bytes, bytes);
973 logger->set(l_osd_stat_bytes_used, used);
974 logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 975
976 std::lock_guard l(stat_lock);
977 osd_stat.statfs = stbuf;
978 osd_stat.os_alerts.clear();
979 osd_stat.os_alerts[whoami].swap(alerts);
980 if (cct->_conf->fake_statfs_for_testing) {
981 osd_stat.statfs.total = bytes;
982 osd_stat.statfs.available = avail;
983 // For testing don't want used to go negative, so clear reserved
984 osd_stat.statfs.internally_reserved = 0;
985 }
986}
7c673cae 987
988osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
989 int num_pgs)
224ce89b 990{
991 utime_t now = ceph_clock_now();
992 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
993 std::lock_guard l(stat_lock);
994 osd_stat.hb_peers.swap(hb_peers);
995 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
996 osd_stat.num_pgs = num_pgs;
997 // Clean entries that aren't updated
998 // This is called often enough that we can just remove 1 at a time
999 for (auto i: osd_stat.hb_pingtime) {
1000 if (i.second.last_update == 0)
1001 continue;
1002 if (stale_time && now.sec() - i.second.last_update > stale_time) {
1003 dout(20) << __func__ << " time out heartbeat for osd " << i.first
1004 << " last_update " << i.second.last_update << dendl;
1005 osd_stat.hb_pingtime.erase(i.first);
1006 break;
1007 }
1008 }
1009 return osd_stat;
1010}
1011
1012void OSDService::inc_osd_stat_repaired()
1013{
1014 std::lock_guard l(stat_lock);
1015 osd_stat.num_shards_repaired++;
1016 return;
1017}
1018
1019float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
1020 uint64_t adjust_used)
1021{
1022 *pratio =
b3b6e05e 1023 ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1024
1025 if (adjust_used) {
1026 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1027 if (new_stat.statfs.available > adjust_used)
1028 new_stat.statfs.available -= adjust_used;
1029 else
1030 new_stat.statfs.available = 0;
1031 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
1032 }
1033
1034 // Check all pgs and adjust kb_used to include all pending backfill data
1035 int backfill_adjusted = 0;
1036 vector<PGRef> pgs;
1037 osd->_get_pgs(&pgs);
1038 for (auto p : pgs) {
1039 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1040 }
1041 if (backfill_adjusted) {
1042 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1043 }
b3b6e05e 1044 return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1045}
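// Worked example (illustrative; treating used_raw as roughly total - available):
//   total = 1000 GiB, available = 300 GiB   -> *pratio = 700 / 1000 = 0.70
//   adjust_used = 100 GiB                   -> available drops to 200 GiB
//   plus whatever pg_stat_adjust() adds for pending backfill, so the returned
//   ratio becomes ~800 / 1000 = 0.80, which is what the BACKFILLFULL/FULL
//   checks in recalc_full_state() will compare against.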
1046
1047void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1048{
1049 OSDMapRef next_map = get_nextmap_reserved();
1050 // service map is always newer/newest
11fdf7f2 1051 ceph_assert(from_epoch <= next_map->get_epoch());
1052
1053 if (next_map->is_down(peer) ||
1054 next_map->get_info(peer).up_from > from_epoch) {
1055 m->put();
1056 release_map(next_map);
1057 return;
1058 }
1059 ConnectionRef peer_con;
1060 if (peer == whoami) {
1061 peer_con = osd->cluster_messenger->get_loopback_connection();
1062 } else {
1063 peer_con = osd->cluster_messenger->connect_to_osd(
1064 next_map->get_cluster_addrs(peer), false, true);
1065 }
1066 maybe_share_map(peer_con.get(), next_map);
1067 peer_con->send_message(m);
1068 release_map(next_map);
1069}
1070
1071void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1072{
1073 OSDMapRef next_map = get_nextmap_reserved();
1074 // service map is always newer/newest
1075 ceph_assert(from_epoch <= next_map->get_epoch());
1076
1077 for (auto& iter : messages) {
1078 if (next_map->is_down(iter.first) ||
1079 next_map->get_info(iter.first).up_from > from_epoch) {
1080 iter.second->put();
1081 continue;
1082 }
1083 ConnectionRef peer_con;
1084 if (iter.first == whoami) {
1085 peer_con = osd->cluster_messenger->get_loopback_connection();
1086 } else {
1087 peer_con = osd->cluster_messenger->connect_to_osd(
1088 next_map->get_cluster_addrs(iter.first), false, true);
1089 }
1090 maybe_share_map(peer_con.get(), next_map);
1091 peer_con->send_message(iter.second);
1092 }
1093 release_map(next_map);
1094}
1095ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1096{
1097 OSDMapRef next_map = get_nextmap_reserved();
1098 // service map is always newer/newest
11fdf7f2 1099 ceph_assert(from_epoch <= next_map->get_epoch());
1100
1101 if (next_map->is_down(peer) ||
1102 next_map->get_info(peer).up_from > from_epoch) {
1103 release_map(next_map);
1104 return NULL;
1105 }
1106 ConnectionRef con;
1107 if (peer == whoami) {
1108 con = osd->cluster_messenger->get_loopback_connection();
1109 } else {
1110 con = osd->cluster_messenger->connect_to_osd(
1111 next_map->get_cluster_addrs(peer), false, true);
1112 }
1113 release_map(next_map);
1114 return con;
1115}
1116
1117pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1118{
1119 OSDMapRef next_map = get_nextmap_reserved();
1120 // service map is always newer/newest
11fdf7f2 1121 ceph_assert(from_epoch <= next_map->get_epoch());
1122
1123 pair<ConnectionRef,ConnectionRef> ret;
1124 if (next_map->is_down(peer) ||
1125 next_map->get_info(peer).up_from > from_epoch) {
1126 release_map(next_map);
1127 return ret;
1128 }
1129 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1130 next_map->get_hb_back_addrs(peer));
1131 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1132 next_map->get_hb_front_addrs(peer));
1133 release_map(next_map);
1134 return ret;
1135}
1136
1137entity_name_t OSDService::get_cluster_msgr_name() const
1138{
1139 return cluster_messenger->get_myname();
1140}
7c673cae 1141
1142void OSDService::queue_want_pg_temp(pg_t pgid,
1143 const vector<int>& want,
1144 bool forced)
7c673cae 1145{
11fdf7f2 1146 std::lock_guard l(pg_temp_lock);
94b18763 1147 auto p = pg_temp_pending.find(pgid);
7c673cae 1148 if (p == pg_temp_pending.end() ||
1149 p->second.acting != want ||
1150 forced) {
11fdf7f2 1151 pg_temp_wanted[pgid] = {want, forced};
1152 }
1153}
1154
1155void OSDService::remove_want_pg_temp(pg_t pgid)
1156{
11fdf7f2 1157 std::lock_guard l(pg_temp_lock);
1158 pg_temp_wanted.erase(pgid);
1159 pg_temp_pending.erase(pgid);
1160}
1161
1162void OSDService::_sent_pg_temp()
1163{
1164#ifdef HAVE_STDLIB_MAP_SPLICING
1165 pg_temp_pending.merge(pg_temp_wanted);
1166#else
1167 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1168 make_move_iterator(end(pg_temp_wanted)));
11fdf7f2 1169#endif
1170 pg_temp_wanted.clear();
1171}
1172
1173void OSDService::requeue_pg_temp()
1174{
11fdf7f2 1175 std::lock_guard l(pg_temp_lock);
1176 // wanted overrides pending. note that remove_want_pg_temp
1177 // clears the item out of both.
1178 unsigned old_wanted = pg_temp_wanted.size();
1179 unsigned old_pending = pg_temp_pending.size();
1180 _sent_pg_temp();
1181 pg_temp_wanted.swap(pg_temp_pending);
1182 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1183 << pg_temp_wanted.size() << dendl;
1184}
1185
1186std::ostream& operator<<(std::ostream& out,
1187 const OSDService::pg_temp_t& pg_temp)
1188{
1189 out << pg_temp.acting;
1190 if (pg_temp.forced) {
1191 out << " (forced)";
1192 }
1193 return out;
1194}
1195
1196void OSDService::send_pg_temp()
1197{
11fdf7f2 1198 std::lock_guard l(pg_temp_lock);
1199 if (pg_temp_wanted.empty())
1200 return;
1201 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763 1202 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1203 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1204 auto& m = ms[pg_temp.forced];
1205 if (!m) {
1206 m = new MOSDPGTemp(osdmap->get_epoch());
11fdf7f2 1207 m->forced = pg_temp.forced;
94b18763 1208 }
11fdf7f2 1209 m->pg_temp.emplace(pgid, pg_temp.acting);
1210 }
1211 for (auto m : ms) {
1212 if (m) {
1213 monc->send_mon_message(m);
1214 }
1215 }
1216 _sent_pg_temp();
1217}
1218
1219void OSDService::send_pg_created(pg_t pgid)
1220{
11fdf7f2 1221 std::lock_guard l(pg_created_lock);
7c673cae 1222 dout(20) << __func__ << dendl;
11fdf7f2 1223 auto o = get_osdmap();
9f95a23c 1224 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2 1225 pg_created.insert(pgid);
1226 monc->send_mon_message(new MOSDPGCreated(pgid));
1227 }
1228}
1229
1230void OSDService::send_pg_created()
1231{
1232 std::lock_guard l(pg_created_lock);
1233 dout(20) << __func__ << dendl;
1234 auto o = get_osdmap();
9f95a23c 1235 if (o->require_osd_release >= ceph_release_t::luminous) {
1236 for (auto pgid : pg_created) {
1237 monc->send_mon_message(new MOSDPGCreated(pgid));
1238 }
1239 }
1240}
1241
1242void OSDService::prune_pg_created()
1243{
1244 std::lock_guard l(pg_created_lock);
1245 dout(20) << __func__ << dendl;
1246 auto o = get_osdmap();
1247 auto i = pg_created.begin();
1248 while (i != pg_created.end()) {
1249 auto p = o->get_pg_pool(i->pool());
1250 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1251 dout(20) << __func__ << " pruning " << *i << dendl;
1252 i = pg_created.erase(i);
1253 } else {
1254 dout(20) << __func__ << " keeping " << *i << dendl;
1255 ++i;
1256 }
1257 }
1258}
1259
1260
1261// --------------------------------------
1262// dispatch
1263
1264void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1265 epoch_t *_bind_epoch) const
1266{
11fdf7f2 1267 std::lock_guard l(epoch_lock);
1268 if (_boot_epoch)
1269 *_boot_epoch = boot_epoch;
1270 if (_up_epoch)
1271 *_up_epoch = up_epoch;
1272 if (_bind_epoch)
1273 *_bind_epoch = bind_epoch;
1274}
1275
1276void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1277 const epoch_t *_bind_epoch)
1278{
11fdf7f2 1279 std::lock_guard l(epoch_lock);
7c673cae 1280 if (_boot_epoch) {
11fdf7f2 1281 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1282 boot_epoch = *_boot_epoch;
1283 }
1284 if (_up_epoch) {
11fdf7f2 1285 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1286 up_epoch = *_up_epoch;
1287 }
1288 if (_bind_epoch) {
11fdf7f2 1289 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1290 bind_epoch = *_bind_epoch;
1291 }
1292}
1293
1294bool OSDService::prepare_to_stop()
1295{
9f95a23c 1296 std::unique_lock l(is_stopping_lock);
1297 if (get_state() != NOT_STOPPING)
1298 return false;
1299
1300 OSDMapRef osdmap = get_osdmap();
1301 if (osdmap && osdmap->is_up(whoami)) {
1d09f67e 1302 dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
7c673cae 1303 set_state(PREPARING_TO_STOP);
1304 monc->send_mon_message(
1305 new MOSDMarkMeDown(
1306 monc->get_fsid(),
1307 whoami,
1308 osdmap->get_addrs(whoami),
1309 osdmap->get_epoch(),
1310 true, // request ack
1311 true // mark as down and dead
11fdf7f2 1312 ));
1313 const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
1314 is_stopping_cond.wait_for(l, timeout,
1315 [this] { return get_state() == STOPPING; });
7c673cae 1316 }
1d09f67e 1317
1318 dout(0) << __func__ << " starting shutdown" << dendl;
1319 set_state(STOPPING);
1320 return true;
1321}
1322
1323void OSDService::got_stop_ack()
1324{
9f95a23c 1325 std::scoped_lock l(is_stopping_lock);
1326 if (get_state() == PREPARING_TO_STOP) {
1327 dout(0) << __func__ << " starting shutdown" << dendl;
1328 set_state(STOPPING);
9f95a23c 1329 is_stopping_cond.notify_all();
7c673cae
FG
1330 } else {
1331 dout(10) << __func__ << " ignoring msg" << dendl;
1332 }
1333}
1334
1335MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1336 OSDSuperblock& sblock)
1337{
1338 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1339 osdmap->get_encoding_features());
1340 m->oldest_map = max_oldest_map;
1341 m->newest_map = sblock.newest_map;
1342
1343 int max = cct->_conf->osd_map_message_max;
1344 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1345
1346 if (since < m->oldest_map) {
1347 // we don't have the next map the target wants, so start with a
1348 // full map.
1349 bufferlist bl;
1350 dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
1351 << since << ", starting with full map" << dendl;
1352 since = m->oldest_map;
1353 if (!get_map_bl(since, bl)) {
1354 derr << __func__ << " missing full map " << since << dendl;
1355 goto panic;
1356 }
1357 max--;
1358 max_bytes -= bl.length();
f67539c2 1359 m->maps[since] = std::move(bl);
1360 }
1361 for (epoch_t e = since + 1; e <= to; ++e) {
7c673cae 1362 bufferlist bl;
11fdf7f2 1363 if (get_inc_map_bl(e, bl)) {
f67539c2 1364 m->incremental_maps[e] = std::move(bl);
11fdf7f2 1365 } else {
e306af50 1366 dout(10) << __func__ << " missing incremental map " << e << dendl;
1367 if (!get_map_bl(e, bl)) {
1368 derr << __func__ << " also missing full map " << e << dendl;
1369 goto panic;
1370 }
f67539c2 1371 m->maps[e] = std::move(bl);
1372 }
1373 max--;
1374 max_bytes -= bl.length();
1375 if (max <= 0 || max_bytes <= 0) {
7c673cae 1376 break;
1377 }
1378 }
1379 return m;
1380
1381 panic:
1382 if (!m->maps.empty() ||
1383 !m->incremental_maps.empty()) {
1384 // send what we have so far
1385 return m;
1386 }
1387 // send something
1388 bufferlist bl;
1389 if (get_inc_map_bl(m->newest_map, bl)) {
f67539c2 1390 m->incremental_maps[m->newest_map] = std::move(bl);
1391 } else {
1392 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1393 if (!get_map_bl(m->newest_map, bl)) {
1394 derr << __func__ << " unable to load latest full map " << m->newest_map
7c673cae 1395 << dendl;
11fdf7f2 1396 ceph_abort();
7c673cae 1397 }
f67539c2 1398 m->maps[m->newest_map] = std::move(bl);
1399 }
1400 return m;
1401}
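// Illustrative sizing example: with osd_map_message_max = 40, a peer at
// since = e100 and us at to = e200, the loop above packs incrementals
// e101..e140 into one MOSDMap (or fewer if osd_map_message_max_bytes runs out
// first); the peer asks again once it has applied those.  If 'since' has
// already been trimmed (since < oldest_map), the message starts with the
// oldest full map still on disk instead.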
1402
1403void OSDService::send_map(MOSDMap *m, Connection *con)
1404{
1405 con->send_message(m);
1406}
1407
1408void OSDService::send_incremental_map(epoch_t since, Connection *con,
9f95a23c 1409 const OSDMapRef& osdmap)
1410{
1411 epoch_t to = osdmap->get_epoch();
1412 dout(10) << "send_incremental_map " << since << " -> " << to
1413 << " to " << con << " " << con->get_peer_addr() << dendl;
1414
1415 MOSDMap *m = NULL;
1416 while (!m) {
1417 OSDSuperblock sblock(get_superblock());
1418 if (since < sblock.oldest_map) {
1419 // just send latest full map
1420 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1421 osdmap->get_encoding_features());
1422 m->oldest_map = max_oldest_map;
1423 m->newest_map = sblock.newest_map;
1424 get_map_bl(to, m->maps[to]);
1425 send_map(m, con);
1426 return;
1427 }
1428
1429 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1430 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1431 << ", only sending most recent" << dendl;
1432 since = to - cct->_conf->osd_map_share_max_epochs;
1433 }
1434
1435 m = build_incremental_map_msg(since, to, sblock);
1436 }
1437 send_map(m, con);
1438}
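// Illustrative example of the clamp above: a peer at e10 while we are at e500
// with osd_map_share_max_epochs = 40 has 'since' bumped to e460, so only the
// newest 40 epochs are shared in this exchange rather than 490 incrementals
// at once.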
1439
1440bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1441{
1442 bool found = map_bl_cache.lookup(e, &bl);
31f18b77 1443 if (found) {
f67539c2 1444 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1445 return true;
31f18b77 1446 }
f67539c2 1447 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1448 found = store->read(meta_ch,
1449 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1450 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1451 if (found) {
7c673cae 1452 _add_map_bl(e, bl);
31f18b77 1453 }
1454 return found;
1455}
1456
1457bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1458{
11fdf7f2 1459 std::lock_guard l(map_cache_lock);
7c673cae 1460 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77 1461 if (found) {
f67539c2 1462 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1463 return true;
31f18b77 1464 }
f67539c2 1465 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1466 found = store->read(meta_ch,
1467 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1468 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1469 if (found) {
7c673cae 1470 _add_map_inc_bl(e, bl);
31f18b77 1471 }
1472 return found;
1473}
1474
1475void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1476{
1477 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1478 // cache a contiguous buffer
1479 if (bl.get_num_buffers() > 1) {
1480 bl.rebuild();
1481 }
1482 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1483 map_bl_cache.add(e, bl);
1484}
1485
1486void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1487{
1488 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1489 // cache a contiguous buffer
1490 if (bl.get_num_buffers() > 1) {
1491 bl.rebuild();
1492 }
1493 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1494 map_bl_inc_cache.add(e, bl);
1495}
1496
1497OSDMapRef OSDService::_add_map(OSDMap *o)
1498{
1499 epoch_t e = o->get_epoch();
1500
1501 if (cct->_conf->osd_map_dedup) {
1502 // Dedup against an existing map at a nearby epoch
1503 OSDMapRef for_dedup = map_cache.lower_bound(e);
1504 if (for_dedup) {
1505 OSDMap::dedup(for_dedup.get(), o);
1506 }
1507 }
1508 bool existed;
1509 OSDMapRef l = map_cache.add(e, o, &existed);
1510 if (existed) {
1511 delete o;
1512 }
1513 return l;
1514}
1515
1516OSDMapRef OSDService::try_get_map(epoch_t epoch)
1517{
11fdf7f2 1518 std::lock_guard l(map_cache_lock);
1519 OSDMapRef retval = map_cache.lookup(epoch);
1520 if (retval) {
1521 dout(30) << "get_map " << epoch << " -cached" << dendl;
f67539c2 1522 logger->inc(l_osd_map_cache_hit);
1523 return retval;
1524 }
f67539c2 1525 {
1526 logger->inc(l_osd_map_cache_miss);
1527 epoch_t lb = map_cache.cached_key_lower_bound();
1528 if (epoch < lb) {
1529 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1530 logger->inc(l_osd_map_cache_miss_low);
1531 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1532 }
1533 }
1534
1535 OSDMap *map = new OSDMap;
1536 if (epoch > 0) {
1537 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1538 bufferlist bl;
1539 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1540 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1541 delete map;
1542 return OSDMapRef();
1543 }
1544 map->decode(bl);
1545 } else {
1546 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1547 }
1548 return _add_map(map);
1549}
1550
1551// ops
1552
1553
1554void OSDService::reply_op_error(OpRequestRef op, int err)
1555{
9f95a23c 1556 reply_op_error(op, err, eversion_t(), 0, {});
1557}
1558
1559void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1560 version_t uv,
1561 vector<pg_log_op_return_item_t> op_returns)
7c673cae 1562{
9f95a23c 1563 auto m = op->get_req<MOSDOp>();
11fdf7f2 1564 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1565 int flags;
1566 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1567
1568 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1569 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
7c673cae 1570 reply->set_reply_versions(v, uv);
9f95a23c 1571 reply->set_op_returns(op_returns);
1572 m->get_connection()->send_message(reply);
1573}
1574
1575void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1576{
1577 if (!cct->_conf->osd_debug_misdirected_ops) {
1578 return;
1579 }
1580
9f95a23c 1581 auto m = op->get_req<MOSDOp>();
11fdf7f2 1582 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae 1583
11fdf7f2 1584 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
1585
1586 if (pg->is_ec_pg()) {
1587 /**
1588 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1589 * can get this result:
1590 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1591 * [CRUSH_ITEM_NONE, 2, 3]/3
1592 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1593 * [3, 2, 3]/3
1594 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1595 * -- misdirected op
1596 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1597 * it and fulfils it
1598 *
1599 * We can't compute the op target based on the sending map epoch due to
1600 * splitting. The simplest thing is to detect such cases here and drop
1601 * them without an error (the client will resend anyway).
1602 */
11fdf7f2 1603 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
1604 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1605 if (!opmap) {
1606 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1607 << m->get_map_epoch() << ", dropping" << dendl;
1608 return;
1609 }
1610 pg_t _pgid = m->get_raw_pg();
1611 spg_t pgid;
1612 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1613 _pgid = opmap->raw_pg_to_pg(_pgid);
1614 if (opmap->get_primary_shard(_pgid, &pgid) &&
11fdf7f2 1615 pgid.shard != pg->pg_id.shard) {
1616 dout(7) << __func__ << ": " << *pg << " primary changed since "
1617 << m->get_map_epoch() << ", dropping" << dendl;
1618 return;
1619 }
1620 }
1621
1622 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1623 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1624 << " pg " << m->get_raw_pg()
1625 << " to osd." << whoami
11fdf7f2 1626 << " not " << pg->get_acting()
7c673cae 1627 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1628}
1629
9f95a23c 1630void OSDService::enqueue_back(OpSchedulerItem&& qi)
7c673cae 1631{
11fdf7f2 1632 osd->op_shardedwq.queue(std::move(qi));
1633}
1634
9f95a23c 1635void OSDService::enqueue_front(OpSchedulerItem&& qi)
7c673cae 1636{
11fdf7f2 1637 osd->op_shardedwq.queue_front(std::move(qi));
1638}
1639
1640void OSDService::queue_recovery_context(
1641 PG *pg,
1642 GenContext<ThreadPool::TPHandle&> *c)
7c673cae 1643{
1644 epoch_t e = get_osdmap_epoch();
1645 enqueue_back(
1646 OpSchedulerItem(
1647 unique_ptr<OpSchedulerItem::OpQueueable>(
1648 new PGRecoveryContext(pg->get_pgid(), c, e)),
1649 cct->_conf->osd_recovery_cost,
1650 cct->_conf->osd_recovery_priority,
1651 ceph_clock_now(),
1652 0,
1653 e));
1654}
1655
1656void OSDService::queue_for_snap_trim(PG *pg)
1657{
1658 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2 1659 enqueue_back(
1660 OpSchedulerItem(
1661 unique_ptr<OpSchedulerItem::OpQueueable>(
1662 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1663 cct->_conf->osd_snap_trim_cost,
1664 cct->_conf->osd_snap_trim_priority,
1665 ceph_clock_now(),
1666 0,
1667 pg->get_osdmap_epoch()));
1668}
1669
1670template <class MSG_TYPE>
1671void OSDService::queue_scrub_event_msg(PG* pg,
1672 Scrub::scrub_prio_t with_priority,
1673 unsigned int qu_priority,
1674 Scrub::act_token_t act_token)
11fdf7f2 1675{
11fdf7f2 1676 const auto epoch = pg->get_osdmap_epoch();
1677 auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
1678 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
1679 << ". Epoch: " << epoch << " token: " << act_token << dendl;
1680
1681 enqueue_back(OpSchedulerItem(
1682 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1683 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1684}
1685
1686template <class MSG_TYPE>
20effc67
TL
1687void OSDService::queue_scrub_event_msg(PG* pg,
1688 Scrub::scrub_prio_t with_priority)
f67539c2
TL
1689{
1690 const auto epoch = pg->get_osdmap_epoch();
1691 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1692 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1693
1694 enqueue_back(OpSchedulerItem(
1695 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1696 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1697}
1698
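// The one-line wrappers below all follow the same pattern: each names the
// scrubber FSM event it produces and delegates to queue_scrub_event_msg<>
// with the matching PGScrub* message type, so the event reaches the PG
// through the ordinary op queue at scrub cost and the requested priority.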
1699void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1700{
1701 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1702}
1703
1704void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1705{
1706 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1707}
1708
1709void OSDService::queue_for_rep_scrub(PG* pg,
1710 Scrub::scrub_prio_t with_priority,
20effc67
TL
1711 unsigned int qu_priority,
1712 Scrub::act_token_t act_token)
f67539c2 1713{
20effc67 1714 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
f67539c2
TL
1715}
1716
1717void OSDService::queue_for_rep_scrub_resched(PG* pg,
1718 Scrub::scrub_prio_t with_priority,
20effc67
TL
1719 unsigned int qu_priority,
1720 Scrub::act_token_t act_token)
f67539c2
TL
1721{
1722 // Resulting scrub event: 'SchedReplica'
20effc67
TL
1723 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1724 act_token);
f67539c2
TL
1725}
1726
1727void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1728{
1729 // Resulting scrub event: 'RemotesReserved'
1730 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1731}
1732
1733void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1734{
1735 // Resulting scrub event: 'ReservationFailure'
1736 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1737}
1738
1739void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1740{
1741 // Resulting scrub event: 'InternalSchedScrub'
1742 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1743}
1744
1745void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1746{
1747 // Resulting scrub event: 'ActivePushesUpd'
1748 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1749}
1750
20effc67
TL
1751void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1752{
1753 // Resulting scrub event: 'SelectedChunkFree'
1754 queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1755}
1756
1757void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
1758{
1759 // Resulting scrub event: 'ChunkIsBusy'
1760 queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1761}
1762
f67539c2
TL
1763void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1764{
1765 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1766}
1767
1768void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1769{
1770 // Resulting scrub event: 'Unblocked'
1771 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1772}
1773
1774void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1775{
1776 // Resulting scrub event: 'DigestUpdate'
1777 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1778}
1779
20effc67
TL
1780void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
1781{
1782 // Resulting scrub event: 'IntLocalMapDone'
1783 queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
1784}
1785
f67539c2
TL
1786void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1787{
1788 // Resulting scrub event: 'GotReplicas'
1789 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1790}
1791
20effc67
TL
1792void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
1793{
1794 // Resulting scrub event: 'MapsCompared'
1795 queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
1796}
1797
f67539c2
TL
1798void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1799{
1800 // Resulting scrub event: 'ReplicaPushesUpd'
1801 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
11fdf7f2
TL
1802}
1803
20effc67
TL
1804void OSDService::queue_scrub_is_finished(PG *pg)
1805{
1806 // Resulting scrub event: 'ScrubFinished'
1807 queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1808}
1809
1810void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
1811{
1812 // Resulting scrub event: 'NextChunk'
1813 queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1814}
1815
11fdf7f2
TL
1816void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1817{
1818 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1819 enqueue_back(
9f95a23c
TL
1820 OpSchedulerItem(
1821 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1822 new PGDelete(pgid, e)),
1823 cct->_conf->osd_pg_delete_cost,
1824 cct->_conf->osd_pg_delete_priority,
1825 ceph_clock_now(),
1826 0,
1827 e));
1828}
1829
1830bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1831{
1832 return osd->try_finish_pg_delete(pg, old_pg_num);
1833}
1834
1835// ---
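// PG merge readiness tracking.  Source and target PGs report in (or report
// that they are not ready) as they reach the required state, and every update
// calls _send_ready_to_merge(), which sends the monitor a not-ready
// notification for any blocked merge and a positive ready-to-merge once both
// the source and its target are ready.  The sent_ready_to_merge_source set
// de-duplicates messages per source pgid.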
1836
1837void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1838{
1839 std::lock_guard l(merge_lock);
1840 dout(10) << __func__ << " " << pg->pg_id << dendl;
1841 ready_to_merge_source[pg->pg_id.pgid] = version;
1842 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1843 _send_ready_to_merge();
1844}
1845
1846void OSDService::set_ready_to_merge_target(PG *pg,
1847 eversion_t version,
1848 epoch_t last_epoch_started,
1849 epoch_t last_epoch_clean)
1850{
1851 std::lock_guard l(merge_lock);
1852 dout(10) << __func__ << " " << pg->pg_id << dendl;
1853 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1854 make_tuple(version,
1855 last_epoch_started,
1856 last_epoch_clean)));
1857 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1858 _send_ready_to_merge();
1859}
1860
1861void OSDService::set_not_ready_to_merge_source(pg_t source)
1862{
1863 std::lock_guard l(merge_lock);
1864 dout(10) << __func__ << " " << source << dendl;
1865 not_ready_to_merge_source.insert(source);
1866 assert(ready_to_merge_source.count(source) == 0);
1867 _send_ready_to_merge();
1868}
1869
1870void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1871{
1872 std::lock_guard l(merge_lock);
1873 dout(10) << __func__ << " " << target << " source " << source << dendl;
1874 not_ready_to_merge_target[target] = source;
1875 assert(ready_to_merge_target.count(target) == 0);
1876 _send_ready_to_merge();
1877}
1878
1879void OSDService::send_ready_to_merge()
1880{
1881 std::lock_guard l(merge_lock);
1882 _send_ready_to_merge();
1883}
1884
1885void OSDService::_send_ready_to_merge()
1886{
1887 dout(20) << __func__
1888 << " ready_to_merge_source " << ready_to_merge_source
1889 << " not_ready_to_merge_source " << not_ready_to_merge_source
1890 << " ready_to_merge_target " << ready_to_merge_target
1891 << " not_ready_to_merge_target " << not_ready_to_merge_target
1892 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1893 << dendl;
1894 for (auto src : not_ready_to_merge_source) {
1895 if (sent_ready_to_merge_source.count(src) == 0) {
1896 monc->send_mon_message(new MOSDPGReadyToMerge(
1897 src,
1898 {}, {}, 0, 0,
1899 false,
1900 osdmap->get_epoch()));
1901 sent_ready_to_merge_source.insert(src);
1902 }
1903 }
1904 for (auto p : not_ready_to_merge_target) {
1905 if (sent_ready_to_merge_source.count(p.second) == 0) {
1906 monc->send_mon_message(new MOSDPGReadyToMerge(
1907 p.second,
1908 {}, {}, 0, 0,
1909 false,
1910 osdmap->get_epoch()));
1911 sent_ready_to_merge_source.insert(p.second);
1912 }
1913 }
1914 for (auto src : ready_to_merge_source) {
1915 if (not_ready_to_merge_source.count(src.first) ||
1916 not_ready_to_merge_target.count(src.first.get_parent())) {
1917 continue;
1918 }
1919 auto p = ready_to_merge_target.find(src.first.get_parent());
1920 if (p != ready_to_merge_target.end() &&
1921 sent_ready_to_merge_source.count(src.first) == 0) {
1922 monc->send_mon_message(new MOSDPGReadyToMerge(
1923 src.first, // source pgid
1924 src.second, // src version
1925 std::get<0>(p->second), // target version
1926 std::get<1>(p->second), // PG's last_epoch_started
1927 std::get<2>(p->second), // PG's last_epoch_clean
1928 true,
1929 osdmap->get_epoch()));
1930 sent_ready_to_merge_source.insert(src.first);
1931 }
1932 }
1933}
1934
1935void OSDService::clear_ready_to_merge(PG *pg)
1936{
1937 std::lock_guard l(merge_lock);
1938 dout(10) << __func__ << " " << pg->pg_id << dendl;
1939 ready_to_merge_source.erase(pg->pg_id.pgid);
1940 ready_to_merge_target.erase(pg->pg_id.pgid);
1941 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1942 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1943 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1944}
1945
1946void OSDService::clear_sent_ready_to_merge()
1947{
1948 std::lock_guard l(merge_lock);
1949 sent_ready_to_merge_source.clear();
1950}
1951
9f95a23c 1952void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
1953{
1954 std::lock_guard l(merge_lock);
1955 auto i = sent_ready_to_merge_source.begin();
1956 while (i != sent_ready_to_merge_source.end()) {
1957 if (!osdmap->pg_exists(*i)) {
1958 dout(10) << __func__ << " " << *i << dendl;
1959 i = sent_ready_to_merge_source.erase(i);
1960 } else {
1961 ++i;
1962 }
1963 }
7c673cae
FG
1964}
1965
11fdf7f2
TL
1966// ---
1967
1968void OSDService::_queue_for_recovery(
1969 std::pair<epoch_t, PGRef> p,
1970 uint64_t reserved_pushes)
1971{
9f95a23c 1972 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
11fdf7f2 1973 enqueue_back(
9f95a23c
TL
1974 OpSchedulerItem(
1975 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1976 new PGRecovery(
1977 p.second->get_pgid(), p.first, reserved_pushes)),
1978 cct->_conf->osd_recovery_cost,
1979 cct->_conf->osd_recovery_priority,
1980 ceph_clock_now(),
1981 0,
1982 p.first));
1983}
7c673cae
FG
1984
1985// ====================================================================
1986// OSD
1987
1988#undef dout_prefix
1989#define dout_prefix *_dout
1990
1991// Commands shared between OSD's console and admin console:
f67539c2 1992namespace ceph::osd_cmds {
7c673cae 1993
11fdf7f2 1994int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
f67539c2
TL
1995
1996} // namespace ceph::osd_cmds
7c673cae 1997
20effc67
TL
1998int OSD::mkfs(CephContext *cct,
1999 std::unique_ptr<ObjectStore> store,
2000 uuid_d fsid,
2001 int whoami,
2002 string osdspec_affinity)
7c673cae
FG
2003{
2004 int ret;
2005
7c673cae
FG
2006 OSDSuperblock sb;
2007 bufferlist sbbl;
7c673cae
FG
2008 // if we are fed a uuid for this osd, use it.
2009 store->set_fsid(cct->_conf->osd_uuid);
2010
2011 ret = store->mkfs();
2012 if (ret) {
224ce89b
WB
2013 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2014 << cpp_strerror(ret) << dendl;
20effc67 2015 return ret;
7c673cae
FG
2016 }
2017
31f18b77 2018 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
2019
2020 ret = store->mount();
2021 if (ret) {
224ce89b
WB
2022 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2023 << cpp_strerror(ret) << dendl;
20effc67 2024 return ret;
7c673cae
FG
2025 }
2026
20effc67
TL
2027 auto umount_store = make_scope_guard([&] {
2028 store->umount();
2029 });
2030
2031 ObjectStore::CollectionHandle ch =
2032 store->open_collection(coll_t::meta());
11fdf7f2
TL
2033 if (ch) {
2034 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2035 if (ret < 0) {
2036 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
20effc67 2037 return ret;
11fdf7f2 2038 }
7c673cae
FG
2039 /* if we already have a superblock, check its contents */
2040 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
2041 auto p = sbbl.cbegin();
2042 decode(sb, p);
7c673cae
FG
2043 if (whoami != sb.whoami) {
2044 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2045 << dendl;
20effc67 2046 return -EINVAL;
7c673cae
FG
2047 }
2048 if (fsid != sb.cluster_fsid) {
2049 derr << "provided cluster fsid " << fsid
2050 << " != superblock's " << sb.cluster_fsid << dendl;
20effc67 2051 return -EINVAL;
7c673cae
FG
2052 }
2053 } else {
2054 // create superblock
2055 sb.cluster_fsid = fsid;
2056 sb.osd_fsid = store->get_fsid();
2057 sb.whoami = whoami;
2058 sb.compat_features = get_osd_initial_compat_set();
2059
2060 bufferlist bl;
11fdf7f2 2061 encode(sb, bl);
7c673cae 2062
11fdf7f2
TL
2063 ObjectStore::CollectionHandle ch = store->create_new_collection(
2064 coll_t::meta());
7c673cae
FG
2065 ObjectStore::Transaction t;
2066 t.create_collection(coll_t::meta(), 0);
2067 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 2068 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
2069 if (ret) {
2070 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 2071 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
20effc67 2072 return ret;
7c673cae 2073 }
a4b75251 2074 ch->flush();
7c673cae
FG
2075 }
2076
20effc67 2077 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
7c673cae 2078 if (ret) {
224ce89b
WB
2079 derr << "OSD::mkfs: failed to write fsid file: error "
2080 << cpp_strerror(ret) << dendl;
11fdf7f2 2081 }
7c673cae
FG
2082 return ret;
2083}
2084
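// write_meta() records the identity of a freshly created OSD in the
// ObjectStore metadata area: the on-disk magic, the osd id ("whoami"), the
// cluster fsid, the OSD's authentication key ("osd_key", taken from the "key"
// or "keyfile" config options when present), an optional osdspec affinity,
// and finally a "ready" marker signalling that the data directory is fully
// initialized.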
e306af50 2085int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
7c673cae
FG
2086{
2087 char val[80];
2088 int r;
2089
2090 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2091 r = store->write_meta("magic", val);
2092 if (r < 0)
2093 return r;
2094
2095 snprintf(val, sizeof(val), "%d", whoami);
2096 r = store->write_meta("whoami", val);
2097 if (r < 0)
2098 return r;
2099
2100 cluster_fsid.print(val);
2101 r = store->write_meta("ceph_fsid", val);
2102 if (r < 0)
2103 return r;
2104
11fdf7f2 2105 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2106 if (key.size()) {
2107 r = store->write_meta("osd_key", key);
2108 if (r < 0)
2109 return r;
b32b8144 2110 } else {
11fdf7f2 2111 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2112 if (!keyfile.empty()) {
2113 bufferlist keybl;
2114 string err;
11fdf7f2 2115 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2116 if (r < 0) {
2117 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2118 << err << ": " << cpp_strerror(r) << dendl;
2119 return r;
2120 }
2121 r = store->write_meta("osd_key", keybl.to_str());
2122 if (r < 0)
2123 return r;
2124 }
3efd9988 2125 }
e306af50
TL
2126 if (!osdspec_affinity.empty()) {
2127 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2128 if (r < 0)
2129 return r;
2130 }
3efd9988 2131
7c673cae
FG
2132 r = store->write_meta("ready", "ready");
2133 if (r < 0)
2134 return r;
2135
2136 return 0;
2137}
2138
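// peek_meta() is the read-side counterpart of write_meta(): it pulls the
// magic, osd id, cluster fsid, per-OSD fsid and (when recorded)
// require_osd_release back out of the store metadata so callers can sanity
// check a data directory before mounting it.  A missing "fsid" key is treated
// as an empty uuid rather than as an error.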
11fdf7f2
TL
2139int OSD::peek_meta(ObjectStore *store,
2140 std::string *magic,
2141 uuid_d *cluster_fsid,
2142 uuid_d *osd_fsid,
2143 int *whoami,
9f95a23c 2144 ceph_release_t *require_osd_release)
7c673cae
FG
2145{
2146 string val;
2147
2148 int r = store->read_meta("magic", &val);
2149 if (r < 0)
2150 return r;
11fdf7f2 2151 *magic = val;
7c673cae
FG
2152
2153 r = store->read_meta("whoami", &val);
2154 if (r < 0)
2155 return r;
11fdf7f2 2156 *whoami = atoi(val.c_str());
7c673cae
FG
2157
2158 r = store->read_meta("ceph_fsid", &val);
2159 if (r < 0)
2160 return r;
11fdf7f2 2161 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2162 if (!r)
2163 return -EINVAL;
2164
2165 r = store->read_meta("fsid", &val);
2166 if (r < 0) {
11fdf7f2 2167 *osd_fsid = uuid_d();
7c673cae 2168 } else {
11fdf7f2 2169 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2170 if (!r)
2171 return -EINVAL;
2172 }
2173
11fdf7f2
TL
2174 r = store->read_meta("require_osd_release", &val);
2175 if (r >= 0) {
9f95a23c 2176 *require_osd_release = ceph_release_from_name(val);
11fdf7f2
TL
2177 }
2178
7c673cae
FG
2179 return 0;
2180}
2181
2182
2183#undef dout_prefix
2184#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2185
2186// cons/des
2187
20effc67
TL
2188OSD::OSD(CephContext *cct_,
2189 std::unique_ptr<ObjectStore> store_,
7c673cae
FG
2190 int id,
2191 Messenger *internal_messenger,
2192 Messenger *external_messenger,
2193 Messenger *hb_client_front,
2194 Messenger *hb_client_back,
2195 Messenger *hb_front_serverm,
2196 Messenger *hb_back_serverm,
2197 Messenger *osdc_messenger,
2198 MonClient *mc,
f67539c2
TL
2199 const std::string &dev, const std::string &jdev,
2200 ceph::async::io_context_pool& poolctx) :
7c673cae 2201 Dispatcher(cct_),
7c673cae 2202 tick_timer(cct, osd_lock),
7c673cae 2203 tick_timer_without_osd_lock(cct, tick_timer_lock),
11fdf7f2 2204 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
7c673cae
FG
2205 cluster_messenger(internal_messenger),
2206 client_messenger(external_messenger),
2207 objecter_messenger(osdc_messenger),
2208 monc(mc),
9f95a23c 2209 mgrc(cct_, client_messenger, &mc->monmap),
f67539c2
TL
2210 logger(create_logger()),
2211 recoverystate_perf(create_recoverystate_perf()),
20effc67 2212 store(std::move(store_)),
7c673cae
FG
2213 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2214 clog(log_client.create_channel()),
2215 whoami(id),
2216 dev_path(dev), journal_path(jdev),
31f18b77 2217 store_is_rotational(store->is_rotational()),
7c673cae
FG
2218 trace_endpoint("0.0.0.0", 0, "osd"),
2219 asok_hook(NULL),
11fdf7f2
TL
2220 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2221 "osd_pg_epoch_max_lag_factor")),
7c673cae 2222 osd_compat(get_osd_compat_set()),
7c673cae 2223 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 2224 get_num_op_threads()),
7c673cae
FG
2225 heartbeat_stop(false),
2226 heartbeat_need_update(true),
2227 hb_front_client_messenger(hb_client_front),
2228 hb_back_client_messenger(hb_client_back),
2229 hb_front_server_messenger(hb_front_serverm),
2230 hb_back_server_messenger(hb_back_serverm),
2231 daily_loadavg(0.0),
2232 heartbeat_thread(this),
2233 heartbeat_dispatcher(this),
2234 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2235 cct->_conf->osd_num_op_tracker_shard),
2236 test_ops_hook(NULL),
7c673cae 2237 op_shardedwq(
7c673cae 2238 this,
f67539c2
TL
2239 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2240 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
7c673cae 2241 &osd_op_tp),
7c673cae 2242 last_pg_create_epoch(0),
11fdf7f2 2243 boot_finisher(cct),
7c673cae
FG
2244 up_thru_wanted(0),
2245 requested_full_first(0),
2246 requested_full_last(0),
f67539c2 2247 service(this, poolctx)
7c673cae 2248{
11fdf7f2
TL
2249
2250 if (!gss_ktfile_client.empty()) {
f67539c2
TL
2251 // Export the keytab path via the environment; assert that setenv() succeeds.
2252 /*
11fdf7f2
TL
2253 The default client keytab is used, if it is present and readable,
2254 to automatically obtain initial credentials for GSSAPI client
2255 applications. The principal name of the first entry in the client
2256 keytab is used by default when obtaining initial credentials. The default client keytab name is chosen from the following, in order:
2257 1. The KRB5_CLIENT_KTNAME environment variable.
2258 2. The default_client_keytab_name profile variable in [libdefaults].
2259 3. The hardcoded default, DEFCKTNAME.
2260 */
f67539c2 2261 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
11fdf7f2
TL
2262 gss_ktfile_client.c_str(), 1));
2263 ceph_assert(set_result == 0);
2264 }
2265
7c673cae
FG
2266 monc->set_messenger(client_messenger);
2267 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2268 cct->_conf->osd_op_log_threshold);
2269 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2270 cct->_conf->osd_op_history_duration);
2271 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2272 cct->_conf->osd_op_history_slow_op_threshold);
9f95a23c 2273 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
7c673cae
FG
2274#ifdef WITH_BLKIN
2275 std::stringstream ss;
2276 ss << "osd." << whoami;
2277 trace_endpoint.copy_name(ss.str());
2278#endif
11fdf7f2
TL
2279
2280 // initialize shards
2281 num_shards = get_num_op_shards();
2282 for (uint32_t i = 0; i < num_shards; i++) {
2283 OSDShard *one_shard = new OSDShard(
2284 i,
2285 cct,
9f95a23c 2286 this);
11fdf7f2
TL
2287 shards.push_back(one_shard);
2288 }
7c673cae
FG
2289}
2290
2291OSD::~OSD()
2292{
11fdf7f2
TL
2293 while (!shards.empty()) {
2294 delete shards.back();
2295 shards.pop_back();
2296 }
7c673cae
FG
2297 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2298 cct->get_perfcounters_collection()->remove(logger);
2299 delete recoverystate_perf;
2300 delete logger;
7c673cae
FG
2301}
2302
91327a77
AA
2303double OSD::get_tick_interval() const
2304{
2305 // vary +/- 5% to avoid scrub scheduling livelocks
2306 constexpr auto delta = 0.05;
91327a77 2307 return (OSD_TICK_INTERVAL *
11fdf7f2 2308 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2309}
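// For example, with the (assumed) default OSD_TICK_INTERVAL of 1 second the
// expression above yields a tick interval uniformly distributed over
// [0.95s, 1.05s]; the jitter keeps periodic work such as scrub scheduling on
// different OSDs from falling into lock-step.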
2310
7c673cae
FG
2311void OSD::handle_signal(int signum)
2312{
11fdf7f2 2313 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2314 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2315 shutdown();
2316}
2317
2318int OSD::pre_init()
2319{
11fdf7f2 2320 std::lock_guard lock(osd_lock);
7c673cae
FG
2321 if (is_stopping())
2322 return 0;
2323
2324 if (store->test_mount_in_use()) {
2325 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2326 << "currently in use. (Is ceph-osd already running?)" << dendl;
2327 return -EBUSY;
2328 }
2329
11fdf7f2
TL
2330 cct->_conf.add_observer(this);
2331 return 0;
2332}
2333
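// NUMA affinity selection, in summary: the storage device's NUMA node and the
// nodes of the public and cluster network interfaces are probed, and only if
// all three agree (and osd_numa_auto_affinity is enabled) is that node picked
// automatically.  An explicit osd_numa_node setting overrides the probe, and
// whichever node wins has its CPU set applied to all OSD threads.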
2334int OSD::set_numa_affinity()
2335{
2336 // storage numa node
2337 int store_node = -1;
2338 store->get_numa_node(&store_node, nullptr, nullptr);
2339 if (store_node >= 0) {
2340 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2341 }
2342
2343 // check network numa node(s)
2344 int front_node = -1, back_node = -1;
2345 string front_iface = pick_iface(
2346 cct,
2347 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2348 string back_iface = pick_iface(
2349 cct,
2350 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2351 int r = get_iface_numa_node(front_iface, &front_node);
92f5a8d4 2352 if (r >= 0 && front_node >= 0) {
11fdf7f2 2353 dout(1) << __func__ << " public network " << front_iface << " numa node "
92f5a8d4 2354 << front_node << dendl;
11fdf7f2 2355 r = get_iface_numa_node(back_iface, &back_node);
92f5a8d4 2356 if (r >= 0 && back_node >= 0) {
11fdf7f2
TL
2357 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2358 << back_node << dendl;
2359 if (front_node == back_node &&
2360 front_node == store_node) {
2361 dout(1) << " objectstore and network numa nodes all match" << dendl;
2362 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2363 numa_node = front_node;
2364 }
92f5a8d4
TL
2365 } else if (front_node != back_node) {
2366 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2367 << dendl;
11fdf7f2
TL
2368 } else {
2369 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2370 << dendl;
2371 }
92f5a8d4
TL
2372 } else if (back_node == -2) {
2373 dout(1) << __func__ << " cluster network " << back_iface
2374 << " ports numa nodes do not match" << dendl;
2375 } else {
2376 derr << __func__ << " unable to identify cluster interface '" << back_iface
2377 << "' numa node: " << cpp_strerror(r) << dendl;
11fdf7f2 2378 }
92f5a8d4
TL
2379 } else if (front_node == -2) {
2380 dout(1) << __func__ << " public network " << front_iface
2381 << " ports numa nodes do not match" << dendl;
11fdf7f2
TL
2382 } else {
2383 derr << __func__ << " unable to identify public interface '" << front_iface
2384 << "' numa node: " << cpp_strerror(r) << dendl;
2385 }
2386 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2387 // this takes precedence over the automagic logic above
2388 numa_node = node;
2389 }
2390 if (numa_node >= 0) {
2391 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2392 if (r < 0) {
2393 dout(1) << __func__ << " unable to determine numa node " << numa_node
2394 << " CPUs" << dendl;
2395 numa_node = -1;
2396 } else {
2397 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2398 << " cpus "
2399 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2400 << dendl;
92f5a8d4 2401 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
11fdf7f2
TL
2402 if (r < 0) {
2403 r = -errno;
2404 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2405 << dendl;
2406 numa_node = -1;
2407 }
2408 }
2409 } else {
2410 dout(1) << __func__ << " not setting numa affinity" << dendl;
2411 }
7c673cae
FG
2412 return 0;
2413}
2414
2415// asok
2416
2417class OSDSocketHook : public AdminSocketHook {
2418 OSD *osd;
2419public:
2420 explicit OSDSocketHook(OSD *o) : osd(o) {}
9f95a23c
TL
2421 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2422 Formatter *f,
2423 std::ostream& ss,
2424 bufferlist& out) override {
2425 ceph_abort("should use async hook");
2426 }
2427 void call_async(
2428 std::string_view prefix,
2429 const cmdmap_t& cmdmap,
2430 Formatter *f,
2431 const bufferlist& inbl,
2432 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
11fdf7f2 2433 try {
9f95a23c
TL
2434 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2435 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2436 bufferlist empty;
2437 on_finish(-EINVAL, e.what(), empty);
11fdf7f2 2438 }
7c673cae
FG
2439 }
2440};
2441
11fdf7f2
TL
2442std::set<int64_t> OSD::get_mapped_pools()
2443{
2444 std::set<int64_t> pools;
2445 std::vector<spg_t> pgids;
2446 _get_pgids(&pgids);
2447 for (const auto &pgid : pgids) {
2448 pools.insert(pgid.pool());
2449 }
2450 return pools;
2451}
2452
20effc67
TL
2453OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2454 stringstream& ss,
2455 bool only_primary)
2456{
2457 string pgidstr;
2458 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2459 ss << "no pgid specified";
2460 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2461 }
2462
2463 pg_t pgid;
2464 if (!pgid.parse(pgidstr.c_str())) {
2465 ss << "couldn't parse pgid '" << pgidstr << "'";
2466 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2467 }
2468
2469 spg_t pcand;
2470 PGRef pg;
2471 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2472 if (pg->is_primary() || !only_primary) {
2473 return OSD::PGRefOrError{pg, 0};
2474 }
2475
2476 ss << "not primary for pgid " << pgid;
2477 pg->unlock();
2478 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2479 } else {
2480 ss << "i don't have pgid " << pgid;
2481 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2482 }
2483}
2484
2485// note that the cmdmap is explicitly copied into asok_route_to_pg()
2486int OSD::asok_route_to_pg(
2487 bool only_primary,
2488 std::string_view prefix,
2489 cmdmap_t cmdmap,
2490 Formatter* f,
2491 stringstream& ss,
2492 const bufferlist& inbl,
2493 bufferlist& outbl,
2494 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2495{
2496 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2497
2498 if (!target_pg.has_value()) {
2499 // 'ss' and 'ret' already contain the error information
2500 on_finish(ret, ss.str(), outbl);
2501 return ret;
2502 }
2503
2504 // the PG was locked by locate_asok_target()
2505 try {
2506 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2507 (*target_pg)->unlock();
2508 return 0; // the pg handler calls on_finish directly
2509 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2510 (*target_pg)->unlock();
2511 ss << e.what();
2512 on_finish(-EINVAL, ss.str(), outbl);
2513 return -EINVAL;
2514 }
2515}
2516
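// asok_command() is the admin-socket dispatcher.  Commands that target a
// specific placement group ("pg", "query", "scrub", ...) are forwarded to
// PG::do_command() on the primary, "scrubdebug" is routed even to a
// non-primary via asok_route_to_pg(), and everything else is an OSD-wide
// command handled inline, with the shared 'out:' epilogue calling on_finish
// exactly once on the synchronous paths.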
9f95a23c
TL
2517void OSD::asok_command(
2518 std::string_view prefix, const cmdmap_t& cmdmap,
2519 Formatter *f,
2520 const bufferlist& inbl,
2521 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2522{
9f95a23c
TL
2523 int ret = 0;
2524 stringstream ss; // stderr error message stream
2525 bufferlist outbl; // if empty at end, we'll dump formatter as output
2526
2527 // --- PG commands are routed here to PG::do_command ---
2528 if (prefix == "pg" ||
2529 prefix == "query" ||
2530 prefix == "mark_unfound_lost" ||
2531 prefix == "list_unfound" ||
2532 prefix == "scrub" ||
2533 prefix == "deep_scrub"
2534 ) {
2535 string pgidstr;
2536 pg_t pgid;
2537 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2538 ss << "no pgid specified";
2539 ret = -EINVAL;
2540 goto out;
2541 }
2542 if (!pgid.parse(pgidstr.c_str())) {
2543 ss << "couldn't parse pgid '" << pgidstr << "'";
2544 ret = -EINVAL;
2545 goto out;
2546 }
2547 spg_t pcand;
2548 PGRef pg;
2549 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2550 (pg = _lookup_lock_pg(pcand))) {
2551 if (pg->is_primary()) {
2552 cmdmap_t new_cmdmap = cmdmap;
2553 try {
2554 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2555 pg->unlock();
2556 return; // the pg handler calls on_finish directly
2557 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2558 pg->unlock();
2559 ss << e.what();
2560 ret = -EINVAL;
2561 goto out;
2562 }
2563 } else {
2564 ss << "not primary for pgid " << pgid;
2565 // do not reply; they will get newer maps and realize they
2566 // need to resend.
2567 pg->unlock();
2568 ret = -EAGAIN;
2569 goto out;
2570 }
2571 } else {
2572 ss << "i don't have pgid " << pgid;
2573 ret = -ENOENT;
2574 }
2575 }
2576
20effc67
TL
2577 // --- PG commands that will be answered even if !primary ---
2578
2579 else if (prefix == "scrubdebug") {
2580 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2581 return;
2582 }
2583
9f95a23c
TL
2584 // --- OSD commands follow ---
2585
2586 else if (prefix == "status") {
2587 lock_guard l(osd_lock);
7c673cae
FG
2588 f->open_object_section("status");
2589 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2590 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2591 f->dump_unsigned("whoami", superblock.whoami);
2592 f->dump_string("state", get_state_name(get_state()));
2593 f->dump_unsigned("oldest_map", superblock.oldest_map);
2594 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2595 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2596 f->close_section();
9f95a23c 2597 } else if (prefix == "flush_journal") {
7c673cae 2598 store->flush_journal();
9f95a23c
TL
2599 } else if (prefix == "dump_ops_in_flight" ||
2600 prefix == "ops" ||
2601 prefix == "dump_blocked_ops" ||
2602 prefix == "dump_historic_ops" ||
2603 prefix == "dump_historic_ops_by_duration" ||
2604 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2605
2606 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2607not even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2608will start to track new ops received afterwards.";
2609
2610 set<string> filters;
2611 vector<string> filter_str;
9f95a23c 2612 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2613 copy(filter_str.begin(), filter_str.end(),
2614 inserter(filters, filters.end()));
2615 }
2616
9f95a23c
TL
2617 if (prefix == "dump_ops_in_flight" ||
2618 prefix == "ops") {
c07f9fc5
FG
2619 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2620 ss << error_str;
9f95a23c
TL
2621 ret = -EINVAL;
2622 goto out;
c07f9fc5
FG
2623 }
2624 }
9f95a23c 2625 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2626 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2627 ss << error_str;
9f95a23c
TL
2628 ret = -EINVAL;
2629 goto out;
c07f9fc5
FG
2630 }
2631 }
9f95a23c 2632 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2633 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2634 ss << error_str;
9f95a23c
TL
2635 ret = -EINVAL;
2636 goto out;
c07f9fc5
FG
2637 }
2638 }
9f95a23c 2639 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2640 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2641 ss << error_str;
9f95a23c
TL
2642 ret = -EINVAL;
2643 goto out;
c07f9fc5
FG
2644 }
2645 }
9f95a23c 2646 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2647 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2648 ss << error_str;
9f95a23c
TL
2649 ret = -EINVAL;
2650 goto out;
c07f9fc5 2651 }
7c673cae 2652 }
9f95a23c 2653 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2654 f->open_object_section("pq");
2655 op_shardedwq.dump(f);
2656 f->close_section();
f67539c2 2657 } else if (prefix == "dump_blocklist") {
7c673cae
FG
2658 list<pair<entity_addr_t,utime_t> > bl;
2659 OSDMapRef curmap = service.get_osdmap();
2660
f67539c2
TL
2661 f->open_array_section("blocklist");
2662 curmap->get_blocklist(&bl);
7c673cae
FG
2663 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2664 it != bl.end(); ++it) {
224ce89b 2665 f->open_object_section("entry");
7c673cae
FG
2666 f->open_object_section("entity_addr_t");
2667 it->first.dump(f);
2668 f->close_section(); //entity_addr_t
2669 it->second.localtime(f->dump_stream("expire_time"));
2670 f->close_section(); //entry
2671 }
f67539c2 2672 f->close_section(); //blocklist
9f95a23c 2673 } else if (prefix == "dump_watchers") {
7c673cae
FG
2674 list<obj_watch_item_t> watchers;
2675 // scan pg's
11fdf7f2
TL
2676 vector<PGRef> pgs;
2677 _get_pgs(&pgs);
2678 for (auto& pg : pgs) {
2679 list<obj_watch_item_t> pg_watchers;
2680 pg->get_watchers(&pg_watchers);
2681 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2682 }
2683
2684 f->open_array_section("watchers");
2685 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2686 it != watchers.end(); ++it) {
2687
224ce89b 2688 f->open_object_section("watch");
7c673cae
FG
2689
2690 f->dump_string("namespace", it->obj.nspace);
2691 f->dump_string("object", it->obj.oid.name);
2692
2693 f->open_object_section("entity_name");
2694 it->wi.name.dump(f);
2695 f->close_section(); //entity_name_t
2696
224ce89b
WB
2697 f->dump_unsigned("cookie", it->wi.cookie);
2698 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2699
2700 f->open_object_section("entity_addr_t");
2701 it->wi.addr.dump(f);
2702 f->close_section(); //entity_addr_t
2703
2704 f->close_section(); //watch
2705 }
2706
2707 f->close_section(); //watchers
9f95a23c 2708 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2709 f->open_object_section("reservations");
2710 f->open_object_section("local_reservations");
2711 service.local_reserver.dump(f);
2712 f->close_section();
2713 f->open_object_section("remote_reservations");
2714 service.remote_reserver.dump(f);
2715 f->close_section();
2716 f->close_section();
9f95a23c 2717 } else if (prefix == "dump_scrub_reservations") {
eafe8130 2718 f->open_object_section("scrub_reservations");
20effc67 2719 service.get_scrub_services().dump_scrub_reservations(f);
eafe8130 2720 f->close_section();
9f95a23c 2721 } else if (prefix == "get_latest_osdmap") {
7c673cae 2722 get_latest_osdmap();
9f95a23c 2723 } else if (prefix == "set_heap_property") {
7c673cae
FG
2724 string property;
2725 int64_t value = 0;
2726 string error;
2727 bool success = false;
9f95a23c 2728 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2729 error = "unable to get property";
2730 success = false;
9f95a23c 2731 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2732 error = "unable to get value";
2733 success = false;
2734 } else if (value < 0) {
2735 error = "negative value not allowed";
2736 success = false;
2737 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2738 error = "invalid property";
2739 success = false;
2740 } else {
2741 success = true;
2742 }
2743 f->open_object_section("result");
2744 f->dump_string("error", error);
2745 f->dump_bool("success", success);
2746 f->close_section();
9f95a23c 2747 } else if (prefix == "get_heap_property") {
7c673cae
FG
2748 string property;
2749 size_t value = 0;
2750 string error;
2751 bool success = false;
9f95a23c 2752 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2753 error = "unable to get property";
2754 success = false;
2755 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2756 error = "invalid property";
2757 success = false;
2758 } else {
2759 success = true;
2760 }
2761 f->open_object_section("result");
2762 f->dump_string("error", error);
2763 f->dump_bool("success", success);
2764 f->dump_int("value", value);
2765 f->close_section();
9f95a23c 2766 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2767 store->get_db_statistics(f);
9f95a23c 2768 } else if (prefix == "dump_scrubs") {
20effc67 2769 service.get_scrub_services().dump_scrubs(f);
9f95a23c 2770 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2771 store->generate_db_histogram(f);
9f95a23c 2772 } else if (prefix == "flush_store_cache") {
11fdf7f2 2773 store->flush_cache(&ss);
9f95a23c 2774 } else if (prefix == "dump_pgstate_history") {
7c673cae 2775 f->open_object_section("pgstate_history");
9f95a23c 2776 f->open_array_section("pgs");
11fdf7f2
TL
2777 vector<PGRef> pgs;
2778 _get_pgs(&pgs);
2779 for (auto& pg : pgs) {
9f95a23c 2780 f->open_object_section("pg");
11fdf7f2 2781 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2782 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2783 pg->dump_pgstate_history(f);
9f95a23c 2784 f->close_section();
7c673cae
FG
2785 }
2786 f->close_section();
9f95a23c
TL
2787 f->close_section();
2788 } else if (prefix == "compact") {
224ce89b
WB
2789 dout(1) << "triggering manual compaction" << dendl;
2790 auto start = ceph::coarse_mono_clock::now();
2791 store->compact();
2792 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2793 double duration = std::chrono::duration<double>(end-start).count();
f67539c2 2794 dout(1) << "finished manual compaction in "
11fdf7f2 2795 << duration
224ce89b
WB
2796 << " seconds" << dendl;
2797 f->open_object_section("compact_result");
11fdf7f2
TL
2798 f->dump_float("elapsed_time", duration);
2799 f->close_section();
9f95a23c 2800 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2801 f->open_array_section("mapped_pools");
2802 set<int64_t> poollist = get_mapped_pools();
2803 for (auto pool : poollist) {
2804 f->dump_int("pool_id", pool);
2805 }
2806 f->close_section();
9f95a23c 2807 } else if (prefix == "smart") {
11fdf7f2 2808 string devid;
9f95a23c
TL
2809 cmd_getval(cmdmap, "devid", devid);
2810 ostringstream out;
2811 probe_smart(devid, out);
2812 outbl.append(out.str());
2813 } else if (prefix == "list_devices") {
11fdf7f2
TL
2814 set<string> devnames;
2815 store->get_devices(&devnames);
9f95a23c 2816 f->open_array_section("list_devices");
11fdf7f2
TL
2817 for (auto dev : devnames) {
2818 if (dev.find("dm-") == 0) {
2819 continue;
2820 }
9f95a23c
TL
2821 string err;
2822 f->open_object_section("device");
11fdf7f2 2823 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2824 f->dump_string("device_id", get_device_id(dev, &err));
2825 f->close_section();
11fdf7f2 2826 }
224ce89b 2827 f->close_section();
9f95a23c
TL
2828 } else if (prefix == "send_beacon") {
2829 lock_guard l(osd_lock);
11fdf7f2
TL
2830 if (is_active()) {
2831 send_beacon(ceph::coarse_mono_clock::now());
2832 }
9f95a23c
TL
2833 }
2834
2835 else if (prefix == "cluster_log") {
2836 vector<string> msg;
2837 cmd_getval(cmdmap, "message", msg);
2838 if (msg.empty()) {
2839 ret = -EINVAL;
2840 ss << "ignoring empty log message";
2841 goto out;
2842 }
2843 string message = msg.front();
2844 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2845 message += " " + *a;
2846 string lvl;
2847 cmd_getval(cmdmap, "level", lvl);
2848 clog_type level = string_to_clog_type(lvl);
2849 if (level < 0) {
2850 ret = -EINVAL;
2851 ss << "unknown level '" << lvl << "'";
2852 goto out;
2853 }
2854 clog->do_log(level, message);
2855 }
2856
2857 else if (prefix == "bench") {
9f95a23c 2858 // default count 1G, size 4MB
20effc67
TL
2859 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2860 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2861 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2862 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
a4b75251 2863 double elapsed = 0.0;
9f95a23c 2864
a4b75251
TL
2865 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
2866 if (ret != 0) {
9f95a23c 2867 goto out;
9f95a23c
TL
2868 }
2869
9f95a23c
TL
2870 double rate = count / elapsed;
2871 double iops = rate / bsize;
2872 f->open_object_section("osd_bench_results");
2873 f->dump_int("bytes_written", count);
2874 f->dump_int("blocksize", bsize);
2875 f->dump_float("elapsed_sec", elapsed);
2876 f->dump_float("bytes_per_sec", rate);
2877 f->dump_float("iops", iops);
2878 f->close_section();
2879 }
2880
2881 else if (prefix == "flush_pg_stats") {
2882 mgrc.send_pgstats();
2883 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2884 }
2885
2886 else if (prefix == "heap") {
2887 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2888 }
2889
2890 else if (prefix == "debug dump_missing") {
2891 f->open_array_section("pgs");
2892 vector<PGRef> pgs;
2893 _get_pgs(&pgs);
2894 for (auto& pg : pgs) {
2895 string s = stringify(pg->pg_id);
2896 f->open_array_section(s.c_str());
2897 pg->lock();
2898 pg->dump_missing(f);
2899 pg->unlock();
2900 f->close_section();
2901 }
2902 f->close_section();
2903 }
2904
2905 else if (prefix == "debug kick_recovery_wq") {
2906 int64_t delay;
2907 cmd_getval(cmdmap, "delay", delay);
2908 ostringstream oss;
2909 oss << delay;
2910 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2911 if (ret != 0) {
2912 ss << "kick_recovery_wq: error setting "
2913 << "osd_recovery_delay_start to '" << delay << "': error "
2914 << ret;
2915 goto out;
2916 }
2917 cct->_conf.apply_changes(nullptr);
2918 ss << "kicking recovery queue. set osd_recovery_delay_start "
2919 << "to " << cct->_conf->osd_recovery_delay_start;
2920 }
2921
2922 else if (prefix == "cpu_profiler") {
2923 ostringstream ds;
2924 string arg;
2925 cmd_getval(cmdmap, "arg", arg);
2926 vector<string> argvec;
2927 get_str_vec(arg, argvec);
2928 cpu_profiler_handle_command(argvec, ds);
2929 outbl.append(ds.str());
2930 }
2931
2932 else if (prefix == "dump_pg_recovery_stats") {
2933 lock_guard l(osd_lock);
2934 pg_recovery_stats.dump_formatted(f);
2935 }
2936
2937 else if (prefix == "reset_pg_recovery_stats") {
2938 lock_guard l(osd_lock);
2939 pg_recovery_stats.reset();
2940 }
2941
2942 else if (prefix == "perf histogram dump") {
2943 std::string logger;
2944 std::string counter;
2945 cmd_getval(cmdmap, "logger", logger);
2946 cmd_getval(cmdmap, "counter", counter);
2947 cct->get_perfcounters_collection()->dump_formatted_histograms(
2948 f, false, logger, counter);
2949 }
2950
2951 else if (prefix == "cache drop") {
2952 lock_guard l(osd_lock);
2953 dout(20) << "clearing all caches" << dendl;
2954 // Clear the objectstore's cache - onode and buffer for Bluestore,
2955 // system's pagecache for Filestore
2956 ret = store->flush_cache(&ss);
2957 if (ret < 0) {
2958 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2959 goto out;
2960 }
2961 // Clear the objectcontext cache (per PG)
2962 vector<PGRef> pgs;
2963 _get_pgs(&pgs);
2964 for (auto& pg: pgs) {
2965 pg->clear_cache();
2966 }
2967 }
2968
2969 else if (prefix == "cache status") {
2970 lock_guard l(osd_lock);
2971 int obj_ctx_count = 0;
2972 vector<PGRef> pgs;
2973 _get_pgs(&pgs);
2974 for (auto& pg: pgs) {
2975 obj_ctx_count += pg->get_cache_obj_count();
2976 }
2977 f->open_object_section("cache_status");
2978 f->dump_int("object_ctx", obj_ctx_count);
2979 store->dump_cache_stats(f);
2980 f->close_section();
2981 }
2982
2983 else if (prefix == "scrub_purged_snaps") {
2984 lock_guard l(osd_lock);
2985 scrub_purged_snaps();
2986 }
2987
2988 else if (prefix == "dump_osd_network") {
2989 lock_guard l(osd_lock);
2990 int64_t value = 0;
2991 if (!(cmd_getval(cmdmap, "value", value))) {
2992 // Convert milliseconds to microseconds
2993 value = static_cast<double>(g_conf().get_val<double>(
2994 "mon_warn_on_slow_ping_time")) * 1000;
2995 if (value == 0) {
2996 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2997 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2998 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2999 }
3000 } else {
3001 // Convert user input to microseconds
3002 value *= 1000;
3003 }
3004 if (value < 0) value = 0;
3005
3006 struct osd_ping_time_t {
3007 uint32_t pingtime;
3008 int to;
3009 bool back;
3010 std::array<uint32_t,3> times;
3011 std::array<uint32_t,3> min;
3012 std::array<uint32_t,3> max;
3013 uint32_t last;
3014 uint32_t last_update;
3015
3016 bool operator<(const osd_ping_time_t& rhs) const {
3017 if (pingtime < rhs.pingtime)
3018 return true;
3019 if (pingtime > rhs.pingtime)
3020 return false;
3021 if (to < rhs.to)
3022 return true;
3023 if (to > rhs.to)
3024 return false;
3025 return back;
3026 }
3027 };
3028
3029 set<osd_ping_time_t> sorted;
3030 // Get pingtimes under lock and not on the stack
eafe8130
TL
3031 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3032 service.get_hb_pingtime(pingtimes);
3033 for (auto j : *pingtimes) {
3034 if (j.second.last_update == 0)
3035 continue;
3036 osd_ping_time_t item;
3037 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3038 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3039 if (item.pingtime >= value) {
3040 item.to = j.first;
3041 item.times[0] = j.second.back_pingtime[0];
3042 item.times[1] = j.second.back_pingtime[1];
3043 item.times[2] = j.second.back_pingtime[2];
3044 item.min[0] = j.second.back_min[0];
3045 item.min[1] = j.second.back_min[1];
3046 item.min[2] = j.second.back_min[2];
3047 item.max[0] = j.second.back_max[0];
3048 item.max[1] = j.second.back_max[1];
3049 item.max[2] = j.second.back_max[2];
3050 item.last = j.second.back_last;
3051 item.back = true;
3052 item.last_update = j.second.last_update;
3053 sorted.emplace(item);
3054 }
3055 if (j.second.front_last == 0)
3056 continue;
3057 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3058 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3059 if (item.pingtime >= value) {
3060 item.to = j.first;
3061 item.times[0] = j.second.front_pingtime[0];
3062 item.times[1] = j.second.front_pingtime[1];
3063 item.times[2] = j.second.front_pingtime[2];
3064 item.min[0] = j.second.front_min[0];
3065 item.min[1] = j.second.front_min[1];
3066 item.min[2] = j.second.front_min[2];
3067 item.max[0] = j.second.front_max[0];
3068 item.max[1] = j.second.front_max[1];
3069 item.max[2] = j.second.front_max[2];
3070 item.last = j.second.front_last;
3071 item.last_update = j.second.last_update;
3072 item.back = false;
3073 sorted.emplace(item);
3074 }
3075 }
3076 delete pingtimes;
3077 //
3078 // Network ping times (1min 5min 15min)
3079 f->open_object_section("network_ping_times");
3080 f->dump_int("threshold", value / 1000);
3081 f->open_array_section("entries");
3082 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3083 ceph_assert(sitem.pingtime >= value);
3084 f->open_object_section("entry");
3085
3086 const time_t lu(sitem.last_update);
3087 char buffer[26];
3088 string lustr(ctime_r(&lu, buffer));
3089 lustr.pop_back(); // Remove trailing \n
3090 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3091 f->dump_string("last update", lustr);
3092 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3093 f->dump_int("from osd", whoami);
3094 f->dump_int("to osd", sitem.to);
3095 f->dump_string("interface", (sitem.back ? "back" : "front"));
3096 f->open_object_section("average");
3097 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3098 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3099 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3100 f->close_section(); // average
3101 f->open_object_section("min");
3102 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3103 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3104 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3105 f->close_section(); // min
3106 f->open_object_section("max");
3107 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3108 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3109 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3110 f->close_section(); // max
3111 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3112 f->close_section(); // entry
3113 }
3114 f->close_section(); // entries
3115 f->close_section(); // network_ping_times
20effc67
TL
3116 } else if (prefix == "dump_pool_statfs") {
3117 lock_guard l(osd_lock);
3118
3119 int64_t p = 0;
3120 if (!(cmd_getval(cmdmap, "poolid", p))) {
3121 ss << "Error dumping pool statfs: no poolid provided";
3122 ret = -EINVAL;
3123 goto out;
3124 }
3125
3126 store_statfs_t st;
3127 bool per_pool_omap_stats = false;
3128
3129 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3130 if (ret < 0) {
3131 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3132 goto out;
3133 } else {
3134 ss << "dumping pool statfs...";
3135 f->open_object_section("pool_statfs");
3136 f->dump_int("poolid", p);
3137 st.dump(f);
3138 f->close_section();
3139 }
7c673cae 3140 } else {
11fdf7f2 3141 ceph_abort_msg("broken asok registration");
7c673cae 3142 }
9f95a23c
TL
3143
3144 out:
3145 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3146}
3147
a4b75251
TL
3148int OSD::run_osd_bench_test(
3149 int64_t count,
3150 int64_t bsize,
3151 int64_t osize,
3152 int64_t onum,
3153 double *elapsed,
3154 ostream &ss)
3155{
3156 int ret = 0;
3157 uint32_t duration = cct->_conf->osd_bench_duration;
3158
3159 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
3160 // let us limit the block size because the next checks rely on it
3161 // having a sane value. If we allow any block size to be set things
3162 // can still go sideways.
3163 ss << "block 'size' values are capped at "
3164 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
3165 << " a higher value, please adjust 'osd_bench_max_block_size'";
3166 ret = -EINVAL;
3167 return ret;
3168 } else if (bsize < (int64_t) (1 << 20)) {
3169 // entering the realm of small block sizes.
3170 // limit the count to a sane value, assuming a configurable amount of
3171 // IOPS and duration, so that the OSD doesn't get hung up on this,
3172 // preventing timeouts from going off
3173 int64_t max_count =
3174 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
3175 if (count > max_count) {
3176 ss << "'count' values greater than " << max_count
3177 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3178 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
3179 << " for " << duration << " seconds,"
3180 << " can cause ill effects on osd. "
3181 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3182 << " value if you wish to use a higher 'count'.";
3183 ret = -EINVAL;
3184 return ret;
3185 }
3186 } else {
3187 // 1MB block sizes are big enough so that we get more stuff done.
3188 // However, to avoid the osd from getting hung on this and having
3189 // timers being triggered, we are going to limit the count assuming
3190 // a configurable throughput and duration.
3191 // NOTE: max_count is the total amount of bytes that we believe we
3192 // will be able to write during 'duration' for the given
3193 // throughput. The block size hardly impacts this unless it's
3194 // way too big. Given we already check how big the block size
3195 // is, it's safe to assume everything will check out.
3196 int64_t max_count =
3197 cct->_conf->osd_bench_large_size_max_throughput * duration;
3198 if (count > max_count) {
3199 ss << "'count' values greater than " << max_count
3200 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3201 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
3202 << " for " << duration << " seconds,"
3203 << " can cause ill effects on osd. "
3204 << " Please adjust 'osd_bench_large_size_max_throughput'"
3205 << " with a higher value if you wish to use a higher 'count'.";
3206 ret = -EINVAL;
3207 return ret;
3208 }
3209 }
3210
3211 if (osize && bsize > osize) {
3212 bsize = osize;
3213 }
3214
3215 dout(1) << " bench count " << count
3216 << " bsize " << byte_u_t(bsize) << dendl;
3217
3218 ObjectStore::Transaction cleanupt;
3219
3220 if (osize && onum) {
3221 bufferlist bl;
3222 bufferptr bp(osize);
20effc67 3223 memset(bp.c_str(), 'a', bp.length());
a4b75251
TL
3224 bl.push_back(std::move(bp));
3225 bl.rebuild_page_aligned();
3226 for (int i=0; i<onum; ++i) {
3227 char nm[30];
3228 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
3229 object_t oid(nm);
3230 hobject_t soid(sobject_t(oid, 0));
3231 ObjectStore::Transaction t;
3232 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
3233 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3234 cleanupt.remove(coll_t(), ghobject_t(soid));
3235 }
3236 }
3237
3238 bufferlist bl;
3239 bufferptr bp(bsize);
20effc67 3240 memset(bp.c_str(), 'a', bp.length());
a4b75251
TL
3241 bl.push_back(std::move(bp));
3242 bl.rebuild_page_aligned();
3243
3244 {
3245 C_SaferCond waiter;
3246 if (!service.meta_ch->flush_commit(&waiter)) {
3247 waiter.wait();
3248 }
3249 }
3250
3251 utime_t start = ceph_clock_now();
3252 for (int64_t pos = 0; pos < count; pos += bsize) {
3253 char nm[30];
3254 unsigned offset = 0;
3255 if (onum && osize) {
3256 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
3257 offset = rand() % (osize / bsize) * bsize;
3258 } else {
3259 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
3260 }
3261 object_t oid(nm);
3262 hobject_t soid(sobject_t(oid, 0));
3263 ObjectStore::Transaction t;
3264 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
3265 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3266 if (!onum || !osize) {
3267 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3268 }
3269 }
3270
3271 {
3272 C_SaferCond waiter;
3273 if (!service.meta_ch->flush_commit(&waiter)) {
3274 waiter.wait();
3275 }
3276 }
3277 utime_t end = ceph_clock_now();
3278 *elapsed = end - start;
3279
3280 // clean up
3281 store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
3282 {
3283 C_SaferCond waiter;
3284 if (!service.meta_ch->flush_commit(&waiter)) {
3285 waiter.wait();
3286 }
3287 }
3288
3289 return ret;
3290}
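// A quick sanity check of the small-block cap above, assuming the shipped
// defaults of osd_bench_duration = 30 s and osd_bench_small_size_max_iops =
// 100 (both configurable, so the numbers are only illustrative): for a 4 KiB
// block size, max_count = 4096 * 30 * 100 = 12,288,000 bytes, so a request to
// write more than ~11.7 MiB at that block size is rejected with -EINVAL
// rather than risking heartbeat or suicide timeouts.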
3291
7c673cae
FG
3292class TestOpsSocketHook : public AdminSocketHook {
3293 OSDService *service;
3294 ObjectStore *store;
3295public:
3296 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c
TL
3297 int call(std::string_view command, const cmdmap_t& cmdmap,
3298 Formatter *f,
3299 std::ostream& errss,
3300 bufferlist& out) override {
3301 int r = 0;
3302 stringstream outss;
11fdf7f2 3303 try {
9f95a23c
TL
3304 test_ops(service, store, command, cmdmap, outss);
3305 out.append(outss);
3306 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3307 errss << e.what();
3308 r = -EINVAL;
11fdf7f2 3309 }
9f95a23c 3310 return r;
7c673cae
FG
3311 }
3312 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3313 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3314
3315};
3316
3317class OSD::C_Tick : public Context {
3318 OSD *osd;
3319 public:
3320 explicit C_Tick(OSD *o) : osd(o) {}
3321 void finish(int r) override {
3322 osd->tick();
3323 }
3324};
3325
3326class OSD::C_Tick_WithoutOSDLock : public Context {
3327 OSD *osd;
3328 public:
3329 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3330 void finish(int r) override {
3331 osd->tick_without_osd_lock();
3332 }
3333};
3334
3335int OSD::enable_disable_fuse(bool stop)
3336{
3337#ifdef HAVE_LIBFUSE
3338 int r;
3339 string mntpath = cct->_conf->osd_data + "/fuse";
3340 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3341 dout(1) << __func__ << " disabling" << dendl;
3342 fuse_store->stop();
3343 delete fuse_store;
3344 fuse_store = NULL;
3345 r = ::rmdir(mntpath.c_str());
7c673cae 3346 if (r < 0) {
c07f9fc5
FG
3347 r = -errno;
3348 derr << __func__ << " failed to rmdir " << mntpath << ": "
3349 << cpp_strerror(r) << dendl;
7c673cae
FG
3350 return r;
3351 }
3352 return 0;
3353 }
3354 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3355 dout(1) << __func__ << " enabling" << dendl;
3356 r = ::mkdir(mntpath.c_str(), 0700);
3357 if (r < 0)
3358 r = -errno;
3359 if (r < 0 && r != -EEXIST) {
3360 derr << __func__ << " unable to create " << mntpath << ": "
3361 << cpp_strerror(r) << dendl;
3362 return r;
3363 }
20effc67 3364 fuse_store = new FuseStore(store.get(), mntpath);
7c673cae
FG
3365 r = fuse_store->start();
3366 if (r < 0) {
3367 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3368 delete fuse_store;
3369 fuse_store = NULL;
3370 return r;
3371 }
3372 }
3373#endif // HAVE_LIBFUSE
3374 return 0;
3375}
3376
9f95a23c
TL
3377size_t OSD::get_num_cache_shards()
3378{
3379 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3380}
3381
31f18b77
FG
3382int OSD::get_num_op_shards()
3383{
3384 if (cct->_conf->osd_op_num_shards)
3385 return cct->_conf->osd_op_num_shards;
3386 if (store_is_rotational)
3387 return cct->_conf->osd_op_num_shards_hdd;
3388 else
3389 return cct->_conf->osd_op_num_shards_ssd;
3390}
3391
3392int OSD::get_num_op_threads()
3393{
3394 if (cct->_conf->osd_op_num_threads_per_shard)
3395 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3396 if (store_is_rotational)
3397 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3398 else
3399 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3400}
3401
c07f9fc5
FG
3402float OSD::get_osd_recovery_sleep()
3403{
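  // Prefer an explicitly configured osd_recovery_sleep; otherwise fall back to
  // the ssd / hybrid (rotational store, non-rotational journal) / hdd default.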
3404 if (cct->_conf->osd_recovery_sleep)
3405 return cct->_conf->osd_recovery_sleep;
d2e6a577 3406 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3407 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3408 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3409 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3410 else
3411 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3412}
3413
11fdf7f2
TL
3414float OSD::get_osd_delete_sleep()
3415{
3416 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3417 if (osd_delete_sleep > 0)
3418 return osd_delete_sleep;
3419 if (!store_is_rotational && !journal_is_rotational)
3420 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3421 if (store_is_rotational && !journal_is_rotational)
3422 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3423 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3424}
3425
9f95a23c
TL
3426int OSD::get_recovery_max_active()
3427{
3428 if (cct->_conf->osd_recovery_max_active)
3429 return cct->_conf->osd_recovery_max_active;
3430 if (store_is_rotational)
3431 return cct->_conf->osd_recovery_max_active_hdd;
3432 else
3433 return cct->_conf->osd_recovery_max_active_ssd;
3434}
3435
494da23a
TL
3436float OSD::get_osd_snap_trim_sleep()
3437{
3438 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3439 if (osd_snap_trim_sleep > 0)
3440 return osd_snap_trim_sleep;
3441 if (!store_is_rotational && !journal_is_rotational)
3442 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3443 if (store_is_rotational && !journal_is_rotational)
3444 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3445 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3446}
3447
7c673cae
FG
3448int OSD::init()
3449{
9f95a23c 3450 OSDMapRef osdmap;
7c673cae 3451 CompatSet initial, diff;
11fdf7f2 3452 std::lock_guard lock(osd_lock);
7c673cae
FG
3453 if (is_stopping())
3454 return 0;
20effc67 3455 tracing::osd::tracer.init("osd");
7c673cae
FG
3456 tick_timer.init();
3457 tick_timer_without_osd_lock.init();
3458 service.recovery_request_timer.init();
11fdf7f2
TL
3459 service.sleep_timer.init();
3460
3461 boot_finisher.start();
3462
3463 {
3464 string val;
3465 store->read_meta("require_osd_release", &val);
9f95a23c 3466 last_require_osd_release = ceph_release_from_name(val);
11fdf7f2 3467 }
7c673cae
FG
3468
3469 // mount.
31f18b77
FG
3470 dout(2) << "init " << dev_path
3471 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3472 << dendl;
d2e6a577 3473 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 3474 ceph_assert(store); // call pre_init() first!
7c673cae 3475
9f95a23c 3476 store->set_cache_shards(get_num_cache_shards());
7c673cae 3477
20effc67
TL
3478 int rotating_auth_attempts = 0;
3479 auto rotating_auth_timeout =
3480 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3481
7c673cae
FG
3482 int r = store->mount();
3483 if (r < 0) {
3484 derr << "OSD:init: unable to mount object store" << dendl;
3485 return r;
3486 }
d2e6a577
FG
3487 journal_is_rotational = store->is_journal_rotational();
3488 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3489 << dendl;
7c673cae
FG
3490
3491 enable_disable_fuse(false);
3492
3493 dout(2) << "boot" << dendl;
3494
11fdf7f2 3495 service.meta_ch = store->open_collection(coll_t::meta());
20effc67
TL
3496 if (!service.meta_ch) {
3497 derr << "OSD:init: unable to open meta collection"
3498 << dendl;
3499 r = -ENOENT;
3500 goto out;
3501 }
7c673cae
FG
3502 // initialize the daily loadavg with current 15min loadavg
3503 double loadavgs[3];
3504 if (getloadavg(loadavgs, 3) == 3) {
3505 daily_loadavg = loadavgs[2];
3506 } else {
3507 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3508 daily_loadavg = 1.0;
3509 }
3510
7c673cae
FG
3511 // sanity check long object name handling
3512 {
3513 hobject_t l;
3514 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3515 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3516 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3517 r = store->validate_hobject_key(l);
3518 if (r < 0) {
3519 derr << "backend (" << store->get_type() << ") is unable to support max "
3520 << "object name[space] len" << dendl;
3521 derr << " osd max object name len = "
3522 << cct->_conf->osd_max_object_name_len << dendl;
3523 derr << " osd max object namespace len = "
3524 << cct->_conf->osd_max_object_namespace_len << dendl;
3525 derr << cpp_strerror(r) << dendl;
3526 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3527 goto out;
3528 }
3529 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3530 << dendl;
3531 } else {
3532 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3533 }
3534 }
3535
3536 // read superblock
3537 r = read_superblock();
3538 if (r < 0) {
3539 derr << "OSD::init() : unable to read osd superblock" << dendl;
3540 r = -EINVAL;
3541 goto out;
3542 }
3543
3544 if (osd_compat.compare(superblock.compat_features) < 0) {
3545 derr << "The disk uses features unsupported by the executable." << dendl;
3546 derr << " ondisk features " << superblock.compat_features << dendl;
3547 derr << " daemon features " << osd_compat << dendl;
3548
3549 if (osd_compat.writeable(superblock.compat_features)) {
3550 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3551 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3552 r = -EOPNOTSUPP;
3553 goto out;
3554 }
3555 else {
3556 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3557 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3558 r = -EOPNOTSUPP;
3559 goto out;
3560 }
3561 }
3562
3563 assert_warn(whoami == superblock.whoami);
3564 if (whoami != superblock.whoami) {
3565 derr << "OSD::init: superblock says osd"
3566 << superblock.whoami << " but I am osd." << whoami << dendl;
3567 r = -EINVAL;
3568 goto out;
3569 }
3570
9f95a23c
TL
3571 startup_time = ceph::mono_clock::now();
3572
11fdf7f2 3573 // load up "current" osdmap
9f95a23c
TL
3574 assert_warn(!get_osdmap());
3575 if (get_osdmap()) {
11fdf7f2
TL
3576 derr << "OSD::init: unable to read current osdmap" << dendl;
3577 r = -EINVAL;
3578 goto out;
3579 }
3580 osdmap = get_map(superblock.current_epoch);
9f95a23c 3581 set_osdmap(osdmap);
11fdf7f2
TL
3582
3583 // make sure we don't have legacy pgs deleting
3584 {
3585 vector<coll_t> ls;
3586 int r = store->list_collections(ls);
3587 ceph_assert(r >= 0);
3588 for (auto c : ls) {
3589 spg_t pgid;
3590 if (c.is_pg(&pgid) &&
3591 !osdmap->have_pg_pool(pgid.pool())) {
3592 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3593 if (!store->exists(service.meta_ch, oid)) {
3594 derr << __func__ << " missing pg_pool_t for deleted pool "
3595 << pgid.pool() << " for pg " << pgid
3596 << "; please downgrade to luminous and allow "
3597 << "pg deletion to complete before upgrading" << dendl;
3598 ceph_abort();
3599 }
3600 }
3601 }
3602 }
3603
7c673cae
FG
3604 initial = get_osd_initial_compat_set();
3605 diff = superblock.compat_features.unsupported(initial);
3606 if (superblock.compat_features.merge(initial)) {
9f95a23c
TL
3607 // Are we adding SNAPMAPPER2?
3608 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3609 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3610 << dendl;
3611 auto ch = service.meta_ch;
3612 auto hoid = make_snapmapper_oid();
3613 unsigned max = cct->_conf->osd_target_transaction_size;
20effc67 3614 r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
9f95a23c
TL
3615 if (r < 0)
3616 goto out;
3617 }
7c673cae
FG
3618 // We need to persist the new compat_set before we
3619 // do anything else
3620 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3621 ObjectStore::Transaction t;
3622 write_superblock(t);
11fdf7f2 3623 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3624 if (r < 0)
3625 goto out;
3626 }
3627
3628 // make sure snap mapper object exists
11fdf7f2 3629 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3630 dout(10) << "init creating/touching snapmapper object" << dendl;
3631 ObjectStore::Transaction t;
3632 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3633 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3634 if (r < 0)
3635 goto out;
3636 }
9f95a23c
TL
3637 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3638 dout(10) << "init creating/touching purged_snaps object" << dendl;
3639 ObjectStore::Transaction t;
3640 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3641 r = store->queue_transaction(service.meta_ch, std::move(t));
3642 if (r < 0)
3643 goto out;
3644 }
7c673cae
FG
3645
3646 if (cct->_conf->osd_open_classes_on_start) {
9f95a23c 3647 int r = ClassHandler::get_instance().open_all_classes();
7c673cae
FG
3648 if (r)
3649 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3650 }
3651
11fdf7f2 3652 check_osdmap_features();
7c673cae 3653
7c673cae
FG
3654 {
3655 epoch_t bind_epoch = osdmap->get_epoch();
3656 service.set_epochs(NULL, NULL, &bind_epoch);
3657 }
3658
3659 clear_temp_objects();
3660
d2e6a577 3661 // initialize osdmap references in sharded wq
11fdf7f2
TL
3662 for (auto& shard : shards) {
3663 std::lock_guard l(shard->osdmap_lock);
3664 shard->shard_osdmap = osdmap;
3665 }
d2e6a577 3666
7c673cae
FG
3667 // load up pgs (as they previously existed)
3668 load_pgs();
3669
3670 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
7c673cae 3671
f67539c2
TL
3672 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3673 dout(2) << "compacting object store's omap" << dendl;
3674 store->compact();
3675 }
7c673cae 3676
11fdf7f2
TL
3677 // prime osd stats
3678 {
3679 struct store_statfs_t stbuf;
3680 osd_alert_list_t alerts;
3681 int r = store->statfs(&stbuf, &alerts);
3682 ceph_assert(r == 0);
3683 service.set_statfs(stbuf, alerts);
3684 }
3685
f67539c2 3686 // client_messenger's auth_client will be set up by monc->init() later.
11fdf7f2
TL
3687 for (auto m : { cluster_messenger,
3688 objecter_messenger,
3689 hb_front_client_messenger,
3690 hb_back_client_messenger,
3691 hb_front_server_messenger,
3692 hb_back_server_messenger } ) {
3693 m->set_auth_client(monc);
3694 }
3695 for (auto m : { client_messenger,
3696 cluster_messenger,
3697 hb_front_server_messenger,
3698 hb_back_server_messenger }) {
3699 m->set_auth_server(monc);
3700 }
3701 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3702
3703 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3704 | CEPH_ENTITY_TYPE_MGR);
3705 r = monc->init();
3706 if (r < 0)
3707 goto out;
3708
f67539c2 3709 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
11fdf7f2 3710 mgrc.set_perf_metric_query_cb(
9f95a23c
TL
3711 [this](const ConfigPayload &config_payload) {
3712 set_perf_queries(config_payload);
11fdf7f2 3713 },
9f95a23c
TL
3714 [this] {
3715 return get_perf_reports();
11fdf7f2 3716 });
7c673cae 3717 mgrc.init();
7c673cae
FG
3718
3719 // tell monc about log_client so it will know about mon session resets
3720 monc->set_log_client(&log_client);
3721 update_log_config();
3722
11fdf7f2
TL
3723 // i'm ready!
3724 client_messenger->add_dispatcher_tail(&mgrc);
3725 client_messenger->add_dispatcher_tail(this);
3726 cluster_messenger->add_dispatcher_head(this);
3727
3728 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3729 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3730 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3731 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3732
9f95a23c 3733 objecter_messenger->add_dispatcher_head(service.objecter.get());
11fdf7f2 3734
28e407b8
AA
3735 service.init();
3736 service.publish_map(osdmap);
3737 service.publish_superblock(superblock);
3738 service.max_oldest_map = superblock.oldest_map;
3739
11fdf7f2
TL
3740 for (auto& shard : shards) {
3741 // put PGs in a temporary set because we may modify pg_slots
3742 // unordered_map below.
3743 set<PGRef> pgs;
3744 for (auto& i : shard->pg_slots) {
3745 PGRef pg = i.second->pg;
3746 if (!pg) {
3747 continue;
3748 }
3749 pgs.insert(pg);
3750 }
3751 for (auto pg : pgs) {
9f95a23c 3752 std::scoped_lock l{*pg};
11fdf7f2
TL
3753 set<pair<spg_t,epoch_t>> new_children;
3754 set<pair<spg_t,epoch_t>> merge_pgs;
3755 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3756 &new_children, &merge_pgs);
3757 if (!new_children.empty()) {
3758 for (auto shard : shards) {
3759 shard->prime_splits(osdmap, &new_children);
3760 }
3761 assert(new_children.empty());
3762 }
3763 if (!merge_pgs.empty()) {
3764 for (auto shard : shards) {
3765 shard->prime_merges(osdmap, &merge_pgs);
3766 }
3767 assert(merge_pgs.empty());
3768 }
11fdf7f2
TL
3769 }
3770 }
3771
7c673cae 3772 osd_op_tp.start();
7c673cae 3773
7c673cae
FG
3774 // start the heartbeat
3775 heartbeat_thread.create("osd_srv_heartbt");
3776
3777 // tick
91327a77
AA
3778 tick_timer.add_event_after(get_tick_interval(),
3779 new C_Tick(this));
7c673cae 3780 {
11fdf7f2 3781 std::lock_guard l(tick_timer_lock);
91327a77
AA
3782 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3783 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3784 }
3785
9f95a23c 3786 osd_lock.unlock();
7c673cae
FG
3787
3788 r = monc->authenticate();
3789 if (r < 0) {
c07f9fc5
FG
3790 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3791 << dendl;
11fdf7f2 3792 exit(1);
7c673cae
FG
3793 }
3794
11fdf7f2 3795 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3796 derr << "unable to obtain rotating service keys; retrying" << dendl;
3797 ++rotating_auth_attempts;
11fdf7f2 3798 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
c07f9fc5 3799 derr << __func__ << " wait_auth_rotating timed out" << dendl;
11fdf7f2 3800 exit(1);
7c673cae
FG
3801 }
3802 }
3803
3804 r = update_crush_device_class();
3805 if (r < 0) {
d2e6a577
FG
3806 derr << __func__ << " unable to update_crush_device_class: "
3807 << cpp_strerror(r) << dendl;
11fdf7f2 3808 exit(1);
7c673cae
FG
3809 }
3810
3811 r = update_crush_location();
3812 if (r < 0) {
d2e6a577 3813 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3814 << cpp_strerror(r) << dendl;
11fdf7f2 3815 exit(1);
7c673cae
FG
3816 }
3817
9f95a23c 3818 osd_lock.lock();
7c673cae
FG
3819 if (is_stopping())
3820 return 0;
3821
3822 // start objecter *after* we have authenticated, so that we don't ignore
3823 // the OSDMaps it requests.
3824 service.final_init();
3825
3826 check_config();
3827
3828 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3829 consume_map();
7c673cae
FG
3830
3831 dout(0) << "done with init, starting boot process" << dendl;
3832
3833 // subscribe to any pg creations
3834 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3835
3836 // MgrClient needs this (it doesn't have MonClient reference itself)
3837 monc->sub_want("mgrmap", 0, 0);
3838
3839 // we don't need to ask for an osdmap here; objecter will
3840 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3841
3842 monc->renew_subs();
3843
3844 start_boot();
3845
a4b75251
TL
3846 // Override a few options if mclock scheduler is enabled.
3847 maybe_override_max_osd_capacity_for_qos();
3848 maybe_override_options_for_qos();
3849
7c673cae 3850 return 0;
7c673cae
FG
3851
3852out:
3853 enable_disable_fuse(true);
3854 store->umount();
20effc67 3855 store.reset();
7c673cae
FG
3856 return r;
3857}
3858
3859void OSD::final_init()
3860{
3861 AdminSocket *admin_socket = cct->get_admin_socket();
3862 asok_hook = new OSDSocketHook(this);
9f95a23c 3863 int r = admin_socket->register_command("status", asok_hook,
7c673cae 3864 "high-level status of OSD");
11fdf7f2 3865 ceph_assert(r == 0);
9f95a23c 3866 r = admin_socket->register_command("flush_journal",
7c673cae
FG
3867 asok_hook,
3868 "flush the journal to permanent store");
11fdf7f2 3869 ceph_assert(r == 0);
9f95a23c 3870 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
3871 "name=filterstr,type=CephString,n=N,req=false",
3872 asok_hook,
7c673cae 3873 "show the ops currently in flight");
11fdf7f2 3874 ceph_assert(r == 0);
9f95a23c 3875 r = admin_socket->register_command("ops " \
c07f9fc5
FG
3876 "name=filterstr,type=CephString,n=N,req=false",
3877 asok_hook,
7c673cae 3878 "show the ops currently in flight");
11fdf7f2 3879 ceph_assert(r == 0);
9f95a23c 3880 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
3881 "name=filterstr,type=CephString,n=N,req=false",
3882 asok_hook,
7c673cae 3883 "show the blocked ops currently in flight");
11fdf7f2 3884 ceph_assert(r == 0);
9f95a23c 3885 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 3886 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3887 asok_hook,
3888 "show recent ops");
11fdf7f2 3889 ceph_assert(r == 0);
9f95a23c 3890 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 3891 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3892 asok_hook,
3893 "show slowest recent ops");
11fdf7f2 3894 ceph_assert(r == 0);
9f95a23c 3895 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 3896 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3897 asok_hook,
3898 "show slowest recent ops, sorted by duration");
11fdf7f2 3899 ceph_assert(r == 0);
9f95a23c 3900 r = admin_socket->register_command("dump_op_pq_state",
7c673cae 3901 asok_hook,
20effc67 3902 "dump op queue state");
11fdf7f2 3903 ceph_assert(r == 0);
f67539c2 3904 r = admin_socket->register_command("dump_blocklist",
7c673cae 3905 asok_hook,
f67539c2 3906 "dump blocklisted clients and times");
11fdf7f2 3907 ceph_assert(r == 0);
9f95a23c 3908 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
3909 asok_hook,
3910 "show clients which have active watches,"
3911 " and on which objects");
11fdf7f2 3912 ceph_assert(r == 0);
9f95a23c 3913 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
3914 asok_hook,
3915 "show recovery reservations");
11fdf7f2 3916 ceph_assert(r == 0);
9f95a23c 3917 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 3918 asok_hook,
f6b5b4d7 3919 "show scrub reservations");
eafe8130 3920 ceph_assert(r == 0);
9f95a23c 3921 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
3922 asok_hook,
3923 "force osd to update the latest map from "
3924 "the mon");
11fdf7f2 3925 ceph_assert(r == 0);
7c673cae 3926
9f95a23c 3927 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
3928 "name=property,type=CephString " \
3929 "name=value,type=CephInt",
3930 asok_hook,
3931 "update malloc extension heap property");
11fdf7f2 3932 ceph_assert(r == 0);
7c673cae 3933
9f95a23c 3934 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
3935 "name=property,type=CephString",
3936 asok_hook,
3937 "get malloc extension heap property");
11fdf7f2 3938 ceph_assert(r == 0);
7c673cae
FG
3939
3940 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
3941 asok_hook,
3942 "print statistics of kvdb which used by bluestore");
11fdf7f2 3943 ceph_assert(r == 0);
7c673cae
FG
3944
3945 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
3946 asok_hook,
3947 "print scheduled scrubs");
11fdf7f2 3948 ceph_assert(r == 0);
7c673cae
FG
3949
3950 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
3951 asok_hook,
3952 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3953 ceph_assert(r == 0);
7c673cae
FG
3954
3955 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
3956 asok_hook,
3957 "Flush bluestore internal cache");
11fdf7f2 3958 ceph_assert(r == 0);
9f95a23c 3959 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
3960 asok_hook,
3961 "show recent state history");
11fdf7f2 3962 ceph_assert(r == 0);
7c673cae 3963
9f95a23c 3964 r = admin_socket->register_command("compact",
224ce89b
WB
3965 asok_hook,
3966 "Commpact object store's omap."
3967 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
3968 ceph_assert(r == 0);
3969
9f95a23c 3970 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
3971 asok_hook,
3972 "dump pools whose PG(s) are mapped to this OSD.");
3973
3974 ceph_assert(r == 0);
3975
9f95a23c 3976 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
3977 asok_hook,
3978 "probe OSD devices for SMART data.");
3979
3980 ceph_assert(r == 0);
3981
9f95a23c 3982 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
3983 asok_hook,
3984 "list OSD devices.");
9f95a23c 3985 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
3986 asok_hook,
3987 "send OSD beacon to mon immediately");
224ce89b 3988
9f95a23c
TL
3989 r = admin_socket->register_command(
3990 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3991 "Dump osd heartbeat network ping times");
eafe8130
TL
3992 ceph_assert(r == 0);
3993
20effc67
TL
3994 r = admin_socket->register_command(
3995 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
3996 "Dump store's statistics for the given pool");
3997 ceph_assert(r == 0);
3998
3999 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
7c673cae
FG
4000 // Note: pools are CephString instead of CephPoolname because
4001 // these commands traditionally support both pool names and numbers
4002 r = admin_socket->register_command(
7c673cae
FG
4003 "setomapval " \
4004 "name=pool,type=CephString " \
4005 "name=objname,type=CephObjectname " \
4006 "name=key,type=CephString "\
4007 "name=val,type=CephString",
4008 test_ops_hook,
4009 "set omap key");
11fdf7f2 4010 ceph_assert(r == 0);
7c673cae 4011 r = admin_socket->register_command(
7c673cae
FG
4012 "rmomapkey " \
4013 "name=pool,type=CephString " \
4014 "name=objname,type=CephObjectname " \
4015 "name=key,type=CephString",
4016 test_ops_hook,
4017 "remove omap key");
11fdf7f2 4018 ceph_assert(r == 0);
7c673cae 4019 r = admin_socket->register_command(
7c673cae
FG
4020 "setomapheader " \
4021 "name=pool,type=CephString " \
4022 "name=objname,type=CephObjectname " \
4023 "name=header,type=CephString",
4024 test_ops_hook,
4025 "set omap header");
11fdf7f2 4026 ceph_assert(r == 0);
7c673cae
FG
4027
4028 r = admin_socket->register_command(
7c673cae
FG
4029 "getomap " \
4030 "name=pool,type=CephString " \
4031 "name=objname,type=CephObjectname",
4032 test_ops_hook,
4033 "output entire object map");
11fdf7f2 4034 ceph_assert(r == 0);
7c673cae
FG
4035
4036 r = admin_socket->register_command(
7c673cae
FG
4037 "truncobj " \
4038 "name=pool,type=CephString " \
4039 "name=objname,type=CephObjectname " \
4040 "name=len,type=CephInt",
4041 test_ops_hook,
4042 "truncate object to length");
11fdf7f2 4043 ceph_assert(r == 0);
7c673cae
FG
4044
4045 r = admin_socket->register_command(
7c673cae
FG
4046 "injectdataerr " \
4047 "name=pool,type=CephString " \
4048 "name=objname,type=CephObjectname " \
4049 "name=shardid,type=CephInt,req=false,range=0|255",
4050 test_ops_hook,
4051 "inject data error to an object");
11fdf7f2 4052 ceph_assert(r == 0);
7c673cae
FG
4053
4054 r = admin_socket->register_command(
7c673cae
FG
4055 "injectmdataerr " \
4056 "name=pool,type=CephString " \
4057 "name=objname,type=CephObjectname " \
4058 "name=shardid,type=CephInt,req=false,range=0|255",
4059 test_ops_hook,
4060 "inject metadata error to an object");
11fdf7f2 4061 ceph_assert(r == 0);
7c673cae 4062 r = admin_socket->register_command(
7c673cae
FG
4063 "set_recovery_delay " \
4064 "name=utime,type=CephInt,req=false",
4065 test_ops_hook,
4066 "Delay osd recovery by specified seconds");
11fdf7f2 4067 ceph_assert(r == 0);
7c673cae 4068 r = admin_socket->register_command(
7c673cae
FG
4069 "injectfull " \
4070 "name=type,type=CephString,req=false " \
4071 "name=count,type=CephInt,req=false ",
4072 test_ops_hook,
4073 "Inject a full disk (optional count times)");
11fdf7f2 4074 ceph_assert(r == 0);
9f95a23c
TL
4075 r = admin_socket->register_command(
4076 "bench " \
4077 "name=count,type=CephInt,req=false " \
4078 "name=size,type=CephInt,req=false " \
4079 "name=object_size,type=CephInt,req=false " \
4080 "name=object_num,type=CephInt,req=false ",
4081 asok_hook,
4082 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4083 "(default count=1G default size=4MB). Results in log.");
4084 ceph_assert(r == 0);
4085 r = admin_socket->register_command(
4086 "cluster_log " \
4087 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4088 "name=message,type=CephString,n=N",
4089 asok_hook,
4090 "log a message to the cluster log");
4091 ceph_assert(r == 0);
4092 r = admin_socket->register_command(
4093 "flush_pg_stats",
4094 asok_hook,
4095 "flush pg stats");
4096 ceph_assert(r == 0);
4097 r = admin_socket->register_command(
4098 "heap " \
4099 "name=heapcmd,type=CephChoices,strings=" \
4100 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4101 "name=value,type=CephString,req=false",
4102 asok_hook,
4103 "show heap usage info (available only if compiled with tcmalloc)");
4104 ceph_assert(r == 0);
4105 r = admin_socket->register_command(
4106 "debug dump_missing " \
4107 "name=filename,type=CephFilepath",
4108 asok_hook,
4109 "dump missing objects to a named file");
4110 ceph_assert(r == 0);
4111 r = admin_socket->register_command(
4112 "debug kick_recovery_wq " \
4113 "name=delay,type=CephInt,range=0",
4114 asok_hook,
4115 "set osd_recovery_delay_start to <val>");
4116 ceph_assert(r == 0);
4117 r = admin_socket->register_command(
4118 "cpu_profiler " \
4119 "name=arg,type=CephChoices,strings=status|flush",
4120 asok_hook,
4121 "run cpu profiling on daemon");
4122 ceph_assert(r == 0);
4123 r = admin_socket->register_command(
4124 "dump_pg_recovery_stats",
4125 asok_hook,
4126 "dump pg recovery statistics");
4127 ceph_assert(r == 0);
4128 r = admin_socket->register_command(
4129 "reset_pg_recovery_stats",
4130 asok_hook,
4131 "reset pg recovery statistics");
4132 ceph_assert(r == 0);
4133 r = admin_socket->register_command(
4134 "cache drop",
4135 asok_hook,
4136 "Drop all OSD caches");
4137 ceph_assert(r == 0);
4138 r = admin_socket->register_command(
4139 "cache status",
4140 asok_hook,
4141 "Get OSD caches statistics");
4142 ceph_assert(r == 0);
4143 r = admin_socket->register_command(
4144 "scrub_purged_snaps",
4145 asok_hook,
4146 "Scrub purged_snaps vs snapmapper index");
4147 ceph_assert(r == 0);
20effc67
TL
4148 r = admin_socket->register_command(
4149 "scrubdebug " \
4150 "name=pgid,type=CephPgid " \
4151 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4152 "name=value,type=CephString,req=false",
4153 asok_hook,
4154 "debug the scrubber");
4155 ceph_assert(r == 0);
7c673cae 4156
9f95a23c
TL
4157 // -- pg commands --
4158 // old form: ceph pg <pgid> command ...
4159 r = admin_socket->register_command(
4160 "pg " \
4161 "name=pgid,type=CephPgid " \
4162 "name=cmd,type=CephChoices,strings=query",
4163 asok_hook,
4164 "");
4165 ceph_assert(r == 0);
4166 r = admin_socket->register_command(
4167 "pg " \
4168 "name=pgid,type=CephPgid " \
4169 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4170 "name=mulcmd,type=CephChoices,strings=revert|delete",
4171 asok_hook,
4172 "");
4173 ceph_assert(r == 0);
4174 r = admin_socket->register_command(
4175 "pg " \
4176 "name=pgid,type=CephPgid " \
4177 "name=cmd,type=CephChoices,strings=list_unfound " \
4178 "name=offset,type=CephString,req=false",
4179 asok_hook,
4180 "");
4181 ceph_assert(r == 0);
4182 r = admin_socket->register_command(
4183 "pg " \
4184 "name=pgid,type=CephPgid " \
4185 "name=cmd,type=CephChoices,strings=scrub " \
4186 "name=time,type=CephInt,req=false",
4187 asok_hook,
4188 "");
4189 ceph_assert(r == 0);
4190 r = admin_socket->register_command(
4191 "pg " \
4192 "name=pgid,type=CephPgid " \
4193 "name=cmd,type=CephChoices,strings=deep_scrub " \
4194 "name=time,type=CephInt,req=false",
4195 asok_hook,
4196 "");
4197 ceph_assert(r == 0);
4198 // new form: tell <pgid> <cmd> for both cli and rest
4199 r = admin_socket->register_command(
4200 "query",
4201 asok_hook,
4202 "show details of a specific pg");
4203 ceph_assert(r == 0);
4204 r = admin_socket->register_command(
4205 "mark_unfound_lost " \
4206 "name=pgid,type=CephPgid,req=false " \
4207 "name=mulcmd,type=CephChoices,strings=revert|delete",
4208 asok_hook,
4209 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4210 ceph_assert(r == 0);
4211 r = admin_socket->register_command(
4212 "list_unfound " \
4213 "name=pgid,type=CephPgid,req=false " \
4214 "name=offset,type=CephString,req=false",
4215 asok_hook,
4216 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4217 ceph_assert(r == 0);
4218 r = admin_socket->register_command(
4219 "scrub " \
4220 "name=pgid,type=CephPgid,req=false " \
4221 "name=time,type=CephInt,req=false",
4222 asok_hook,
4223 "Trigger a scheduled scrub ");
4224 ceph_assert(r == 0);
4225 r = admin_socket->register_command(
4226 "deep_scrub " \
4227 "name=pgid,type=CephPgid,req=false " \
4228 "name=time,type=CephInt,req=false",
4229 asok_hook,
4230 "Trigger a scheduled deep scrub ");
4231 ceph_assert(r == 0);
4232}
7c673cae 4233
f67539c2 4234PerfCounters* OSD::create_logger()
9f95a23c 4235{
f67539c2 4236 PerfCounters* logger = build_osd_logger(cct);
7c673cae 4237 cct->get_perfcounters_collection()->add(logger);
f67539c2 4238 return logger;
7c673cae
FG
4239}
4240
f67539c2 4241PerfCounters* OSD::create_recoverystate_perf()
7c673cae 4242{
f67539c2 4243 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
7c673cae 4244 cct->get_perfcounters_collection()->add(recoverystate_perf);
f67539c2 4245 return recoverystate_perf;
7c673cae
FG
4246}
4247
4248int OSD::shutdown()
4249{
1d09f67e
TL
4250 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4251 //cct->_conf->osd_fast_shutdown = true;
4252
4253 dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
4254 << cct->_conf->osd_fast_shutdown
4255 << ", null-fm = " << store->has_null_manager() << dendl;
4256
4257 utime_t start_time_func = ceph_clock_now();
4258
92f5a8d4
TL
4259 if (cct->_conf->osd_fast_shutdown) {
4260 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
f67539c2
TL
4261 if (cct->_conf->osd_fast_shutdown_notify_mon)
4262 service.prepare_to_stop();
92f5a8d4 4263
1d09f67e
TL
4264 // There is no state we need to keep when running in NULL-FM mode
4265 if (!store->has_null_manager()) {
4266 cct->_log->flush();
4267 _exit(0);
4268 }
4269 } else if (!service.prepare_to_stop()) {
7c673cae 4270 return 0; // already shutting down
1d09f67e
TL
4271 }
4272
9f95a23c 4273 osd_lock.lock();
7c673cae 4274 if (is_stopping()) {
9f95a23c 4275 osd_lock.unlock();
7c673cae
FG
4276 return 0;
4277 }
7c673cae 4278
1d09f67e
TL
4279 if (!cct->_conf->osd_fast_shutdown) {
4280 dout(0) << "shutdown" << dendl;
4281 }
4282
4283 // don't accept new task for this OSD
7c673cae
FG
4284 set_state(STATE_STOPPING);
4285
1d09f67e
TL
4286 // Debug logging is disabled during fast shutdown
4287 if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
11fdf7f2
TL
4288 cct->_conf.set_val("debug_osd", "100");
4289 cct->_conf.set_val("debug_journal", "100");
4290 cct->_conf.set_val("debug_filestore", "100");
4291 cct->_conf.set_val("debug_bluestore", "100");
4292 cct->_conf.set_val("debug_ms", "100");
4293 cct->_conf.apply_changes(nullptr);
3efd9988 4294 }
7c673cae 4295
1d09f67e
TL
4296 if (cct->_conf->osd_fast_shutdown) {
4297 // first, stop new task from being taken from op_shardedwq
4298 // and clear all pending tasks
4299 op_shardedwq.stop_for_fast_shutdown();
4300
4301 utime_t start_time_timer = ceph_clock_now();
4302 tick_timer.shutdown();
4303 {
4304 std::lock_guard l(tick_timer_lock);
4305 tick_timer_without_osd_lock.shutdown();
4306 }
4307
4308 osd_lock.unlock();
4309 utime_t start_time_osd_drain = ceph_clock_now();
4310
4311 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4312 osd_op_tp.drain();
4313 osd_op_tp.stop();
4314
4315 utime_t start_time_umount = ceph_clock_now();
4316 store->prepare_for_fast_shutdown();
4317 std::lock_guard lock(osd_lock);
4318 // TBD: assert in allocator that nothing is being added
4319 store->umount();
4320
4321 utime_t end_time = ceph_clock_now();
4322 if (cct->_conf->osd_fast_shutdown_timeout) {
4323 ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4324 }
4325 dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
4326 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
4327 dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
4328 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
4329 cct->_log->flush();
4330
4331 // now it is safe to exit
4332 _exit(0);
4333 }
4334
7c673cae
FG
4335 // stop MgrClient earlier as it's more like an internal consumer of OSD
4336 mgrc.shutdown();
4337
4338 service.start_shutdown();
4339
4340 // stop sending work to pgs. this just prevents any new work in _process
4341 // from racing with on_shutdown and potentially entering the pg after.
4342 op_shardedwq.drain();
4343
4344 // Shutdown PGs
4345 {
11fdf7f2
TL
4346 vector<PGRef> pgs;
4347 _get_pgs(&pgs);
4348 for (auto pg : pgs) {
4349 pg->shutdown();
7c673cae
FG
4350 }
4351 }
7c673cae
FG
4352
4353 // drain op queue again (in case PGs requeued something)
4354 op_shardedwq.drain();
4355 {
4356 finished.clear(); // zap waiters (bleh, this is messy)
11fdf7f2 4357 waiting_for_osdmap.clear();
7c673cae
FG
4358 }
4359
7c673cae 4360 // unregister commands
11fdf7f2 4361 cct->get_admin_socket()->unregister_commands(asok_hook);
7c673cae
FG
4362 delete asok_hook;
4363 asok_hook = NULL;
4364
11fdf7f2 4365 cct->get_admin_socket()->unregister_commands(test_ops_hook);
7c673cae
FG
4366 delete test_ops_hook;
4367 test_ops_hook = NULL;
4368
9f95a23c 4369 osd_lock.unlock();
7c673cae 4370
9f95a23c
TL
4371 {
4372 std::lock_guard l{heartbeat_lock};
4373 heartbeat_stop = true;
4374 heartbeat_cond.notify_all();
4375 heartbeat_peers.clear();
4376 }
7c673cae
FG
4377 heartbeat_thread.join();
4378
9f95a23c
TL
4379 hb_back_server_messenger->mark_down_all();
4380 hb_front_server_messenger->mark_down_all();
4381 hb_front_client_messenger->mark_down_all();
4382 hb_back_client_messenger->mark_down_all();
4383
7c673cae
FG
4384 osd_op_tp.drain();
4385 osd_op_tp.stop();
4386 dout(10) << "op sharded tp stopped" << dendl;
4387
7c673cae
FG
4388 dout(10) << "stopping agent" << dendl;
4389 service.agent_stop();
4390
11fdf7f2
TL
4391 boot_finisher.wait_for_empty();
4392
9f95a23c 4393 osd_lock.lock();
7c673cae 4394
11fdf7f2 4395 boot_finisher.stop();
494da23a 4396 reset_heartbeat_peers(true);
7c673cae
FG
4397
4398 tick_timer.shutdown();
4399
4400 {
11fdf7f2 4401 std::lock_guard l(tick_timer_lock);
7c673cae
FG
4402 tick_timer_without_osd_lock.shutdown();
4403 }
4404
4405 // note unmount epoch
9f95a23c 4406 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
7c673cae 4407 superblock.mounted = service.get_boot_epoch();
9f95a23c 4408 superblock.clean_thru = get_osdmap_epoch();
7c673cae
FG
4409 ObjectStore::Transaction t;
4410 write_superblock(t);
11fdf7f2 4411 int r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4412 if (r) {
4413 derr << "OSD::shutdown: error writing superblock: "
4414 << cpp_strerror(r) << dendl;
4415 }
4416
4417
31f18b77
FG
4418 service.shutdown_reserver();
4419
7c673cae
FG
4420 // Remove PGs
4421#ifdef PG_DEBUG_REFS
4422 service.dump_live_pgids();
4423#endif
11fdf7f2
TL
4424 while (true) {
4425 vector<PGRef> pgs;
4426 _get_pgs(&pgs, true);
4427 if (pgs.empty()) {
4428 break;
4429 }
4430 for (auto& pg : pgs) {
4431 if (pg->is_deleted()) {
4432 continue;
4433 }
4434 dout(20) << " kicking pg " << pg << dendl;
4435 pg->lock();
4436 if (pg->get_num_ref() != 1) {
4437 derr << "pgid " << pg->get_pgid() << " has ref count of "
4438 << pg->get_num_ref() << dendl;
7c673cae 4439#ifdef PG_DEBUG_REFS
11fdf7f2 4440 pg->dump_live_ids();
7c673cae 4441#endif
31f18b77
FG
4442 if (cct->_conf->osd_shutdown_pgref_assert) {
4443 ceph_abort();
4444 }
7c673cae 4445 }
11fdf7f2
TL
4446 pg->ch.reset();
4447 pg->unlock();
7c673cae 4448 }
7c673cae
FG
4449 }
4450#ifdef PG_DEBUG_REFS
4451 service.dump_live_pgids();
4452#endif
f64942e4 4453
9f95a23c 4454 osd_lock.unlock();
11fdf7f2 4455 cct->_conf.remove_observer(this);
9f95a23c 4456 osd_lock.lock();
7c673cae 4457
11fdf7f2
TL
4458 service.meta_ch.reset();
4459
7c673cae
FG
4460 dout(10) << "syncing store" << dendl;
4461 enable_disable_fuse(true);
4462
4463 if (cct->_conf->osd_journal_flush_on_shutdown) {
4464 dout(10) << "flushing journal" << dendl;
4465 store->flush_journal();
4466 }
4467
7c673cae 4468 monc->shutdown();
9f95a23c
TL
4469 osd_lock.unlock();
4470 {
4471 std::unique_lock l{map_lock};
4472 set_osdmap(OSDMapRef());
4473 }
11fdf7f2
TL
4474 for (auto s : shards) {
4475 std::lock_guard l(s->osdmap_lock);
4476 s->shard_osdmap = OSDMapRef();
4477 }
7c673cae 4478 service.shutdown();
11fdf7f2
TL
4479
4480 std::lock_guard lock(osd_lock);
4481 store->umount();
20effc67 4482 store.reset();
11fdf7f2
TL
4483 dout(10) << "Store synced" << dendl;
4484
7c673cae
FG
4485 op_tracker.on_shutdown();
4486
9f95a23c 4487 ClassHandler::get_instance().shutdown();
7c673cae
FG
4488 client_messenger->shutdown();
4489 cluster_messenger->shutdown();
4490 hb_front_client_messenger->shutdown();
4491 hb_back_client_messenger->shutdown();
4492 objecter_messenger->shutdown();
4493 hb_front_server_messenger->shutdown();
4494 hb_back_server_messenger->shutdown();
4495
1d09f67e
TL
4496 utime_t duration = ceph_clock_now() - start_time_func;
4497 dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
4498
20effc67
TL
4499 tracing::osd::tracer.shutdown();
4500
7c673cae
FG
4501 return r;
4502}
4503
4504int OSD::mon_cmd_maybe_osd_create(string &cmd)
4505{
4506 bool created = false;
4507 while (true) {
4508 dout(10) << __func__ << " cmd: " << cmd << dendl;
4509 vector<string> vcmd{cmd};
4510 bufferlist inbl;
4511 C_SaferCond w;
4512 string outs;
4513 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4514 int r = w.wait();
4515 if (r < 0) {
4516 if (r == -ENOENT && !created) {
4517 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4518 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4519 vector<string> vnewcmd{newcmd};
4520 bufferlist inbl;
4521 C_SaferCond w;
4522 string outs;
4523 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4524 int r = w.wait();
4525 if (r < 0) {
4526 derr << __func__ << " fail: osd does not exist and create failed: "
4527 << cpp_strerror(r) << dendl;
4528 return r;
4529 }
4530 created = true;
4531 continue;
4532 }
4533 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4534 return r;
4535 }
4536 break;
4537 }
4538
4539 return 0;
4540}
4541
4542int OSD::update_crush_location()
4543{
4544 if (!cct->_conf->osd_crush_update_on_start) {
4545 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4546 return 0;
4547 }
4548
4549 char weight[32];
4550 if (cct->_conf->osd_crush_initial_weight >= 0) {
4551 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4552 } else {
4553 struct store_statfs_t st;
11fdf7f2
TL
4554 osd_alert_list_t alerts;
4555 int r = store->statfs(&st, &alerts);
7c673cae
FG
4556 if (r < 0) {
4557 derr << "statfs: " << cpp_strerror(r) << dendl;
4558 return r;
4559 }
4560 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4561 std::max(.00001,
4562 double(st.total) /
4563 double(1ull << 40 /* TB */)));
7c673cae
FG
4564 }
4565
9f95a23c 4566 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
7c673cae
FG
4567
4568 string cmd =
4569 string("{\"prefix\": \"osd crush create-or-move\", ") +
9f95a23c
TL
4570 string("\"id\": ") + stringify(whoami) + ", " +
4571 string("\"weight\":") + weight + ", " +
4572 string("\"args\": [") + stringify(cct->crush_location) + "]}";
7c673cae
FG
4573 return mon_cmd_maybe_osd_create(cmd);
4574}
4575
4576int OSD::update_crush_device_class()
4577{
224ce89b
WB
4578 if (!cct->_conf->osd_class_update_on_start) {
4579 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4580 return 0;
4581 }
4582
7c673cae
FG
4583 string device_class;
4584 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4585 if (r < 0 || device_class.empty()) {
4586 device_class = store->get_default_device_class();
4587 }
4588
4589 if (device_class.empty()) {
d2e6a577 4590 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4591 return 0;
224ce89b 4592 }
7c673cae
FG
4593
4594 string cmd =
4595 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4596 string("\"class\": \"") + device_class + string("\", ") +
4597 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4598
224ce89b 4599 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4600 if (r == -EBUSY) {
4601 // good, already bound to a device-class
4602 return 0;
4603 } else {
4604 return r;
4605 }
7c673cae
FG
4606}
4607
4608void OSD::write_superblock(ObjectStore::Transaction& t)
4609{
4610 dout(10) << "write_superblock " << superblock << dendl;
4611
4612 //hack: at minimum it's using the baseline feature set
4613 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4614 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4615
4616 bufferlist bl;
11fdf7f2 4617 encode(superblock, bl);
7c673cae
FG
4618 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4619}
4620
4621int OSD::read_superblock()
4622{
4623 bufferlist bl;
11fdf7f2 4624 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4625 if (r < 0)
4626 return r;
4627
11fdf7f2
TL
4628 auto p = bl.cbegin();
4629 decode(superblock, p);
7c673cae
FG
4630
4631 dout(10) << "read_superblock " << superblock << dendl;
4632
4633 return 0;
4634}
4635
4636void OSD::clear_temp_objects()
4637{
4638 dout(10) << __func__ << dendl;
4639 vector<coll_t> ls;
4640 store->list_collections(ls);
4641 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4642 spg_t pgid;
4643 if (!p->is_pg(&pgid))
4644 continue;
4645
4646 // list temp objects
4647 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4648
4649 vector<ghobject_t> temps;
4650 ghobject_t next;
4651 while (1) {
4652 vector<ghobject_t> objects;
11fdf7f2
TL
4653 auto ch = store->open_collection(*p);
4654 ceph_assert(ch);
4655 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4656 store->get_ideal_list_max(),
4657 &objects, &next);
4658 if (objects.empty())
4659 break;
4660 vector<ghobject_t>::iterator q;
4661 for (q = objects.begin(); q != objects.end(); ++q) {
4662 // Hammer set pool for temps to -1, so check for clean-up
4663 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4664 temps.push_back(*q);
4665 } else {
4666 break;
4667 }
4668 }
4669 // If we saw a non-temp object and hit the break above we can
4670 // break out of the while loop too.
4671 if (q != objects.end())
4672 break;
4673 }
4674 if (!temps.empty()) {
4675 ObjectStore::Transaction t;
4676 int removed = 0;
4677 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4678 dout(20) << " removing " << *p << " object " << *q << dendl;
4679 t.remove(*p, *q);
4680 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4681 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4682 t = ObjectStore::Transaction();
4683 removed = 0;
4684 }
4685 }
4686 if (removed) {
11fdf7f2 4687 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4688 }
4689 }
4690 }
4691}
4692
4693void OSD::recursive_remove_collection(CephContext* cct,
4694 ObjectStore *store, spg_t pgid,
4695 coll_t tmp)
4696{
4697 OSDriver driver(
4698 store,
4699 coll_t(),
4700 make_snapmapper_oid());
4701
11fdf7f2 4702 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
7c673cae
FG
4703 ObjectStore::Transaction t;
4704 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4705
11fdf7f2
TL
4706 ghobject_t next;
4707 int max = cct->_conf->osd_target_transaction_size;
7c673cae 4708 vector<ghobject_t> objects;
11fdf7f2
TL
4709 objects.reserve(max);
4710 while (true) {
4711 objects.clear();
4712 store->collection_list(ch, next, ghobject_t::get_max(),
4713 max, &objects, &next);
4714 generic_dout(10) << __func__ << " " << objects << dendl;
4715 if (objects.empty())
4716 break;
4717 for (auto& p: objects) {
4718 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4719 int r = mapper.remove_oid(p.hobj, &_t);
4720 if (r != 0 && r != -ENOENT)
4721 ceph_abort();
4722 t.remove(tmp, p);
7c673cae 4723 }
11fdf7f2
TL
4724 int r = store->queue_transaction(ch, std::move(t));
4725 ceph_assert(r == 0);
4726 t = ObjectStore::Transaction();
7c673cae
FG
4727 }
4728 t.remove_collection(tmp);
11fdf7f2
TL
4729 int r = store->queue_transaction(ch, std::move(t));
4730 ceph_assert(r == 0);
7c673cae
FG
4731
4732 C_SaferCond waiter;
11fdf7f2 4733 if (!ch->flush_commit(&waiter)) {
7c673cae
FG
4734 waiter.wait();
4735 }
4736}
4737
4738
4739// ======================================================
4740// PG's
4741
7c673cae
FG
4742PG* OSD::_make_pg(
4743 OSDMapRef createmap,
4744 spg_t pgid)
4745{
11fdf7f2
TL
4746 dout(10) << __func__ << " " << pgid << dendl;
4747 pg_pool_t pi;
4748 map<string,string> ec_profile;
4749 string name;
4750 if (createmap->have_pg_pool(pgid.pool())) {
4751 pi = *createmap->get_pg_pool(pgid.pool());
4752 name = createmap->get_pool_name(pgid.pool());
4753 if (pi.is_erasure()) {
4754 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4755 }
4756 } else {
4757 // pool was deleted; grab final pg_pool_t off disk.
4758 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4759 bufferlist bl;
4760 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4761 if (r < 0) {
4762 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4763 << dendl;
4764 return nullptr;
4765 }
4766 ceph_assert(r >= 0);
4767 auto p = bl.cbegin();
4768 decode(pi, p);
4769 decode(name, p);
4770 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4771 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4772 << " tombstone" << dendl;
4773 return nullptr;
4774 }
4775 decode(ec_profile, p);
4776 }
f67539c2 4777 PGPool pool(createmap, pgid.pool(), pi, name);
7c673cae 4778 PG *pg;
11fdf7f2
TL
4779 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4780 pi.type == pg_pool_t::TYPE_ERASURE)
4781 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4782 else
4783 ceph_abort();
7c673cae
FG
4784 return pg;
4785}
4786
11fdf7f2 4787void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4788{
11fdf7f2
TL
4789 v->clear();
4790 v->reserve(get_num_pgs());
4791 for (auto& s : shards) {
4792 std::lock_guard l(s->shard_lock);
4793 for (auto& j : s->pg_slots) {
4794 if (j.second->pg &&
4795 !j.second->pg->is_deleted()) {
4796 v->push_back(j.second->pg);
4797 if (clear_too) {
4798 s->_detach_pg(j.second.get());
4799 }
4800 }
7c673cae 4801 }
7c673cae 4802 }
7c673cae
FG
4803}
4804
11fdf7f2 4805void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4806{
11fdf7f2
TL
4807 v->clear();
4808 v->reserve(get_num_pgs());
4809 for (auto& s : shards) {
4810 std::lock_guard l(s->shard_lock);
4811 for (auto& j : s->pg_slots) {
4812 if (j.second->pg &&
4813 !j.second->pg->is_deleted()) {
4814 v->push_back(j.first);
4815 }
7c673cae
FG
4816 }
4817 }
7c673cae
FG
4818}
4819
11fdf7f2 4820void OSD::register_pg(PGRef pg)
7c673cae 4821{
11fdf7f2
TL
4822 spg_t pgid = pg->get_pgid();
4823 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4824 auto sdata = shards[shard_index];
4825 std::lock_guard l(sdata->shard_lock);
4826 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4827 ceph_assert(r.second);
4828 auto *slot = r.first->second.get();
4829 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4830 sdata->_attach_pg(slot, pg.get());
4831}
7c673cae 4832
11fdf7f2
TL
4833bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4834{
4835 auto sdata = pg->osd_shard;
4836 ceph_assert(sdata);
4837 {
4838 std::lock_guard l(sdata->shard_lock);
4839 auto p = sdata->pg_slots.find(pg->pg_id);
4840 if (p == sdata->pg_slots.end() ||
4841 !p->second->pg) {
4842 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4843 return false;
4844 }
4845 if (p->second->waiting_for_merge_epoch) {
4846 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4847 return false;
4848 }
4849 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4850 sdata->_detach_pg(p->second.get());
4851 }
7c673cae 4852
11fdf7f2
TL
4853 for (auto shard : shards) {
4854 shard->unprime_split_children(pg->pg_id, old_pg_num);
4855 }
7c673cae 4856
11fdf7f2
TL
4857 // update pg count now since we might not get an osdmap any time soon.
4858 if (pg->is_primary())
4859 service.logger->dec(l_osd_pg_primary);
9f95a23c
TL
4860 else if (pg->is_nonprimary())
4861 service.logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
4862 else
4863 service.logger->dec(l_osd_pg_stray);
7c673cae 4864
11fdf7f2 4865 return true;
7c673cae
FG
4866}
4867
11fdf7f2 4868PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4869{
11fdf7f2
TL
4870 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4871 auto sdata = shards[shard_index];
4872 std::lock_guard l(sdata->shard_lock);
4873 auto p = sdata->pg_slots.find(pgid);
4874 if (p == sdata->pg_slots.end()) {
7c673cae 4875 return nullptr;
11fdf7f2
TL
4876 }
4877 return p->second->pg;
7c673cae
FG
4878}
4879
11fdf7f2 4880PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4881{
11fdf7f2
TL
4882 PGRef pg = _lookup_pg(pgid);
4883 if (!pg) {
4884 return nullptr;
4885 }
4886 pg->lock();
4887 if (!pg->is_deleted()) {
4888 return pg;
4889 }
4890 pg->unlock();
4891 return nullptr;
31f18b77
FG
4892}
4893
11fdf7f2 4894PGRef OSD::lookup_lock_pg(spg_t pgid)
7c673cae 4895{
11fdf7f2 4896 return _lookup_lock_pg(pgid);
7c673cae
FG
4897}
4898
4899void OSD::load_pgs()
4900{
9f95a23c 4901 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 4902 dout(0) << "load_pgs" << dendl;
11fdf7f2 4903
7c673cae 4904 {
11fdf7f2
TL
4905 auto pghist = make_pg_num_history_oid();
4906 bufferlist bl;
4907 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4908 if (r >= 0 && bl.length() > 0) {
4909 auto p = bl.cbegin();
4910 decode(pg_num_history, p);
4911 }
4912 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
7c673cae
FG
4913 }
4914
4915 vector<coll_t> ls;
4916 int r = store->list_collections(ls);
4917 if (r < 0) {
4918 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4919 }
4920
11fdf7f2 4921 int num = 0;
7c673cae
FG
4922 for (vector<coll_t>::iterator it = ls.begin();
4923 it != ls.end();
4924 ++it) {
4925 spg_t pgid;
4926 if (it->is_temp(&pgid) ||
20effc67 4927 (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
11fdf7f2
TL
4928 dout(10) << "load_pgs " << *it
4929 << " removing, legacy or flagged for removal pg" << dendl;
20effc67 4930 recursive_remove_collection(cct, store.get(), pgid, *it);
7c673cae
FG
4931 continue;
4932 }
4933
4934 if (!it->is_pg(&pgid)) {
4935 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4936 continue;
4937 }
4938
7c673cae 4939 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
7c673cae 4940 epoch_t map_epoch = 0;
20effc67 4941 int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
7c673cae
FG
4942 if (r < 0) {
4943 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4944 << dendl;
4945 continue;
4946 }
4947
11fdf7f2 4948 PGRef pg;
7c673cae
FG
4949 if (map_epoch > 0) {
4950 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4951 if (!pgosdmap) {
9f95a23c 4952 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
7c673cae
FG
4953 derr << __func__ << ": could not find map for epoch " << map_epoch
4954 << " on pg " << pgid << ", but the pool is not present in the "
4955 << "current map, so this is probably a result of bug 10617. "
4956 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4957 << "to clean it up later." << dendl;
4958 continue;
4959 } else {
4960 derr << __func__ << ": have pgid " << pgid << " at epoch "
4961 << map_epoch << ", but missing map. Crashing."
4962 << dendl;
11fdf7f2 4963 ceph_abort_msg("Missing map in load_pgs");
7c673cae
FG
4964 }
4965 }
11fdf7f2 4966 pg = _make_pg(pgosdmap, pgid);
7c673cae 4967 } else {
9f95a23c 4968 pg = _make_pg(get_osdmap(), pgid);
7c673cae 4969 }
11fdf7f2 4970 if (!pg) {
20effc67 4971 recursive_remove_collection(cct, store.get(), pgid, *it);
11fdf7f2
TL
4972 continue;
4973 }
4974
4975 // there can be no waiters here, so we don't call _wake_pg_slot
7c673cae 4976
11fdf7f2 4977 pg->lock();
7c673cae
FG
4978 pg->ch = store->open_collection(pg->coll);
4979
4980 // read pg state, log
20effc67 4981 pg->read_state(store.get());
7c673cae 4982
94b18763
FG
4983 if (pg->dne()) {
4984 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4985 pg->ch = nullptr;
94b18763 4986 pg->unlock();
20effc67 4987 recursive_remove_collection(cct, store.get(), pgid, *it);
94b18763
FG
4988 continue;
4989 }
11fdf7f2
TL
4990 {
4991 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4992 assert(NULL != shards[shard_index]);
4993 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4994 }
7c673cae 4995
11fdf7f2 4996 dout(10) << __func__ << " loaded " << *pg << dendl;
7c673cae 4997 pg->unlock();
7c673cae 4998
11fdf7f2
TL
4999 register_pg(pg);
5000 ++num;
7c673cae 5001 }
11fdf7f2 5002 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
7c673cae
FG
5003}
5004
5005
11fdf7f2
TL
5006PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5007 const PGCreateInfo *info)
5008{
5009 spg_t pgid = info->pgid;
7c673cae 5010
11fdf7f2
TL
5011 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5012 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5013 return nullptr;
5014 }
3efd9988 5015
11fdf7f2 5016 OSDMapRef startmap = get_map(info->epoch);
7c673cae 5017
11fdf7f2
TL
5018 if (info->by_mon) {
5019 int64_t pool_id = pgid.pgid.pool();
5020 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5021 if (!pool) {
5022 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5023 return nullptr;
5024 }
9f95a23c 5025 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
5026 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5027 // this ensures we do not process old creating messages after the
5028 // pool's initial pgs have been created (and pgs are subsequently
5029 // allowed to split or merge).
5030 dout(20) << __func__ << " dropping " << pgid
5031 << "create, pool does not have CREATING flag set" << dendl;
5032 return nullptr;
7c673cae
FG
5033 }
5034 }
7c673cae 5035
11fdf7f2
TL
5036 int up_primary, acting_primary;
5037 vector<int> up, acting;
5038 startmap->pg_to_up_acting_osds(
5039 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
7c673cae 5040
11fdf7f2
TL
5041 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5042 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5043 store->get_type() != "bluestore") {
5044 clog->warn() << "pg " << pgid
5045 << " is at risk of silent data corruption: "
5046 << "the pool allows ec overwrites but is not stored in "
5047 << "bluestore, so deep scrubbing will not detect bitrot";
7c673cae 5048 }
20effc67 5049 PeeringCtx rctx;
9f95a23c
TL
5050 create_pg_collection(
5051 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5052 init_pg_ondisk(rctx.transaction, pgid, pp);
7c673cae 5053
9f95a23c 5054 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae 5055
11fdf7f2
TL
5056 PGRef pg = _make_pg(startmap, pgid);
5057 pg->ch = store->create_new_collection(pg->coll);
7c673cae 5058
11fdf7f2
TL
5059 {
5060 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5061 assert(NULL != shards[shard_index]);
5062 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
7c673cae 5063 }
7c673cae 5064
11fdf7f2 5065 pg->lock(true);
7c673cae 5066
11fdf7f2
TL
5067 // we are holding the shard lock
5068 ceph_assert(!pg->is_deleted());
5069
5070 pg->init(
5071 role,
5072 up,
5073 up_primary,
5074 acting,
5075 acting_primary,
5076 info->history,
5077 info->past_intervals,
11fdf7f2 5078 rctx.transaction);
7c673cae 5079
92f5a8d4
TL
5080 pg->init_collection_pool_opts();
5081
11fdf7f2 5082 if (pg->is_primary()) {
9f95a23c 5083 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
5084 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5085 }
7c673cae 5086
9f95a23c
TL
5087 pg->handle_initialize(rctx);
5088 pg->handle_activate_map(rctx);
7c673cae 5089
11fdf7f2 5090 dispatch_context(rctx, pg.get(), osdmap, nullptr);
7c673cae 5091
11fdf7f2
TL
5092 dout(10) << __func__ << " new pg " << *pg << dendl;
5093 return pg;
7c673cae
FG
5094}
5095
11fdf7f2
TL
5096bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
5097 spg_t pgid,
5098 bool is_mon_create)
3efd9988
FG
5099{
5100 const auto max_pgs_per_osd =
11fdf7f2
TL
5101 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5102 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
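// hard cap on PGs this OSD will instantiate: the monitor's per-OSD soft
// limit (mon_max_pg_per_osd) scaled by osd_max_pg_per_osd_hard_ratio. At or
// above the cap, creates are deferred via pending_creates_* and retried
// later by resume_creating_pg().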
3efd9988 5103
11fdf7f2 5104 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
5105 return false;
5106 }
11fdf7f2
TL
5107
5108 std::lock_guard l(pending_creates_lock);
3efd9988
FG
5109 if (is_mon_create) {
5110 pending_creates_from_mon++;
5111 } else {
9f95a23c
TL
5112 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
5113 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 5114 }
1adf2230 5115 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 5116 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
5117 return true;
5118}
5119
5120// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
5121// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
5122// to up set if pg_temp is empty. so an empty pg_temp won't work.
5123static vector<int32_t> twiddle(const vector<int>& acting) {
5124 if (acting.size() > 1) {
5125 return {acting[0]};
5126 } else {
5127 vector<int32_t> twiddled(acting.begin(), acting.end());
5128 twiddled.push_back(-1);
5129 return twiddled;
5130 }
5131}
5132
5133void OSD::resume_creating_pg()
5134{
5135 bool do_sub_pg_creates = false;
b32b8144 5136 bool have_pending_creates = false;
3efd9988
FG
5137 {
5138 const auto max_pgs_per_osd =
11fdf7f2
TL
5139 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5140 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5141 if (max_pgs_per_osd <= num_pgs) {
3efd9988
FG
5142 // this could happen if admin decreases this setting before a PG is removed
5143 return;
5144 }
11fdf7f2
TL
5145 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
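// spare_pgs is the remaining headroom under the hard cap; mon-requested
// creates consume it first, then one deferred OSD-side create is re-triggered
// per spare slot by nudging pg_temp (see twiddle() above).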
5146 std::lock_guard l(pending_creates_lock);
3efd9988 5147 if (pending_creates_from_mon > 0) {
11fdf7f2
TL
5148 dout(20) << __func__ << " pending_creates_from_mon "
5149 << pending_creates_from_mon << dendl;
3efd9988
FG
5150 do_sub_pg_creates = true;
5151 if (pending_creates_from_mon >= spare_pgs) {
5152 spare_pgs = pending_creates_from_mon = 0;
5153 } else {
5154 spare_pgs -= pending_creates_from_mon;
5155 pending_creates_from_mon = 0;
5156 }
5157 }
5158 auto pg = pending_creates_from_osd.cbegin();
5159 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 5160 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 5161 vector<int> acting;
9f95a23c
TL
5162 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5163 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
3efd9988 5164 pg = pending_creates_from_osd.erase(pg);
94b18763 5165 do_sub_pg_creates = true;
3efd9988
FG
5166 spare_pgs--;
5167 }
b32b8144
FG
5168 have_pending_creates = (pending_creates_from_mon > 0 ||
5169 !pending_creates_from_osd.empty());
3efd9988 5170 }
b32b8144
FG
5171
5172 bool do_renew_subs = false;
3efd9988
FG
5173 if (do_sub_pg_creates) {
5174 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5175 dout(4) << __func__ << ": resolicit pg creates from mon since "
5176 << last_pg_create_epoch << dendl;
b32b8144 5177 do_renew_subs = true;
3efd9988
FG
5178 }
5179 }
9f95a23c 5180 version_t start = get_osdmap_epoch() + 1;
b32b8144
FG
5181 if (have_pending_creates) {
5182 // don't miss any new osdmap deleting PGs
5183 if (monc->sub_want("osdmap", start, 0)) {
5184 dout(4) << __func__ << ": resolicit osdmap from mon since "
5185 << start << dendl;
5186 do_renew_subs = true;
5187 }
94b18763 5188 } else if (do_sub_pg_creates) {
b32b8144
FG
5189 // no need to subscribe the osdmap continuously anymore
5190 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5191 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
11fdf7f2 5192 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
b32b8144
FG
5193 << start << dendl;
5194 do_renew_subs = true;
5195 }
5196 }
5197
5198 if (do_renew_subs) {
5199 monc->renew_subs();
5200 }
5201
94b18763 5202 service.send_pg_temp();
3efd9988 5203}
7c673cae
FG
5204
5205void OSD::build_initial_pg_history(
5206 spg_t pgid,
5207 epoch_t created,
5208 utime_t created_stamp,
5209 pg_history_t *h,
5210 PastIntervals *pi)
5211{
5212 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
9f95a23c 5213 *h = pg_history_t(created, created_stamp);
7c673cae
FG
5214
5215 OSDMapRef lastmap = service.get_map(created);
5216 int up_primary, acting_primary;
5217 vector<int> up, acting;
5218 lastmap->pg_to_up_acting_osds(
5219 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5220
5221 ostringstream debug;
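// walk every map epoch from just after creation up to the current epoch,
// recomputing up/acting at each step; check_new_interval() records completed
// intervals in *pi, and the history stamps (same_interval_since, etc.) are
// advanced whenever the mapping changes.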
9f95a23c 5222 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
7c673cae
FG
5223 OSDMapRef osdmap = service.get_map(e);
5224 int new_up_primary, new_acting_primary;
5225 vector<int> new_up, new_acting;
5226 osdmap->pg_to_up_acting_osds(
5227 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5228
5229 // this is a bit imprecise, but sufficient?
5230 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5231 const pg_pool_t *pi;
5232 bool operator()(const set<pg_shard_t> &have) const {
5233 return have.size() >= pi->min_size;
5234 }
11fdf7f2 5235 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
7c673cae
FG
5236 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5237
5238 bool new_interval = PastIntervals::check_new_interval(
5239 acting_primary,
5240 new_acting_primary,
5241 acting, new_acting,
5242 up_primary,
5243 new_up_primary,
5244 up, new_up,
5245 h->same_interval_since,
5246 h->last_epoch_clean,
9f95a23c
TL
5247 osdmap.get(),
5248 lastmap.get(),
7c673cae 5249 pgid.pgid,
9f95a23c 5250 min_size_predicate,
7c673cae
FG
5251 pi,
5252 &debug);
5253 if (new_interval) {
5254 h->same_interval_since = e;
181888fb
FG
5255 if (up != new_up) {
5256 h->same_up_since = e;
5257 }
5258 if (acting_primary != new_acting_primary) {
5259 h->same_primary_since = e;
5260 }
5261 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5262 osdmap->get_pg_num(pgid.pgid.pool()),
5263 nullptr)) {
5264 h->last_epoch_split = e;
5265 }
5266 up = new_up;
5267 acting = new_acting;
5268 up_primary = new_up_primary;
5269 acting_primary = new_acting_primary;
c07f9fc5 5270 }
7c673cae
FG
5271 lastmap = osdmap;
5272 }
5273 dout(20) << __func__ << " " << debug.str() << dendl;
5274 dout(10) << __func__ << " " << *h << " " << *pi
5275 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5276 pi->get_bounds()) << ")"
5277 << dendl;
5278}
5279
7c673cae
FG
5280void OSD::_add_heartbeat_peer(int p)
5281{
5282 if (p == whoami)
5283 return;
5284 HeartbeatInfo *hi;
5285
5286 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5287 if (i == heartbeat_peers.end()) {
9f95a23c 5288 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
7c673cae
FG
5289 if (!cons.first)
5290 return;
9f95a23c
TL
5291 assert(cons.second);
5292
7c673cae
FG
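// new heartbeat peer: open back and front connections and attach a Session
// to each; both sessions share the same stamps object, so ping timing state
// is common to the two channels.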
5293 hi = &heartbeat_peers[p];
5294 hi->peer = p;
9f95a23c
TL
5295
5296 auto stamps = service.get_hb_stamps(p);
5297
5298 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5299 sb->peer = p;
5300 sb->stamps = stamps;
eafe8130 5301 hi->hb_interval_start = ceph_clock_now();
7c673cae 5302 hi->con_back = cons.first.get();
9f95a23c
TL
5303 hi->con_back->set_priv(sb);
5304
5305 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5306 sf->peer = p;
5307 sf->stamps = stamps;
5308 hi->con_front = cons.second.get();
5309 hi->con_front->set_priv(sf);
5310
5311 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5312 << " " << hi->con_back->get_peer_addr()
5313 << " " << hi->con_front->get_peer_addr()
5314 << dendl;
7c673cae
FG
5315 } else {
5316 hi = &i->second;
5317 }
9f95a23c 5318 hi->epoch = get_osdmap_epoch();
7c673cae
FG
5319}
5320
5321void OSD::_remove_heartbeat_peer(int n)
5322{
5323 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5324 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5325 dout(20) << " removing heartbeat peer osd." << n
5326 << " " << q->second.con_back->get_peer_addr()
5327 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5328 << dendl;
9f95a23c 5329 q->second.clear_mark_down();
7c673cae
FG
5330 heartbeat_peers.erase(q);
5331}
5332
5333void OSD::need_heartbeat_peer_update()
5334{
5335 if (is_stopping())
5336 return;
5337 dout(20) << "need_heartbeat_peer_update" << dendl;
5338 heartbeat_set_peers_need_update();
5339}
5340
5341void OSD::maybe_update_heartbeat_peers()
5342{
9f95a23c 5343 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5344
11fdf7f2 5345 if (is_waiting_for_healthy() || is_active()) {
7c673cae
FG
5346 utime_t now = ceph_clock_now();
5347 if (last_heartbeat_resample == utime_t()) {
5348 last_heartbeat_resample = now;
5349 heartbeat_set_peers_need_update();
5350 } else if (!heartbeat_peers_need_update()) {
5351 utime_t dur = now - last_heartbeat_resample;
5352 if (dur > cct->_conf->osd_heartbeat_grace) {
5353 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5354 heartbeat_set_peers_need_update();
5355 last_heartbeat_resample = now;
494da23a
TL
5356 // automatically clean up any stale heartbeat peers
5357 // if we are unhealthy, then clean all
5358 reset_heartbeat_peers(is_waiting_for_healthy());
7c673cae
FG
5359 }
5360 }
5361 }
5362
5363 if (!heartbeat_peers_need_update())
5364 return;
5365 heartbeat_clear_peers_need_update();
5366
11fdf7f2 5367 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5368
5369 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5370
5371
5372 // build heartbeat from set
5373 if (is_active()) {
11fdf7f2
TL
5374 vector<PGRef> pgs;
5375 _get_pgs(&pgs);
5376 for (auto& pg : pgs) {
5377 pg->with_heartbeat_peers([&](int peer) {
9f95a23c 5378 if (get_osdmap()->is_up(peer)) {
11fdf7f2
TL
5379 _add_heartbeat_peer(peer);
5380 }
5381 });
7c673cae
FG
5382 }
5383 }
5384
5385 // include next and previous up osds to ensure we have a fully-connected set
5386 set<int> want, extras;
9f95a23c 5387 const int next = get_osdmap()->get_next_up_osd_after(whoami);
7c673cae
FG
5388 if (next >= 0)
5389 want.insert(next);
9f95a23c 5390 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
7c673cae
FG
5391 if (prev >= 0 && prev != next)
5392 want.insert(prev);
5393
11fdf7f2
TL
5394 // make sure we have at least **min_down** osds coming from different
5395 // subtrees at the configured level (e.g., hosts) for fast failure detection.
5396 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5397 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
9f95a23c
TL
5398 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5399 get_osdmap()->get_random_up_osds_by_subtree(
5400 whoami, subtree, limit, want, &want);
11fdf7f2 5401
7c673cae
FG
5402 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5403 dout(10) << " adding neighbor peer osd." << *p << dendl;
5404 extras.insert(*p);
5405 _add_heartbeat_peer(*p);
5406 }
5407
5408 // remove down peers; enumerate extras
5409 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5410 while (p != heartbeat_peers.end()) {
9f95a23c 5411 if (!get_osdmap()->is_up(p->first)) {
7c673cae
FG
5412 int o = p->first;
5413 ++p;
5414 _remove_heartbeat_peer(o);
5415 continue;
5416 }
9f95a23c 5417 if (p->second.epoch < get_osdmap_epoch()) {
7c673cae
FG
5418 extras.insert(p->first);
5419 }
5420 ++p;
5421 }
5422
5423 // too few?
11fdf7f2 5424 for (int n = next; n >= 0; ) {
7c673cae
FG
5425 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5426 break;
5427 if (!extras.count(n) && !want.count(n) && n != whoami) {
5428 dout(10) << " adding random peer osd." << n << dendl;
5429 extras.insert(n);
5430 _add_heartbeat_peer(n);
5431 }
9f95a23c 5432 n = get_osdmap()->get_next_up_osd_after(n);
11fdf7f2 5433 if (n == next)
7c673cae
FG
5434 break; // came full circle; stop
5435 }
5436
5437 // too many?
5438 for (set<int>::iterator p = extras.begin();
5439 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5440 ++p) {
5441 if (want.count(*p))
5442 continue;
5443 _remove_heartbeat_peer(*p);
5444 }
5445
5446 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
9f95a23c
TL
5447
5448 // clean up stale failure pending
5449 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5450 if (heartbeat_peers.count(it->first) == 0) {
5451 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5452 failure_pending.erase(it++);
5453 } else {
5454 it++;
5455 }
5456 }
7c673cae
FG
5457}
5458
494da23a 5459void OSD::reset_heartbeat_peers(bool all)
7c673cae 5460{
9f95a23c 5461 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5462 dout(10) << "reset_heartbeat_peers" << dendl;
494da23a
TL
5463 utime_t stale = ceph_clock_now();
5464 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
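// cutoff: peers whose last traffic predates (now - osd_heartbeat_stale) are
// dropped (all of them when all=true), along with any failure reports queued
// or pending for them.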
11fdf7f2 5465 std::lock_guard l(heartbeat_lock);
494da23a 5466 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
b3b6e05e 5467 auto& [peer, hi] = *it;
494da23a 5468 if (all || hi.is_stale(stale)) {
9f95a23c 5469 hi.clear_mark_down();
494da23a 5470 // stop sending failure_report to mon too
b3b6e05e
TL
5471 failure_queue.erase(peer);
5472 failure_pending.erase(peer);
5473 it = heartbeat_peers.erase(it);
494da23a 5474 } else {
b3b6e05e 5475 ++it;
7c673cae 5476 }
7c673cae 5477 }
7c673cae
FG
5478}
5479
5480void OSD::handle_osd_ping(MOSDPing *m)
5481{
5482 if (superblock.cluster_fsid != m->fsid) {
5483 dout(20) << "handle_osd_ping from " << m->get_source_inst()
9f95a23c
TL
5484 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5485 << dendl;
7c673cae
FG
5486 m->put();
5487 return;
5488 }
5489
5490 int from = m->get_source().num();
5491
9f95a23c 5492 heartbeat_lock.lock();
7c673cae 5493 if (is_stopping()) {
9f95a23c 5494 heartbeat_lock.unlock();
7c673cae
FG
5495 m->put();
5496 return;
5497 }
5498
9f95a23c
TL
5499 utime_t now = ceph_clock_now();
5500 auto mnow = service.get_mnow();
5501 ConnectionRef con(m->get_connection());
7c673cae 5502 OSDMapRef curmap = service.get_osdmap();
c07f9fc5 5503 if (!curmap) {
9f95a23c 5504 heartbeat_lock.unlock();
c07f9fc5
FG
5505 m->put();
5506 return;
5507 }
7c673cae 5508
9f95a23c
TL
5509 auto sref = con->get_priv();
5510 Session *s = static_cast<Session*>(sref.get());
5511 if (!s) {
5512 heartbeat_lock.unlock();
5513 m->put();
5514 return;
5515 }
5516 if (!s->stamps) {
5517 s->peer = from;
5518 s->stamps = service.get_hb_stamps(from);
5519 }
5520
7c673cae
FG
5521 switch (m->op) {
5522
5523 case MOSDPing::PING:
5524 {
5525 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5526 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5527 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5528 if (heartbeat_drop->second == 0) {
5529 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5530 } else {
5531 --heartbeat_drop->second;
5532 dout(5) << "Dropping heartbeat from " << from
5533 << ", " << heartbeat_drop->second
5534 << " remaining to drop" << dendl;
5535 break;
5536 }
5537 } else if (cct->_conf->osd_debug_drop_ping_probability >
5538 ((((double)(rand()%100))/100.0))) {
5539 heartbeat_drop =
5540 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5541 cct->_conf->osd_debug_drop_ping_duration)).first;
5542 dout(5) << "Dropping heartbeat from " << from
5543 << ", " << heartbeat_drop->second
5544 << " remaining to drop" << dendl;
5545 break;
5546 }
5547 }
5548
9f95a23c
TL
5549 ceph::signedspan sender_delta_ub{};
5550 s->stamps->got_ping(
5551 m->up_from,
5552 mnow,
5553 m->mono_send_stamp,
5554 m->delta_ub,
5555 &sender_delta_ub);
5556 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5557
7c673cae 5558 if (!cct->get_heartbeat_map()->is_healthy()) {
9f95a23c
TL
5559 dout(10) << "internal heartbeat not healthy, dropping ping request"
5560 << dendl;
7c673cae
FG
5561 break;
5562 }
5563
5564 Message *r = new MOSDPing(monc->get_fsid(),
5565 curmap->get_epoch(),
9f95a23c
TL
5566 MOSDPing::PING_REPLY,
5567 m->ping_stamp,
5568 m->mono_ping_stamp,
5569 mnow,
5570 service.get_up_epoch(),
5571 cct->_conf->osd_heartbeat_min_size,
5572 sender_delta_ub);
5573 con->send_message(r);
7c673cae
FG
5574
5575 if (curmap->is_up(from)) {
7c673cae 5576 if (is_active()) {
9f95a23c
TL
5577 ConnectionRef cluster_con = service.get_con_osd_cluster(
5578 from, curmap->get_epoch());
5579 if (cluster_con) {
5580 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5581 }
5582 }
5583 } else if (!curmap->exists(from) ||
5584 curmap->get_down_at(from) > m->map_epoch) {
5585 // tell them they have died
5586 Message *r = new MOSDPing(monc->get_fsid(),
5587 curmap->get_epoch(),
5588 MOSDPing::YOU_DIED,
9f95a23c
TL
5589 m->ping_stamp,
5590 m->mono_ping_stamp,
5591 mnow,
5592 service.get_up_epoch(),
31f18b77 5593 cct->_conf->osd_heartbeat_min_size);
9f95a23c 5594 con->send_message(r);
7c673cae
FG
5595 }
5596 }
5597 break;
5598
5599 case MOSDPing::PING_REPLY:
5600 {
5601 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5602 if (i != heartbeat_peers.end()) {
9f95a23c 5603 auto acked = i->second.ping_history.find(m->ping_stamp);
11fdf7f2 5604 if (acked != i->second.ping_history.end()) {
11fdf7f2 5605 int &unacknowledged = acked->second.second;
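// each ping_history entry starts with HEARTBEAT_MAX_CONN outstanding acks; a
// reply on the back (or front) connection decrements it, and when there is no
// front connection the back reply covers both. At zero, this entry and any
// older ones are erased.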
9f95a23c 5606 if (con == i->second.con_back) {
11fdf7f2
TL
5607 dout(25) << "handle_osd_ping got reply from osd." << from
5608 << " first_tx " << i->second.first_tx
5609 << " last_tx " << i->second.last_tx
9f95a23c
TL
5610 << " last_rx_back " << i->second.last_rx_back
5611 << " -> " << now
11fdf7f2
TL
5612 << " last_rx_front " << i->second.last_rx_front
5613 << dendl;
5614 i->second.last_rx_back = now;
5615 ceph_assert(unacknowledged > 0);
5616 --unacknowledged;
5617 // if there is no front con, set both stamps.
5618 if (i->second.con_front == NULL) {
5619 i->second.last_rx_front = now;
5620 ceph_assert(unacknowledged > 0);
5621 --unacknowledged;
5622 }
9f95a23c 5623 } else if (con == i->second.con_front) {
11fdf7f2
TL
5624 dout(25) << "handle_osd_ping got reply from osd." << from
5625 << " first_tx " << i->second.first_tx
5626 << " last_tx " << i->second.last_tx
5627 << " last_rx_back " << i->second.last_rx_back
9f95a23c
TL
5628 << " last_rx_front " << i->second.last_rx_front
5629 << " -> " << now
11fdf7f2
TL
5630 << dendl;
5631 i->second.last_rx_front = now;
5632 ceph_assert(unacknowledged > 0);
5633 --unacknowledged;
5634 }
7c673cae 5635
11fdf7f2
TL
5636 if (unacknowledged == 0) {
5637 // succeeded in getting all replies
5638 dout(25) << "handle_osd_ping got all replies from osd." << from
9f95a23c 5639 << " , erase pending ping(sent at " << m->ping_stamp << ")"
11fdf7f2
TL
5640 << " and older pending ping(s)"
5641 << dendl;
eafe8130
TL
5642
5643#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
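// accumulate per-reply round-trip times (converted to microseconds) into
// running totals and min/max for this peer; these feed the per-interval
// averages published below.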
5644 ++i->second.hb_average_count;
9f95a23c 5645 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
eafe8130
TL
5646 i->second.hb_total_back += back_pingtime;
5647 if (back_pingtime < i->second.hb_min_back)
5648 i->second.hb_min_back = back_pingtime;
5649 if (back_pingtime > i->second.hb_max_back)
5650 i->second.hb_max_back = back_pingtime;
9f95a23c 5651 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
eafe8130
TL
5652 i->second.hb_total_front += front_pingtime;
5653 if (front_pingtime < i->second.hb_min_front)
5654 i->second.hb_min_front = front_pingtime;
5655 if (front_pingtime > i->second.hb_max_front)
5656 i->second.hb_max_front = front_pingtime;
5657
5658 ceph_assert(i->second.hb_interval_start != utime_t());
5659 if (i->second.hb_interval_start == utime_t())
5660 i->second.hb_interval_start = now;
5661 int64_t hb_avg_time_period = 60;
5662 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5663 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5664 }
5665 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5666 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5667 uint32_t back_min = i->second.hb_min_back;
5668 uint32_t back_max = i->second.hb_max_back;
5669 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5670 uint32_t front_min = i->second.hb_min_front;
5671 uint32_t front_max = i->second.hb_max_front;
5672
5673 // Reset for new interval
5674 i->second.hb_average_count = 0;
5675 i->second.hb_interval_start = now;
5676 i->second.hb_total_back = i->second.hb_max_back = 0;
5677 i->second.hb_min_back = UINT_MAX;
5678 i->second.hb_total_front = i->second.hb_max_front = 0;
5679 i->second.hb_min_front = UINT_MAX;
5680
5681 // Record per-osd interface ping times.
5682 // Based on osd_heartbeat_interval, ignoring that the actual interval is randomly shorter.
5683 if (i->second.hb_back_pingtime.size() == 0) {
5684 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5685 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5686 i->second.hb_back_pingtime.push_back(back_avg);
5687 i->second.hb_back_min.push_back(back_min);
5688 i->second.hb_back_max.push_back(back_max);
5689 i->second.hb_front_pingtime.push_back(front_avg);
5690 i->second.hb_front_min.push_back(front_min);
5691 i->second.hb_front_max.push_back(front_max);
5692 ++i->second.hb_index;
5693 }
5694 } else {
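// overwrite the oldest slot; hb_index & (hb_vector_size - 1) is a cheap
// modulo (this mask form assumes hb_vector_size is a power of two).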
5695 int index = i->second.hb_index & (hb_vector_size - 1);
5696 i->second.hb_back_pingtime[index] = back_avg;
5697 i->second.hb_back_min[index] = back_min;
5698 i->second.hb_back_max[index] = back_max;
5699 i->second.hb_front_pingtime[index] = front_avg;
5700 i->second.hb_front_min[index] = front_min;
5701 i->second.hb_front_max[index] = front_max;
5702 ++i->second.hb_index;
5703 }
5704
5705 {
5706 std::lock_guard l(service.stat_lock);
5707 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5708 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5709
5710 uint32_t total = 0;
5711 uint32_t min = UINT_MAX;
5712 uint32_t max = 0;
5713 uint32_t count = 0;
5714 uint32_t which = 0;
5715 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
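// walk the saved intervals from newest to oldest, publishing rolling
// averages and min/max over the most recent 1, 5 and 15 intervals
// (loadavg-style windows) into osd_stat for the back network.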
5716 for (int32_t k = size - 1 ; k >= 0; --k) {
5717 ++count;
5718 int index = (i->second.hb_index + k) % size;
5719 total += i->second.hb_back_pingtime[index];
5720 if (i->second.hb_back_min[index] < min)
5721 min = i->second.hb_back_min[index];
5722 if (i->second.hb_back_max[index] > max)
5723 max = i->second.hb_back_max[index];
5724 if (count == 1 || count == 5 || count == 15) {
5725 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5726 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5727 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5728 which++;
5729 if (count == 15)
5730 break;
5731 }
5732 }
5733
5734 if (i->second.con_front != NULL) {
5735 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5736
5737 total = 0;
5738 min = UINT_MAX;
5739 max = 0;
5740 count = 0;
5741 which = 0;
5742 for (int32_t k = size - 1 ; k >= 0; --k) {
5743 ++count;
5744 int index = (i->second.hb_index + k) % size;
5745 total += i->second.hb_front_pingtime[index];
5746 if (i->second.hb_front_min[index] < min)
5747 min = i->second.hb_front_min[index];
5748 if (i->second.hb_front_max[index] > max)
5749 max = i->second.hb_front_max[index];
5750 if (count == 1 || count == 5 || count == 15) {
5751 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5752 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5753 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5754 which++;
5755 if (count == 15)
5756 break;
5757 }
5758 }
5759 }
5760 }
5761 } else {
5762 std::lock_guard l(service.stat_lock);
5763 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5764 if (i->second.con_front != NULL)
5765 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5766 }
11fdf7f2 5767 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5768 }
5769
11fdf7f2
TL
5770 if (i->second.is_healthy(now)) {
5771 // Cancel false reports
5772 auto failure_queue_entry = failure_queue.find(from);
5773 if (failure_queue_entry != failure_queue.end()) {
5774 dout(10) << "handle_osd_ping canceling queued "
5775 << "failure report for osd." << from << dendl;
5776 failure_queue.erase(failure_queue_entry);
5777 }
5778
5779 auto failure_pending_entry = failure_pending.find(from);
5780 if (failure_pending_entry != failure_pending.end()) {
5781 dout(10) << "handle_osd_ping canceling in-flight "
5782 << "failure report for osd." << from << dendl;
5783 send_still_alive(curmap->get_epoch(),
5784 from,
5785 failure_pending_entry->second.second);
5786 failure_pending.erase(failure_pending_entry);
5787 }
7c673cae 5788 }
11fdf7f2
TL
5789 } else {
5790 // old replies, deprecated by newly sent pings.
9f95a23c 5791 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
11fdf7f2
TL
5792 << ") is found, treat as covered by newly sent pings "
5793 << "and ignore"
5794 << dendl;
7c673cae
FG
5795 }
5796 }
5797
5798 if (m->map_epoch &&
5799 curmap->is_up(from)) {
7c673cae 5800 if (is_active()) {
9f95a23c
TL
5801 ConnectionRef cluster_con = service.get_con_osd_cluster(
5802 from, curmap->get_epoch());
5803 if (cluster_con) {
5804 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5805 }
5806 }
5807 }
9f95a23c
TL
5808
5809 s->stamps->got_ping_reply(
5810 mnow,
5811 m->mono_send_stamp,
5812 m->delta_ub);
5813 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
7c673cae
FG
5814 }
5815 break;
5816
5817 case MOSDPing::YOU_DIED:
5818 dout(10) << "handle_osd_ping " << m->get_source_inst()
5819 << " says i am down in " << m->map_epoch << dendl;
5820 osdmap_subscribe(curmap->get_epoch()+1, false);
5821 break;
5822 }
5823
9f95a23c 5824 heartbeat_lock.unlock();
7c673cae
FG
5825 m->put();
5826}
5827
5828void OSD::heartbeat_entry()
5829{
9f95a23c 5830 std::unique_lock l(heartbeat_lock);
7c673cae
FG
5831 if (is_stopping())
5832 return;
5833 while (!heartbeat_stop) {
5834 heartbeat();
5835
eafe8130
TL
5836 double wait;
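// unless randomization is disabled for testing, sleep 0.5s plus a random
// fraction (up to ~0.9x) of osd_heartbeat_interval so peers do not ping in
// lock-step.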
5837 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5838 wait = (float)cct->_conf->osd_heartbeat_interval;
5839 } else {
5840 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5841 }
9f95a23c 5842 auto w = ceph::make_timespan(wait);
7c673cae 5843 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
9f95a23c 5844 heartbeat_cond.wait_for(l, w);
7c673cae
FG
5845 if (is_stopping())
5846 return;
5847 dout(30) << "heartbeat_entry woke up" << dendl;
5848 }
5849}
5850
5851void OSD::heartbeat_check()
5852{
9f95a23c 5853 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
7c673cae
FG
5854 utime_t now = ceph_clock_now();
5855
11fdf7f2 5856 // check for incoming heartbeats (move me elsewhere?)
7c673cae
FG
5857 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5858 p != heartbeat_peers.end();
5859 ++p) {
5860
5861 if (p->second.first_tx == utime_t()) {
5862 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5863 << " yet, skipping" << dendl;
7c673cae
FG
5864 continue;
5865 }
5866
5867 dout(25) << "heartbeat_check osd." << p->first
5868 << " first_tx " << p->second.first_tx
5869 << " last_tx " << p->second.last_tx
5870 << " last_rx_back " << p->second.last_rx_back
5871 << " last_rx_front " << p->second.last_rx_front
5872 << dendl;
11fdf7f2
TL
5873 if (p->second.is_unhealthy(now)) {
5874 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5875 if (p->second.last_rx_back == utime_t() ||
5876 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5877 derr << "heartbeat_check: no reply from "
5878 << p->second.con_front->get_peer_addr().get_sockaddr()
5879 << " osd." << p->first
5880 << " ever on either front or back, first ping sent "
5881 << p->second.first_tx
5882 << " (oldest deadline " << oldest_deadline << ")"
5883 << dendl;
7c673cae 5884 // fail
11fdf7f2 5885 failure_queue[p->first] = p->second.first_tx;
7c673cae 5886 } else {
11fdf7f2
TL
5887 derr << "heartbeat_check: no reply from "
5888 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5889 << " osd." << p->first << " since back " << p->second.last_rx_back
5890 << " front " << p->second.last_rx_front
11fdf7f2
TL
5891 << " (oldest deadline " << oldest_deadline << ")"
5892 << dendl;
7c673cae 5893 // fail
11fdf7f2 5894 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5895 }
5896 }
5897 }
5898}
5899
5900void OSD::heartbeat()
5901{
9f95a23c 5902 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
7c673cae
FG
5903 dout(30) << "heartbeat" << dendl;
5904
20effc67
TL
5905 auto load_for_logger = service.get_scrub_services().update_load_average();
5906 if (load_for_logger) {
5907 logger->set(l_osd_loadavg, load_for_logger.value());
7c673cae 5908 }
7c673cae
FG
5909 dout(30) << "heartbeat checking stats" << dendl;
5910
11fdf7f2 5911 // refresh peer list and osd stats
7c673cae
FG
5912 vector<int> hb_peers;
5913 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5914 p != heartbeat_peers.end();
5915 ++p)
5916 hb_peers.push_back(p->first);
7c673cae 5917
11fdf7f2
TL
5918 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5919 dout(5) << __func__ << " " << new_stat << dendl;
5920 ceph_assert(new_stat.statfs.total);
5921
5922 float pratio;
5923 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5924
5925 service.check_full_status(ratio, pratio);
7c673cae
FG
5926
5927 utime_t now = ceph_clock_now();
9f95a23c 5928 auto mnow = service.get_mnow();
11fdf7f2
TL
5929 utime_t deadline = now;
5930 deadline += cct->_conf->osd_heartbeat_grace;
7c673cae
FG
5931
5932 // send heartbeats
5933 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5934 i != heartbeat_peers.end();
5935 ++i) {
5936 int peer = i->first;
f67539c2
TL
5937 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5938 if (!s) {
5939 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5940 continue;
5941 }
9f95a23c
TL
5942 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5943
7c673cae
FG
5944 i->second.last_tx = now;
5945 if (i->second.first_tx == utime_t())
5946 i->second.first_tx = now;
11fdf7f2
TL
5947 i->second.ping_history[now] = make_pair(deadline,
5948 HeartbeatInfo::HEARTBEAT_MAX_CONN);
eafe8130
TL
5949 if (i->second.hb_interval_start == utime_t())
5950 i->second.hb_interval_start = now;
9f95a23c 5951
9f95a23c
TL
5952 std::optional<ceph::signedspan> delta_ub;
5953 s->stamps->sent_ping(&delta_ub);
5954
5955 i->second.con_back->send_message(
5956 new MOSDPing(monc->get_fsid(),
5957 service.get_osdmap_epoch(),
5958 MOSDPing::PING,
5959 now,
5960 mnow,
5961 mnow,
5962 service.get_up_epoch(),
5963 cct->_conf->osd_heartbeat_min_size,
5964 delta_ub));
7c673cae
FG
5965
5966 if (i->second.con_front)
9f95a23c
TL
5967 i->second.con_front->send_message(
5968 new MOSDPing(monc->get_fsid(),
5969 service.get_osdmap_epoch(),
5970 MOSDPing::PING,
5971 now,
5972 mnow,
5973 mnow,
5974 service.get_up_epoch(),
5975 cct->_conf->osd_heartbeat_min_size,
5976 delta_ub));
7c673cae
FG
5977 }
5978
5979 logger->set(l_osd_hb_to, heartbeat_peers.size());
5980
5981 // hmm.. am i all alone?
5982 dout(30) << "heartbeat lonely?" << dendl;
5983 if (heartbeat_peers.empty()) {
5984 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5985 last_mon_heartbeat = now;
5986 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
9f95a23c 5987 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
5988 }
5989 }
5990
5991 dout(30) << "heartbeat done" << dendl;
5992}
5993
5994bool OSD::heartbeat_reset(Connection *con)
5995{
11fdf7f2
TL
5996 std::lock_guard l(heartbeat_lock);
5997 auto s = con->get_priv();
9f95a23c 5998 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
11fdf7f2 5999 con->set_priv(nullptr);
7c673cae 6000 if (s) {
7c673cae 6001 if (is_stopping()) {
7c673cae
FG
6002 return true;
6003 }
9f95a23c
TL
6004 auto session = static_cast<Session*>(s.get());
6005 auto p = heartbeat_peers.find(session->peer);
7c673cae
FG
6006 if (p != heartbeat_peers.end() &&
6007 (p->second.con_back == con ||
6008 p->second.con_front == con)) {
6009 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6010 << ", reopening" << dendl;
9f95a23c 6011 p->second.clear_mark_down(con);
7c673cae
FG
6012 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
6013 if (newcon.first) {
6014 p->second.con_back = newcon.first.get();
11fdf7f2 6015 p->second.con_back->set_priv(s);
7c673cae
FG
6016 if (newcon.second) {
6017 p->second.con_front = newcon.second.get();
11fdf7f2 6018 p->second.con_front->set_priv(s);
7c673cae 6019 }
11fdf7f2 6020 p->second.ping_history.clear();
7c673cae
FG
6021 } else {
6022 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6023 << ", raced with osdmap update, closing out peer" << dendl;
6024 heartbeat_peers.erase(p);
6025 }
6026 } else {
6027 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
6028 }
7c673cae
FG
6029 }
6030 return true;
6031}
6032
6033
6034
6035// =========================================
6036
6037void OSD::tick()
6038{
9f95a23c 6039 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6040 dout(10) << "tick" << dendl;
6041
9f95a23c
TL
6042 utime_t now = ceph_clock_now();
6043 // throw out any obsolete markdown log
6044 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6045 while (!osd_markdown_log.empty() &&
6046 osd_markdown_log.front() + grace < now)
6047 osd_markdown_log.pop_front();
6048
7c673cae
FG
6049 if (is_active() || is_waiting_for_healthy()) {
6050 maybe_update_heartbeat_peers();
6051 }
6052
6053 if (is_waiting_for_healthy()) {
6054 start_boot();
494da23a
TL
6055 }
6056
6057 if (is_waiting_for_healthy() || is_booting()) {
6058 std::lock_guard l(heartbeat_lock);
494da23a
TL
6059 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
6060 last_mon_heartbeat = now;
6061 dout(1) << __func__ << " checking mon for new map" << dendl;
9f95a23c 6062 osdmap_subscribe(get_osdmap_epoch() + 1, false);
11fdf7f2 6063 }
7c673cae
FG
6064 }
6065
6066 do_waiters();
6067
9f95a23c
TL
6068 // scrub purged_snaps every deep scrub interval
6069 {
6070 const utime_t last = superblock.last_purged_snaps_scrub;
6071 utime_t next = last;
6072 next += cct->_conf->osd_scrub_min_interval;
6073 std::mt19937 rng;
6074 // use a seed that is stable for each scrub interval, but varies
6075 // by OSD to avoid any herds.
6076 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
20effc67 6077 double r = (rng() % 1024) / 1024.0;
9f95a23c
TL
6078 next +=
6079 cct->_conf->osd_scrub_min_interval *
6080 cct->_conf->osd_scrub_interval_randomize_ratio * r;
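// the next purged_snaps scrub is due osd_scrub_min_interval after the last
// one, plus a per-OSD deterministic random slack of up to
// osd_scrub_interval_randomize_ratio * osd_scrub_min_interval.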
6081 if (next < ceph_clock_now()) {
6082 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6083 << " next " << next << " ... now" << dendl;
6084 scrub_purged_snaps();
6085 } else {
6086 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6087 << " next " << next << dendl;
6088 }
6089 }
6090
91327a77 6091 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
6092}
6093
6094void OSD::tick_without_osd_lock()
6095{
9f95a23c 6096 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
7c673cae
FG
6097 dout(10) << "tick_without_osd_lock" << dendl;
6098
f67539c2
TL
6099 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
6100 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
6101 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
11fdf7f2
TL
6102
6103 // refresh osd stats
6104 struct store_statfs_t stbuf;
6105 osd_alert_list_t alerts;
6106 int r = store->statfs(&stbuf, &alerts);
6107 ceph_assert(r == 0);
6108 service.set_statfs(stbuf, alerts);
7c673cae
FG
6109
6110 // osd_lock is not being held, which means the OSD state
6111 // might change when doing the monitor report
6112 if (is_active() || is_waiting_for_healthy()) {
9f95a23c
TL
6113 {
6114 std::lock_guard l{heartbeat_lock};
6115 heartbeat_check();
6116 }
6117 map_lock.lock_shared();
11fdf7f2 6118 std::lock_guard l(mon_report_lock);
7c673cae
FG
6119
6120 // mon report?
7c673cae 6121 utime_t now = ceph_clock_now();
11fdf7f2
TL
6122 if (service.need_fullness_update() ||
6123 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
7c673cae 6124 last_mon_report = now;
7c673cae
FG
6125 send_full_update();
6126 send_failures();
7c673cae 6127 }
9f95a23c 6128 map_lock.unlock_shared();
11fdf7f2
TL
6129
6130 epoch_t max_waiting_epoch = 0;
6131 for (auto s : shards) {
6132 max_waiting_epoch = std::max(max_waiting_epoch,
6133 s->get_max_waiting_epoch());
6134 }
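// if any shard has queued work stamped with an epoch newer than our current
// map, solicit newer maps from the monitor so that work can make progress.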
6135 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6136 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6137 << ", requesting new map" << dendl;
6138 osdmap_subscribe(superblock.newest_map + 1, false);
6139 }
7c673cae
FG
6140 }
6141
6142 if (is_active()) {
6143 if (!scrub_random_backoff()) {
6144 sched_scrub();
6145 }
6146 service.promote_throttle_recalibrate();
3efd9988 6147 resume_creating_pg();
224ce89b
WB
6148 bool need_send_beacon = false;
6149 const auto now = ceph::coarse_mono_clock::now();
6150 {
6151 // borrow the min_last_epoch_clean lock to protect last_sent_beacon from changing
11fdf7f2 6152 std::lock_guard l{min_last_epoch_clean_lock};
224ce89b 6153 const auto elapsed = now - last_sent_beacon;
f67539c2 6154 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
224ce89b
WB
6155 cct->_conf->osd_beacon_report_interval) {
6156 need_send_beacon = true;
6157 }
6158 }
6159 if (need_send_beacon) {
6160 send_beacon(now);
6161 }
7c673cae
FG
6162 }
6163
11fdf7f2 6164 mgrc.update_daemon_health(get_health_metrics());
7c673cae 6165 service.kick_recovery_queue();
91327a77
AA
6166 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6167 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
6168}
6169
7c673cae
FG
6170// Usage:
6171// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6172// rmomapkey <pool-id> [namespace/]<obj-name> <key>
6173// setomapheader <pool-id> [namespace/]<obj-name> <header>
6174// getomap <pool> [namespace/]<obj-name>
6175// truncobj <pool-id> [namespace/]<obj-name> <newlen>
6176// injectmdataerr [namespace/]<obj-name> [shardid]
6177// injectdataerr [namespace/]<obj-name> [shardid]
6178//
6179// set_recovery_delay [utime]
6180void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
6181 std::string_view command,
6182 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
6183{
6184 //Test support
6185 //Support changing the omap on a single osd by using the Admin Socket to
6186 //directly request the osd make a change.
6187 if (command == "setomapval" || command == "rmomapkey" ||
6188 command == "setomapheader" || command == "getomap" ||
6189 command == "truncobj" || command == "injectmdataerr" ||
6190 command == "injectdataerr"
6191 ) {
6192 pg_t rawpg;
6193 int64_t pool;
6194 OSDMapRef curmap = service->get_osdmap();
6195 int r = -1;
6196
6197 string poolstr;
6198
9f95a23c 6199 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
6200 pool = curmap->lookup_pg_pool_name(poolstr);
6201 //If we can't find it by name then maybe id specified
6202 if (pool < 0 && isdigit(poolstr[0]))
6203 pool = atoll(poolstr.c_str());
6204 if (pool < 0) {
b5b8bbf5 6205 ss << "Invalid pool '" << poolstr << "'";
7c673cae
FG
6206 return;
6207 }
6208
6209 string objname, nspace;
9f95a23c 6210 cmd_getval(cmdmap, "objname", objname);
7c673cae
FG
6211 std::size_t found = objname.find_first_of('/');
6212 if (found != string::npos) {
6213 nspace = objname.substr(0, found);
6214 objname = objname.substr(found+1);
6215 }
6216 object_locator_t oloc(pool, nspace);
6217 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6218
6219 if (r < 0) {
6220 ss << "Invalid namespace/objname";
6221 return;
6222 }
6223
20effc67 6224 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
7c673cae
FG
6225 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6226 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6227 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6228 if (curmap->pg_is_ec(rawpg)) {
6229 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6230 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6231 return;
6232 }
6233 }
6234
6235 ObjectStore::Transaction t;
6236
6237 if (command == "setomapval") {
6238 map<string, bufferlist> newattrs;
6239 bufferlist val;
6240 string key, valstr;
9f95a23c
TL
6241 cmd_getval(cmdmap, "key", key);
6242 cmd_getval(cmdmap, "val", valstr);
7c673cae
FG
6243
6244 val.append(valstr);
6245 newattrs[key] = val;
6246 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 6247 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6248 if (r < 0)
6249 ss << "error=" << r;
6250 else
6251 ss << "ok";
6252 } else if (command == "rmomapkey") {
6253 string key;
9f95a23c 6254 cmd_getval(cmdmap, "key", key);
7c673cae 6255
9f95a23c 6256 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
11fdf7f2 6257 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6258 if (r < 0)
6259 ss << "error=" << r;
6260 else
6261 ss << "ok";
6262 } else if (command == "setomapheader") {
6263 bufferlist newheader;
6264 string headerstr;
6265
9f95a23c 6266 cmd_getval(cmdmap, "header", headerstr);
7c673cae
FG
6267 newheader.append(headerstr);
6268 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 6269 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6270 if (r < 0)
6271 ss << "error=" << r;
6272 else
6273 ss << "ok";
6274 } else if (command == "getomap") {
6275 //Debug: Output entire omap
6276 bufferlist hdrbl;
6277 map<string, bufferlist> keyvals;
11fdf7f2
TL
6278 auto ch = store->open_collection(coll_t(pgid));
6279 if (!ch) {
6280 ss << "unable to open collection for " << pgid;
6281 r = -ENOENT;
6282 } else {
6283 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6284 if (r >= 0) {
7c673cae
FG
6285 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6286 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 6287 it != keyvals.end(); ++it)
7c673cae
FG
6288 ss << " key=" << (*it).first << " val="
6289 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 6290 } else {
7c673cae 6291 ss << "error=" << r;
11fdf7f2 6292 }
7c673cae
FG
6293 }
6294 } else if (command == "truncobj") {
6295 int64_t trunclen;
9f95a23c 6296 cmd_getval(cmdmap, "len", trunclen);
7c673cae 6297 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 6298 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6299 if (r < 0)
6300 ss << "error=" << r;
6301 else
6302 ss << "ok";
6303 } else if (command == "injectdataerr") {
6304 store->inject_data_error(gobj);
6305 ss << "ok";
6306 } else if (command == "injectmdataerr") {
6307 store->inject_mdata_error(gobj);
6308 ss << "ok";
6309 }
6310 return;
6311 }
6312 if (command == "set_recovery_delay") {
20effc67 6313 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
7c673cae
FG
6314 ostringstream oss;
6315 oss << delay;
11fdf7f2 6316 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
6317 oss.str().c_str());
6318 if (r != 0) {
6319 ss << "set_recovery_delay: error setting "
6320 << "osd_recovery_delay_start to '" << delay << "': error "
6321 << r;
6322 return;
6323 }
11fdf7f2 6324 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
6325 ss << "set_recovery_delay: set osd_recovery_delay_start "
6326 << "to " << service->cct->_conf->osd_recovery_delay_start;
6327 return;
6328 }
7c673cae 6329 if (command == "injectfull") {
20effc67
TL
6330 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6331 string type = cmd_getval_or<string>(cmdmap, "type", "full");
7c673cae 6332 OSDService::s_names state;
20effc67 6333
7c673cae
FG
6334 if (type == "none" || count == 0) {
6335 type = "none";
6336 count = 0;
6337 }
6338 state = service->get_full_state(type);
6339 if (state == OSDService::s_names::INVALID) {
6340 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6341 return;
6342 }
6343 service->set_injectfull(state, count);
6344 return;
6345 }
6346 ss << "Internal error - command=" << command;
6347}
6348
7c673cae
FG
6349// =========================================
6350
6351void OSD::ms_handle_connect(Connection *con)
6352{
6353 dout(10) << __func__ << " con " << con << dendl;
6354 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 6355 std::lock_guard l(osd_lock);
7c673cae
FG
6356 if (is_stopping())
6357 return;
6358 dout(10) << __func__ << " on mon" << dendl;
6359
6360 if (is_preboot()) {
6361 start_boot();
6362 } else if (is_booting()) {
6363 _send_boot(); // resend boot message
6364 } else {
9f95a23c 6365 map_lock.lock_shared();
11fdf7f2 6366 std::lock_guard l2(mon_report_lock);
7c673cae
FG
6367
6368 utime_t now = ceph_clock_now();
6369 last_mon_report = now;
6370
6371 // resend everything, it's a new session
6372 send_full_update();
6373 send_alive();
6374 service.requeue_pg_temp();
11fdf7f2 6375 service.clear_sent_ready_to_merge();
7c673cae 6376 service.send_pg_temp();
11fdf7f2
TL
6377 service.send_ready_to_merge();
6378 service.send_pg_created();
7c673cae
FG
6379 requeue_failures();
6380 send_failures();
7c673cae 6381
9f95a23c 6382 map_lock.unlock_shared();
7c673cae
FG
6383 if (is_active()) {
6384 send_beacon(ceph::coarse_mono_clock::now());
6385 }
6386 }
6387
6388 // full map requests may happen while active or pre-boot
6389 if (requested_full_first) {
6390 rerequest_full_maps();
6391 }
6392 }
6393}
6394
6395void OSD::ms_handle_fast_connect(Connection *con)
6396{
6397 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6398 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6399 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6400 s = ceph::make_ref<Session>(cct, con);
6401 con->set_priv(s);
7c673cae
FG
6402 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6403 << " addr=" << s->con->get_peer_addr() << dendl;
6404 // we don't connect to clients
11fdf7f2 6405 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6406 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6407 }
7c673cae
FG
6408 }
6409}
6410
6411void OSD::ms_handle_fast_accept(Connection *con)
6412{
6413 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6414 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6415 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6416 s = ceph::make_ref<Session>(cct, con);
6417 con->set_priv(s);
7c673cae
FG
6418 dout(10) << "new session (incoming)" << s << " con=" << con
6419 << " addr=" << con->get_peer_addr()
6420 << " must have raced with connect" << dendl;
11fdf7f2 6421 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6422 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6423 }
7c673cae
FG
6424 }
6425}
6426
6427bool OSD::ms_handle_reset(Connection *con)
6428{
9f95a23c
TL
6429 auto session = ceph::ref_cast<Session>(con->get_priv());
6430 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
7c673cae
FG
6431 if (!session)
6432 return false;
6433 session->wstate.reset(con);
11fdf7f2
TL
6434 session->con->set_priv(nullptr);
6435 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
6436 // note that we break session->con *before* the session_handle_reset
6437 // cleanup below. this avoids a race between us and
6438 // PG::add_backoff, Session::check_backoff, etc.
9f95a23c 6439 session_handle_reset(session);
7c673cae
FG
6440 return true;
6441}
6442
6443bool OSD::ms_handle_refused(Connection *con)
6444{
6445 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6446 return false;
6447
9f95a23c
TL
6448 auto session = ceph::ref_cast<Session>(con->get_priv());
6449 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
7c673cae
FG
6450 if (!session)
6451 return false;
6452 int type = con->get_peer_type();
6453 // handle only OSD failures here
6454 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6455 OSDMapRef osdmap = get_osdmap();
6456 if (osdmap) {
6457 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6458 if (id >= 0 && osdmap->is_up(id)) {
6459 // I'm cheating mon heartbeat grace logic, because we know it's not going
6460 // to respawn alone. +1 so we won't hit any boundary case.
11fdf7f2
TL
6461 monc->send_mon_message(
6462 new MOSDFailure(
6463 monc->get_fsid(),
6464 id,
6465 osdmap->get_addrs(id),
6466 cct->_conf->osd_heartbeat_grace + 1,
6467 osdmap->get_epoch(),
6468 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6469 ));
7c673cae
FG
6470 }
6471 }
6472 }
7c673cae
FG
6473 return true;
6474}
6475
f67539c2 6476struct CB_OSD_GetVersion {
7c673cae 6477 OSD *osd;
f67539c2
TL
6478 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6479 void operator ()(boost::system::error_code ec, version_t newest,
6480 version_t oldest) {
6481 if (!ec)
7c673cae
FG
6482 osd->_got_mon_epochs(oldest, newest);
6483 }
6484};
6485
6486void OSD::start_boot()
6487{
6488 if (!_is_healthy()) {
6489 // if we are not healthy, do not mark ourselves up (yet)
6490 dout(1) << "not healthy; waiting to boot" << dendl;
6491 if (!is_waiting_for_healthy())
6492 start_waiting_for_healthy();
6493 // send pings sooner rather than later
6494 heartbeat_kick();
6495 return;
6496 }
6497 dout(1) << __func__ << dendl;
6498 set_state(STATE_PREBOOT);
6499 dout(10) << "start_boot - have maps " << superblock.oldest_map
6500 << ".." << superblock.newest_map << dendl;
f67539c2 6501 monc->get_version("osdmap", CB_OSD_GetVersion(this));
7c673cae
FG
6502}
6503
6504void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6505{
11fdf7f2 6506 std::lock_guard l(osd_lock);
7c673cae
FG
6507 if (is_preboot()) {
6508 _preboot(oldest, newest);
6509 }
6510}
6511
6512void OSD::_preboot(epoch_t oldest, epoch_t newest)
6513{
11fdf7f2 6514 ceph_assert(is_preboot());
7c673cae
FG
6515 dout(10) << __func__ << " _preboot mon has osdmaps "
6516 << oldest << ".." << newest << dendl;
6517
6518 // ensure our local fullness awareness is accurate
81eedcae
TL
6519 {
6520 std::lock_guard l(heartbeat_lock);
6521 heartbeat();
6522 }
7c673cae 6523
9f95a23c
TL
6524 const auto& monmap = monc->monmap;
6525 const auto osdmap = get_osdmap();
7c673cae 6526 // if our map within recent history, try to add ourselves to the osdmap.
31f18b77
FG
6527 if (osdmap->get_epoch() == 0) {
6528 derr << "waiting for initial osdmap" << dendl;
c07f9fc5 6529 } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
6530 derr << "osdmap says I am destroyed" << dendl;
6531 // provide a small margin so we don't livelock seeing if we
6532 // un-destroyed ourselves.
6533 if (osdmap->get_epoch() > newest - 1) {
6534 exit(0);
6535 }
81eedcae 6536 } else if (osdmap->is_noup(whoami)) {
7c673cae
FG
6537 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6538 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6539 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6540 << dendl;
7c673cae
FG
6541 } else if (service.need_fullness_update()) {
6542 derr << "osdmap fullness state needs update" << dendl;
6543 send_full_update();
9f95a23c
TL
6544 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6545 superblock.purged_snaps_last < superblock.current_epoch) {
6546 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6547 << " < newest_map " << superblock.current_epoch << dendl;
6548 _get_purged_snaps();
7c673cae
FG
6549 } else if (osdmap->get_epoch() >= oldest - 1 &&
6550 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
11fdf7f2
TL
6551
6552 // wait for pgs to fully catch up in a different thread, since
6553 // this thread might be required for splitting and merging PGs to
6554 // make progress.
6555 boot_finisher.queue(
9f95a23c 6556 new LambdaContext(
11fdf7f2 6557 [this](int r) {
9f95a23c 6558 std::unique_lock l(osd_lock);
11fdf7f2
TL
6559 if (is_preboot()) {
6560 dout(10) << __func__ << " waiting for peering work to drain"
6561 << dendl;
9f95a23c 6562 l.unlock();
11fdf7f2 6563 for (auto shard : shards) {
9f95a23c 6564 shard->wait_min_pg_epoch(get_osdmap_epoch());
11fdf7f2 6565 }
9f95a23c 6566 l.lock();
11fdf7f2
TL
6567 }
6568 if (is_preboot()) {
6569 _send_boot();
6570 }
6571 }));
6572 return;
7c673cae
FG
6573 }
6574
6575 // get all the latest maps
6576 if (osdmap->get_epoch() + 1 >= oldest)
6577 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6578 else
6579 osdmap_subscribe(oldest - 1, true);
6580}
6581
9f95a23c
TL
6582void OSD::_get_purged_snaps()
6583{
6584 // NOTE: this is a naive, stateless implementation. It may send multiple
6585 // overlapping requests to the mon, which will be somewhat inefficient, but
6586 // it should be reliable.
6587 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6588 << ", newest_map " << superblock.current_epoch << dendl;
6589 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6590 superblock.purged_snaps_last + 1,
6591 superblock.current_epoch + 1);
6592 monc->send_mon_message(m);
6593}
6594
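// handle the mon's reply: ignore it if we are no longer in preboot or the
// reply is stale; otherwise persist the purged snaps next to the snap mapper
// data, advance superblock.purged_snaps_last, and either request the next
// chunk or resume booting once we have caught up to current_epoch.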
6595void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6596{
6597 dout(10) << __func__ << " " << *m << dendl;
6598 ObjectStore::Transaction t;
6599 if (!is_preboot() ||
6600 m->last < superblock.purged_snaps_last) {
6601 goto out;
6602 }
20effc67 6603 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
9f95a23c
TL
6604 make_purged_snaps_oid(), &t,
6605 m->purged_snaps);
6606 superblock.purged_snaps_last = m->last;
6607 write_superblock(t);
6608 store->queue_transaction(
6609 service.meta_ch,
6610 std::move(t));
6611 service.publish_superblock(superblock);
6612 if (m->last < superblock.current_epoch) {
6613 _get_purged_snaps();
6614 } else {
6615 start_boot();
6616 }
6617out:
6618 m->put();
6619}
6620
7c673cae
FG
6621void OSD::send_full_update()
6622{
6623 if (!service.need_fullness_update())
6624 return;
6625 unsigned state = 0;
6626 if (service.is_full()) {
6627 state = CEPH_OSD_FULL;
6628 } else if (service.is_backfillfull()) {
6629 state = CEPH_OSD_BACKFILLFULL;
6630 } else if (service.is_nearfull()) {
6631 state = CEPH_OSD_NEARFULL;
6632 }
6633 set<string> s;
6634 OSDMap::calc_state_set(state, s);
6635 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6636 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6637}
6638
6639void OSD::start_waiting_for_healthy()
6640{
6641 dout(1) << "start_waiting_for_healthy" << dendl;
6642 set_state(STATE_WAITING_FOR_HEALTHY);
6643 last_heartbeat_resample = utime_t();
181888fb
FG
6644
6645 // subscribe to osdmap updates, in case our peers really are known to be dead
9f95a23c 6646 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
6647}
6648
6649bool OSD::_is_healthy()
6650{
6651 if (!cct->get_heartbeat_map()->is_healthy()) {
6652 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6653 return false;
6654 }
6655
6656 if (is_waiting_for_healthy()) {
11fdf7f2 6657 utime_t now = ceph_clock_now();
9f95a23c
TL
6658 if (osd_markdown_log.empty()) {
6659 dout(5) << __func__ << " force returning true since last markdown"
6660 << " was " << cct->_conf->osd_max_markdown_period
6661 << "s ago" << dendl;
11fdf7f2
TL
6662 return true;
6663 }
6664 std::lock_guard l(heartbeat_lock);
7c673cae
FG
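  // count how many heartbeat peers currently look healthy; if fewer than
  // osd_heartbeat_min_healthy_ratio of them are up, report this OSD as
  // unhealthy.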
6665 int num = 0, up = 0;
6666 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6667 p != heartbeat_peers.end();
6668 ++p) {
11fdf7f2 6669 if (p->second.is_healthy(now))
7c673cae
FG
6670 ++up;
6671 ++num;
6672 }
6673 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6674 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6675 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6676 return false;
6677 }
6678 }
6679
6680 return true;
6681}
6682
6683void OSD::_send_boot()
6684{
6685 dout(10) << "_send_boot" << dendl;
11fdf7f2
TL
6686 Connection *local_connection =
6687 cluster_messenger->get_loopback_connection().get();
6688 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6689 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6690 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6691 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6692
6693 dout(20) << " initial client_addrs " << client_addrs
6694 << ", cluster_addrs " << cluster_addrs
6695 << ", hb_back_addrs " << hb_back_addrs
6696 << ", hb_front_addrs " << hb_front_addrs
6697 << dendl;
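  // some messengers may not have learned their own addresses yet; fill the
  // unknown entries from the addresses we do know so the MOSDBoot below
  // advertises usable addrs for all four endpoints.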
6698 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6699 dout(10) << " assuming cluster_addrs match client_addrs "
6700 << client_addrs << dendl;
6701 cluster_addrs = cluster_messenger->get_myaddrs();
6702 }
6703 if (auto session = local_connection->get_priv(); !session) {
6704 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6705 }
6706
7c673cae 6707 local_connection = hb_back_server_messenger->get_loopback_connection().get();
11fdf7f2
TL
6708 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6709 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6710 << cluster_addrs << dendl;
6711 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
7c673cae 6712 }
11fdf7f2
TL
6713 if (auto session = local_connection->get_priv(); !session) {
6714 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6715 }
6716
11fdf7f2
TL
6717 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6718 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6719 dout(10) << " assuming hb_front_addrs match client_addrs "
6720 << client_addrs << dendl;
6721 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6722 }
6723 if (auto session = local_connection->get_priv(); !session) {
6724 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6725 }
6726
6727 // we now know what our front and back addrs will be, and we are
6728 // about to tell the mon what our metadata (including numa bindings)
6729 // is, so now is a good time to set the numa affinity.
6730 set_numa_affinity();
6731
6732 MOSDBoot *mboot = new MOSDBoot(
6733 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6734 hb_back_addrs, hb_front_addrs, cluster_addrs,
6735 CEPH_FEATURES_ALL);
6736 dout(10) << " final client_addrs " << client_addrs
6737 << ", cluster_addrs " << cluster_addrs
6738 << ", hb_back_addrs " << hb_back_addrs
6739 << ", hb_front_addrs " << hb_front_addrs
7c673cae
FG
6740 << dendl;
6741 _collect_metadata(&mboot->metadata);
6742 monc->send_mon_message(mboot);
6743 set_state(STATE_BOOTING);
6744}
6745
6746void OSD::_collect_metadata(map<string,string> *pm)
6747{
6748 // config info
6749 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6750 if (store->get_type() == "filestore") {
6751 // not applicable for bluestore
6752 (*pm)["osd_journal"] = journal_path;
6753 }
11fdf7f2
TL
6754 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6755 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6756 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6757 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
7c673cae
FG
6758
6759 // backend
6760 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6761 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6762 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6763 (*pm)["default_device_class"] = store->get_default_device_class();
f6b5b4d7
TL
6764 string osdspec_affinity;
6765 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6766 if (r < 0 || osdspec_affinity.empty()) {
6767 osdspec_affinity = "";
6768 }
6769 (*pm)["osdspec_affinity"] = osdspec_affinity;
7c673cae
FG
6770 store->collect_metadata(pm);
6771
6772 collect_sys_info(pm, cct);
6773
11fdf7f2
TL
6774 (*pm)["front_iface"] = pick_iface(
6775 cct,
6776 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6777 (*pm)["back_iface"] = pick_iface(
6778 cct,
6779 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6780
6781 // network numa
6782 {
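  // look up which NUMA node each of the front/back interfaces lives on; we
  // only report a single network_numa_node when both map to the same, known
  // node.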
6783 int node = -1;
6784 set<int> nodes;
6785 set<string> unknown;
6786 for (auto nm : { "front_iface", "back_iface" }) {
6787 if (!(*pm)[nm].size()) {
6788 unknown.insert(nm);
6789 continue;
6790 }
6791 int n = -1;
6792 int r = get_iface_numa_node((*pm)[nm], &n);
6793 if (r < 0) {
6794 unknown.insert((*pm)[nm]);
6795 continue;
6796 }
6797 nodes.insert(n);
6798 if (node < 0) {
6799 node = n;
6800 }
6801 }
6802 if (unknown.size()) {
6803 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6804 }
6805 if (!nodes.empty()) {
6806 (*pm)["network_numa_nodes"] = stringify(nodes);
6807 }
6808 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6809 (*pm)["network_numa_node"] = stringify(node);
6810 }
6811 }
6812
6813 if (numa_node >= 0) {
6814 (*pm)["numa_node"] = stringify(numa_node);
6815 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6816 &numa_cpu_set);
6817 }
6818
6819 set<string> devnames;
6820 store->get_devices(&devnames);
9f95a23c
TL
6821 map<string,string> errs;
6822 get_device_metadata(devnames, pm, &errs);
6823 for (auto& i : errs) {
6824 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
11fdf7f2 6825 }
7c673cae
FG
6826 dout(10) << __func__ << " " << *pm << dendl;
6827}
6828
6829void OSD::queue_want_up_thru(epoch_t want)
6830{
9f95a23c
TL
6831 std::shared_lock map_locker{map_lock};
6832 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6833 std::lock_guard report_locker(mon_report_lock);
7c673cae
FG
6834 if (want > up_thru_wanted) {
6835 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6836 << ", currently " << cur
6837 << dendl;
6838 up_thru_wanted = want;
6839 send_alive();
6840 } else {
6841 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6842 << ", currently " << cur
6843 << dendl;
6844 }
7c673cae
FG
6845}
6846
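// send_alive asks the mon to record a newer up_thru for us in the osdmap;
// peering uses up_thru to decide whether a past interval may have gone
// active, so we only send it when up_thru_wanted is ahead of the map.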
6847void OSD::send_alive()
6848{
9f95a23c
TL
6849 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6850 const auto osdmap = get_osdmap();
7c673cae
FG
6851 if (!osdmap->exists(whoami))
6852 return;
6853 epoch_t up_thru = osdmap->get_up_thru(whoami);
6854 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6855 if (up_thru_wanted > up_thru) {
6856 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6857 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6858 }
6859}
6860
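// track the span of full maps already requested from the mon in
// requested_full_[first,last] and widen it as new requests arrive; e.g. if we
// previously asked for 10..20 and now need up to 25, only 21..25 is requested
// (epoch numbers here are purely illustrative).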
6861void OSD::request_full_map(epoch_t first, epoch_t last)
6862{
6863 dout(10) << __func__ << " " << first << ".." << last
6864 << ", previously requested "
6865 << requested_full_first << ".." << requested_full_last << dendl;
9f95a23c 6866 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
6867 ceph_assert(first > 0 && last > 0);
6868 ceph_assert(first <= last);
6869 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6870 if (requested_full_first == 0) {
6871 // first request
6872 requested_full_first = first;
6873 requested_full_last = last;
6874 } else if (last <= requested_full_last) {
6875 // dup
6876 return;
6877 } else {
6878 // additional request
6879 first = requested_full_last + 1;
6880 requested_full_last = last;
6881 }
6882 MMonGetOSDMap *req = new MMonGetOSDMap;
6883 req->request_full(first, last);
6884 monc->send_mon_message(req);
6885}
6886
6887void OSD::got_full_map(epoch_t e)
6888{
11fdf7f2 6889 ceph_assert(requested_full_first <= requested_full_last);
9f95a23c 6890 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6891 if (requested_full_first == 0) {
6892 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6893 return;
6894 }
6895 if (e < requested_full_first) {
6896 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6897 << ".." << requested_full_last
6898 << ", ignoring" << dendl;
6899 return;
6900 }
6901 if (e >= requested_full_last) {
6902 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6903 << ".." << requested_full_last << ", resetting" << dendl;
6904 requested_full_first = requested_full_last = 0;
6905 return;
6906 }
f67539c2 6907
7c673cae
FG
6908 requested_full_first = e + 1;
6909
6910 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6911 << ".." << requested_full_last
6912 << ", still need more" << dendl;
6913}
6914
6915void OSD::requeue_failures()
6916{
11fdf7f2 6917 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6918 unsigned old_queue = failure_queue.size();
6919 unsigned old_pending = failure_pending.size();
11fdf7f2 6920 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6921 failure_queue[p->first] = p->second.first;
6922 failure_pending.erase(p++);
6923 }
6924 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6925 << failure_queue.size() << dendl;
6926}
6927
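// report each OSD in failure_queue to the mon as failed, and remember it in
// failure_pending so the report can be cancelled with a FLAG_ALIVE message
// (see send_still_alive / cancel_pending_failures) if the peer turns out to
// be alive after all.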
6928void OSD::send_failures()
6929{
9f95a23c
TL
6930 ceph_assert(ceph_mutex_is_locked(map_lock));
6931 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
11fdf7f2 6932 std::lock_guard l(heartbeat_lock);
7c673cae 6933 utime_t now = ceph_clock_now();
9f95a23c 6934 const auto osdmap = get_osdmap();
7c673cae
FG
6935 while (!failure_queue.empty()) {
6936 int osd = failure_queue.begin()->first;
7c673cae
FG
6937 if (!failure_pending.count(osd)) {
6938 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6939 monc->send_mon_message(
6940 new MOSDFailure(
6941 monc->get_fsid(),
6942 osd,
6943 osdmap->get_addrs(osd),
6944 failed_for,
6945 osdmap->get_epoch()));
6946 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6947 osdmap->get_addrs(osd));
7c673cae
FG
6948 }
6949 failure_queue.erase(osd);
6950 }
6951}
6952
11fdf7f2 6953void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 6954{
11fdf7f2
TL
6955 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6956 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
6957 monc->send_mon_message(m);
6958}
6959
11fdf7f2 6960void OSD::cancel_pending_failures()
7c673cae 6961{
11fdf7f2
TL
6962 std::lock_guard l(heartbeat_lock);
6963 auto it = failure_pending.begin();
6964 while (it != failure_pending.end()) {
6965 dout(10) << __func__ << " canceling in-flight failure report for osd."
6966 << it->first << dendl;
9f95a23c 6967 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 6968 failure_pending.erase(it++);
7c673cae 6969 }
7c673cae
FG
6970}
6971
6972void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6973{
6974 const auto& monmap = monc->monmap;
6975 // send beacon to mon even if we are just connected, and the monmap is not
6976 // initialized yet by then.
6977 if (monmap.epoch > 0 &&
6978 monmap.get_required_features().contains_all(
6979 ceph::features::mon::FEATURE_LUMINOUS)) {
6980 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6981 MOSDBeacon* beacon = nullptr;
6982 {
11fdf7f2 6983 std::lock_guard l{min_last_epoch_clean_lock};
9f95a23c
TL
6984 beacon = new MOSDBeacon(get_osdmap_epoch(),
6985 min_last_epoch_clean,
f67539c2
TL
6986 superblock.last_purged_snaps_scrub,
6987 cct->_conf->osd_beacon_report_interval);
494da23a 6988 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 6989 last_sent_beacon = now;
7c673cae
FG
6990 }
6991 monc->send_mon_message(beacon);
6992 } else {
6993 dout(20) << __func__ << " not sending" << dendl;
6994 }
6995}
6996
7c673cae
FG
6997void OSD::handle_command(MCommand *m)
6998{
6999 ConnectionRef con = m->get_connection();
9f95a23c 7000 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 7001 if (!session) {
9f95a23c 7002 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
7003 m->put();
7004 return;
7005 }
9f95a23c
TL
7006 if (!session->caps.allow_all()) {
7007 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
7008 m->put();
7009 return;
7010 }
9f95a23c 7011 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
7012 m->put();
7013}
7014
f64942e4
AA
7015namespace {
7016 class unlock_guard {
9f95a23c 7017 ceph::mutex& m;
f64942e4 7018 public:
9f95a23c 7019 explicit unlock_guard(ceph::mutex& mutex)
f64942e4
AA
7020 : m(mutex)
7021 {
11fdf7f2 7022 m.unlock();
f64942e4
AA
7023 }
7024 unlock_guard(unlock_guard&) = delete;
7025 ~unlock_guard() {
11fdf7f2 7026 m.lock();
f64942e4
AA
7027 }
7028 };
7029}
7030
9f95a23c 7031void OSD::scrub_purged_snaps()
7c673cae 7032{
9f95a23c
TL
7033 dout(10) << __func__ << dendl;
7034 ceph_assert(ceph_mutex_is_locked(osd_lock));
20effc67 7035 SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
9f95a23c
TL
7036 make_snapmapper_oid(),
7037 make_purged_snaps_oid());
7038 clog->debug() << "purged_snaps scrub starts";
7039 osd_lock.unlock();
7040 s.run();
7041 if (s.stray.size()) {
7042 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
7043 } else {
7044 clog->debug() << "purged_snaps scrub ok";
224ce89b 7045 }
9f95a23c
TL
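  // for each stray snap mapping the scrubber found, requeue a snap retrim on
  // the owning PG, at most once per (pg, snap) pair, so any objects left over
  // from that purged snap are cleaned up.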
7046 set<pair<spg_t,snapid_t>> queued;
7047 for (auto& [pool, snap, hash, shard] : s.stray) {
7048 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
7049 if (!pi) {
7050 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
7051 continue;
11fdf7f2 7052 }
9f95a23c
TL
7053 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
7054 spg_t spgid(pgid, shard);
7055 pair<spg_t,snapid_t> p(spgid, snap);
7056 if (queued.count(p)) {
7057 dout(20) << __func__ << " pg " << spgid << " snap " << snap
7058 << " already queued" << dendl;
7059 continue;
11fdf7f2 7060 }
9f95a23c
TL
7061 PGRef pg = lookup_lock_pg(spgid);
7062 if (!pg) {
7063 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
7064 continue;
11fdf7f2 7065 }
9f95a23c
TL
7066 queued.insert(p);
7067 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
7068 << snap << dendl;
7069 pg->queue_snap_retrim(snap);
7070 pg->unlock();
7c673cae 7071 }
9f95a23c
TL
7072 osd_lock.lock();
7073 if (is_stopping()) {
7074 return;
7075 }
7076 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
7077 ObjectStore::Transaction t;
7078 superblock.last_purged_snaps_scrub = ceph_clock_now();
7079 write_superblock(t);
7080 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7081 ceph_assert(tr == 0);
7082 if (is_active()) {
7083 send_beacon(ceph::coarse_mono_clock::now());
7084 }
7085 dout(10) << __func__ << " done" << dendl;
11fdf7f2
TL
7086}
7087
7088void OSD::probe_smart(const string& only_devid, ostream& ss)
7089{
7090 set<string> devnames;
7091 store->get_devices(&devnames);
7092 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7093 "osd_smart_report_timeout");
7094
7095 // == typedef std::map<std::string, mValue> mObject;
7096 json_spirit::mObject json_map;
7097
7098 for (auto dev : devnames) {
7099 // smartctl works only on physical devices; filter out any logical device
7100 if (dev.find("dm-") == 0) {
7101 continue;
7102 }
7103
7104 string err;
7105 string devid = get_device_id(dev, &err);
7106 if (devid.size() == 0) {
7107 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7108 << err << "), skipping" << dendl;
7109 continue;
7110 }
7111 if (only_devid.size() && devid != only_devid) {
7112 continue;
7113 }
7114
7115 json_spirit::mValue smart_json;
7116 if (block_device_get_metrics(dev, smart_timeout,
7117 &smart_json)) {
7118 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7119 continue;
7120 }
7121 json_map[devid] = smart_json;
7c673cae 7122 }
11fdf7f2 7123 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
7124}
7125
7126bool OSD::heartbeat_dispatch(Message *m)
7127{
7128 dout(30) << "heartbeat_dispatch " << m << dendl;
7129 switch (m->get_type()) {
7130
7131 case CEPH_MSG_PING:
7132 dout(10) << "ping from " << m->get_source_inst() << dendl;
7133 m->put();
7134 break;
7135
7136 case MSG_OSD_PING:
7137 handle_osd_ping(static_cast<MOSDPing*>(m));
7138 break;
7139
7140 default:
7141 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7142 m->put();
7143 }
7144
7145 return true;
7146}
7147
7148bool OSD::ms_dispatch(Message *m)
7149{
7150 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7151 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7152 service.got_stop_ack();
7153 m->put();
7154 return true;
7155 }
7156
7157 // lock!
7158
9f95a23c 7159 osd_lock.lock();
7c673cae 7160 if (is_stopping()) {
9f95a23c 7161 osd_lock.unlock();
7c673cae
FG
7162 m->put();
7163 return true;
7164 }
7165
7166 do_waiters();
7167 _dispatch(m);
7168
9f95a23c 7169 osd_lock.unlock();
7c673cae
FG
7170
7171 return true;
7172}
7173
9f95a23c
TL
7174void OSDService::maybe_share_map(
7175 Connection *con,
7176 const OSDMapRef& osdmap,
7177 epoch_t peer_epoch_lb)
7c673cae 7178{
9f95a23c
TL
7179 // NOTE: we assume the caller holds something that keeps the Connection itself
7180 // pinned (e.g., an OpRequest's MessageRef).
7181 auto session = ceph::ref_cast<Session>(con->get_priv());
7182 if (!session) {
7c673cae
FG
7183 return;
7184 }
7c673cae 7185
9f95a23c
TL
7186 // assume the peer has the newer of the op's sent_epoch and what
7187 // we think we sent them.
7c673cae 7188 session->sent_epoch_lock.lock();
9f95a23c
TL
7189 if (peer_epoch_lb > session->last_sent_epoch) {
7190 dout(10) << __func__ << " con " << con
7191 << " " << con->get_peer_addr()
7192 << " map epoch " << session->last_sent_epoch
7193 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7194 session->last_sent_epoch = peer_epoch_lb;
7195 }
7196 epoch_t last_sent_epoch = session->last_sent_epoch;
7c673cae
FG
7197 session->sent_epoch_lock.unlock();
7198
9f95a23c
TL
7199 if (osdmap->get_epoch() <= last_sent_epoch) {
7200 return;
7201 }
11fdf7f2 7202
9f95a23c
TL
7203 send_incremental_map(last_sent_epoch, con, osdmap);
7204 last_sent_epoch = osdmap->get_epoch();
7c673cae
FG
7205
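  // re-take the lock before recording what we sent: another thread may have
  // shared an even newer map with this session in the meantime, so only move
  // last_sent_epoch forward, never backwards.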
7206 session->sent_epoch_lock.lock();
7207 if (session->last_sent_epoch < last_sent_epoch) {
9f95a23c
TL
7208 dout(10) << __func__ << " con " << con
7209 << " " << con->get_peer_addr()
7210 << " map epoch " << session->last_sent_epoch
7211 << " -> " << last_sent_epoch << " (shared)" << dendl;
7c673cae
FG
7212 session->last_sent_epoch = last_sent_epoch;
7213 }
7214 session->sent_epoch_lock.unlock();
7c673cae
FG
7215}
7216
9f95a23c 7217void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7c673cae 7218{
9f95a23c 7219 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7c673cae
FG
7220
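  // drain ops that were queued on this session while waiting for a map; stop
  // at the first op whose min_epoch is still ahead of our osdmap so that
  // per-session ordering is preserved.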
7221 auto i = session->waiting_on_map.begin();
7222 while (i != session->waiting_on_map.end()) {
7223 OpRequestRef op = &(*i);
11fdf7f2 7224 ceph_assert(ms_can_fast_dispatch(op->get_req()));
9f95a23c 7225 auto m = op->get_req<MOSDFastDispatchOp>();
7c673cae
FG
7226 if (m->get_min_epoch() > osdmap->get_epoch()) {
7227 break;
7228 }
7229 session->waiting_on_map.erase(i++);
7230 op->put();
7231
7232 spg_t pgid;
7233 if (m->get_type() == CEPH_MSG_OSD_OP) {
7234 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7235 static_cast<const MOSDOp*>(m)->get_pg());
7236 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7237 continue;
7238 }
7239 } else {
7240 pgid = m->get_spg();
7241 }
11fdf7f2 7242 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
7243 }
7244
7245 if (session->waiting_on_map.empty()) {
7246 clear_session_waiting_on_map(session);
7247 } else {
7248 register_session_waiting_on_map(session);
7249 }
7250}
7251
7252void OSD::ms_fast_dispatch(Message *m)
7253{
20effc67 7254 auto dispatch_span = tracing::osd::tracer.start_trace(__func__);
11fdf7f2 7255 FUNCTRACE(cct);
7c673cae
FG
7256 if (service.is_stopping()) {
7257 m->put();
7258 return;
7259 }
11fdf7f2
TL
7260 // peering event?
7261 switch (m->get_type()) {
7262 case CEPH_MSG_PING:
7263 dout(10) << "ping from " << m->get_source() << dendl;
7264 m->put();
7265 return;
11fdf7f2
TL
7266 case MSG_OSD_FORCE_RECOVERY:
7267 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7268 return;
7269 case MSG_OSD_SCRUB2:
7270 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7271 return;
11fdf7f2
TL
7272 case MSG_OSD_PG_CREATE2:
7273 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
11fdf7f2
TL
7274 case MSG_OSD_PG_NOTIFY:
7275 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7276 case MSG_OSD_PG_INFO:
7277 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7278 case MSG_OSD_PG_REMOVE:
7279 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
11fdf7f2
TL
7280 // these are single-pg messages that handle themselves
7281 case MSG_OSD_PG_LOG:
7282 case MSG_OSD_PG_TRIM:
9f95a23c
TL
7283 case MSG_OSD_PG_NOTIFY2:
7284 case MSG_OSD_PG_QUERY2:
7285 case MSG_OSD_PG_INFO2:
11fdf7f2
TL
7286 case MSG_OSD_BACKFILL_RESERVE:
7287 case MSG_OSD_RECOVERY_RESERVE:
9f95a23c
TL
7288 case MSG_OSD_PG_LEASE:
7289 case MSG_OSD_PG_LEASE_ACK:
11fdf7f2
TL
7290 {
7291 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7292 if (require_osd_peer(pm)) {
7293 enqueue_peering_evt(
7294 pm->get_spg(),
7295 PGPeeringEventRef(pm->get_event()));
7296 }
7297 pm->put();
7298 return;
7299 }
7300 }
7301
7c673cae
FG
7302 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7303 {
7304#ifdef WITH_LTTNG
7305 osd_reqid_t reqid = op->get_reqid();
7306#endif
7307 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7308 reqid.name._num, reqid.tid, reqid.inc);
7309 }
20effc67
TL
7310 op->osd_parent_span = tracing::osd::tracer.add_span("op-request-created", dispatch_span);
7311
7c673cae
FG
7312 if (m->trace)
7313 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7314
11fdf7f2 7315 // note sender epoch, min req's epoch
7c673cae
FG
7316 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7317 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
11fdf7f2 7318 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7c673cae
FG
7319
7320 service.maybe_inject_dispatch_delay();
7321
7322 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7323 m->get_type() != CEPH_MSG_OSD_OP) {
7324 // queue it directly
7325 enqueue_op(
7326 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
11fdf7f2 7327 std::move(op),
7c673cae
FG
7328 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7329 } else {
7330 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7331 // message that didn't have an explicit spg_t); we need to map
7332 // them to an spg_t while preserving delivery order.
11fdf7f2
TL
7333 auto priv = m->get_connection()->get_priv();
7334 if (auto session = static_cast<Session*>(priv.get()); session) {
7335 std::lock_guard l{session->session_dispatch_lock};
7336 op->get();
7337 session->waiting_on_map.push_back(*op);
7338 OSDMapRef nextmap = service.get_nextmap_reserved();
7339 dispatch_session_waiting(session, nextmap);
7340 service.release_map(nextmap);
7c673cae
FG
7341 }
7342 }
f67539c2 7343 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7c673cae
FG
7344}
7345
11fdf7f2 7346int OSD::ms_handle_authentication(Connection *con)
7c673cae 7347{
11fdf7f2 7348 int ret = 0;
9f95a23c 7349 auto s = ceph::ref_cast<Session>(con->get_priv());
11fdf7f2 7350 if (!s) {
9f95a23c
TL
7351 s = ceph::make_ref<Session>(cct, con);
7352 con->set_priv(s);
11fdf7f2
TL
7353 s->entity_name = con->get_peer_entity_name();
7354 dout(10) << __func__ << " new session " << s << " con " << s->con
7355 << " entity " << s->entity_name
7356 << " addr " << con->get_peer_addrs() << dendl;
7357 } else {
7358 dout(10) << __func__ << " existing session " << s << " con " << s->con
7359 << " entity " << s->entity_name
7360 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7361 }
7362
11fdf7f2 7363 AuthCapsInfo &caps_info = con->get_peer_caps_info();
9f95a23c 7364 if (caps_info.allow_all) {
11fdf7f2 7365 s->caps.set_allow_all();
9f95a23c 7366 } else if (caps_info.caps.length() > 0) {
11fdf7f2
TL
7367 bufferlist::const_iterator p = caps_info.caps.cbegin();
7368 string str;
7369 try {
7370 decode(str, p);
7371 }
f67539c2 7372 catch (ceph::buffer::error& e) {
11fdf7f2
TL
7373 dout(10) << __func__ << " session " << s << " " << s->entity_name
7374 << " failed to decode caps string" << dendl;
9f95a23c 7375 ret = -EACCES;
11fdf7f2
TL
7376 }
7377 if (!ret) {
7c673cae 7378 bool success = s->caps.parse(str);
11fdf7f2
TL
7379 if (success) {
7380 dout(10) << __func__ << " session " << s
7381 << " " << s->entity_name
7382 << " has caps " << s->caps << " '" << str << "'" << dendl;
7383 ret = 1;
7384 } else {
7385 dout(10) << __func__ << " session " << s << " " << s->entity_name
7386 << " failed to parse caps '" << str << "'" << dendl;
9f95a23c 7387 ret = -EACCES;
11fdf7f2 7388 }
7c673cae 7389 }
7c673cae 7390 }
11fdf7f2 7391 return ret;
7c673cae
FG
7392}
7393
7394void OSD::do_waiters()
7395{
9f95a23c 7396 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7397
7398 dout(10) << "do_waiters -- start" << dendl;
7399 while (!finished.empty()) {
7400 OpRequestRef next = finished.front();
7401 finished.pop_front();
7402 dispatch_op(next);
7403 }
7404 dout(10) << "do_waiters -- finish" << dendl;
7405}
7406
7407void OSD::dispatch_op(OpRequestRef op)
7408{
7409 switch (op->get_req()->get_type()) {
7410
7411 case MSG_OSD_PG_CREATE:
7412 handle_pg_create(op);
7413 break;
7c673cae
FG
7414 }
7415}
7416
7417void OSD::_dispatch(Message *m)
7418{
9f95a23c 7419 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7420 dout(20) << "_dispatch " << m << " " << *m << dendl;
7421
7422 switch (m->get_type()) {
7c673cae
FG
7423 // -- don't need OSDMap --
7424
7425 // map and replication
7426 case CEPH_MSG_OSD_MAP:
7427 handle_osd_map(static_cast<MOSDMap*>(m));
7428 break;
9f95a23c
TL
7429 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7430 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7431 break;
7c673cae
FG
7432
7433 // osd
7c673cae
FG
7434 case MSG_OSD_SCRUB:
7435 handle_scrub(static_cast<MOSDScrub*>(m));
7436 break;
7437
11fdf7f2
TL
7438 case MSG_COMMAND:
7439 handle_command(static_cast<MCommand*>(m));
7440 return;
c07f9fc5 7441
7c673cae
FG
7442 // -- need OSDMap --
7443
7444 case MSG_OSD_PG_CREATE:
7c673cae
FG
7445 {
7446 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7447 if (m->trace)
7448 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7449 // no map? starting up?
9f95a23c 7450 if (!get_osdmap()) {
7c673cae
FG
7451 dout(7) << "no OSDMap, not booted" << dendl;
7452 logger->inc(l_osd_waiting_for_map);
7453 waiting_for_osdmap.push_back(op);
7454 op->mark_delayed("no osdmap");
7455 break;
7456 }
7457
7458 // need OSDMap
7459 dispatch_op(op);
7460 }
7461 }
7462}
7463
11fdf7f2 7464// remove me post-nautilus
7c673cae
FG
7465void OSD::handle_scrub(MOSDScrub *m)
7466{
7467 dout(10) << "handle_scrub " << *m << dendl;
7468 if (!require_mon_or_mgr_peer(m)) {
7469 m->put();
7470 return;
7471 }
7472 if (m->fsid != monc->get_fsid()) {
11fdf7f2
TL
7473 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7474 << dendl;
7c673cae
FG
7475 m->put();
7476 return;
7477 }
7478
11fdf7f2
TL
7479 vector<spg_t> spgs;
7480 _get_pgids(&spgs);
7481
7482 if (!m->scrub_pgs.empty()) {
7483 vector<spg_t> v;
7484 for (auto pgid : m->scrub_pgs) {
7c673cae 7485 spg_t pcand;
9f95a23c 7486 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
11fdf7f2
TL
7487 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7488 v.push_back(pcand);
7c673cae
FG
7489 }
7490 }
11fdf7f2
TL
7491 spgs.swap(v);
7492 }
7493
7494 for (auto pgid : spgs) {
7495 enqueue_peering_evt(
7496 pgid,
7497 PGPeeringEventRef(
7498 std::make_shared<PGPeeringEvent>(
7499 get_osdmap_epoch(),
7500 get_osdmap_epoch(),
9f95a23c 7501 PeeringState::RequestScrub(m->deep, m->repair))));
7c673cae
FG
7502 }
7503
7504 m->put();
7505}
7506
11fdf7f2
TL
7507void OSD::handle_fast_scrub(MOSDScrub2 *m)
7508{
7509 dout(10) << __func__ << " " << *m << dendl;
7510 if (!require_mon_or_mgr_peer(m)) {
7511 m->put();
7512 return;
7513 }
7514 if (m->fsid != monc->get_fsid()) {
7515 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7516 << dendl;
7517 m->put();
7518 return;
7519 }
7520 for (auto pgid : m->scrub_pgs) {
7521 enqueue_peering_evt(
7522 pgid,
7523 PGPeeringEventRef(
7524 std::make_shared<PGPeeringEvent>(
7525 m->epoch,
7526 m->epoch,
9f95a23c 7527 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7528 }
7529 m->put();
7530}
7531
7c673cae
FG
7532bool OSD::scrub_random_backoff()
7533{
7534 bool coin_flip = (rand() / (double)RAND_MAX >=
7535 cct->_conf->osd_scrub_backoff_ratio);
7536 if (!coin_flip) {
7537 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7538 return true;
7539 }
7540 return false;
7541}
7542
7c673cae 7543
20effc67 7544void OSD::sched_scrub()
f67539c2 7545{
20effc67 7546 auto& scrub_scheduler = service.get_scrub_services();
f67539c2 7547
20effc67
TL
7548 // fail fast if no resources are available
7549 if (!scrub_scheduler.can_inc_scrubs()) {
7550 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7551 return;
f67539c2 7552 }
f67539c2 7553
20effc67
TL
7554 // if there is a PG that is just now trying to reserve scrub replica resources -
7555 // we should wait and not initiate a new scrub
7556 if (scrub_scheduler.is_reserving_now()) {
7557 dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
7558 return;
9f95a23c 7559 }
9f95a23c 7560
20effc67 7561 Scrub::ScrubPreconds env_conditions;
28e407b8 7562
20effc67
TL
7563 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7564 if (!cct->_conf->osd_repair_during_recovery) {
7565 dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
7566 << dendl;
7567 return;
28e407b8 7568 }
20effc67
TL
7569 dout(10) << __func__
7570 << " will only schedule explicitly requested repair due to active recovery"
7571 << dendl;
7572 env_conditions.allow_requested_repair_only = true;
28e407b8
AA
7573 }
7574
20effc67
TL
7575 if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
7576 dout(20) << __func__ << " sched_scrub starts" << dendl;
7577 auto all_jobs = scrub_scheduler.list_registered_jobs();
7578 for (const auto& sj : all_jobs) {
7579 dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
7c673cae
FG
7580 }
7581 }
20effc67
TL
7582
7583 auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
7584 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
7585 << ")" << dendl;
7c673cae
FG
7586}
7587
20effc67
TL
7588Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
7589 bool allow_requested_repair_only)
7c673cae 7590{
20effc67 7591 dout(20) << __func__ << " trying " << pgid << dendl;
7c673cae 7592
20effc67
TL
7593 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7594 // allowed
7c673cae 7595
20effc67
TL
7596 PGRef pg = osd->lookup_lock_pg(pgid);
7597 if (!pg) {
7598 // the PG was dequeued in the short timespan between creating the candidates list
7599 // (collect_ripe_jobs()) and here
7600 dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
7601 return Scrub::schedule_result_t::no_such_pg;
7c673cae
FG
7602 }
7603
20effc67
TL
7604 // This has already started, so go on to the next scrub job
7605 if (pg->is_scrub_queued_or_active()) {
7606 pg->unlock();
7607 dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
7608 return Scrub::schedule_result_t::already_started;
7c673cae 7609 }
20effc67
TL
7610 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7611 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7612 pg->unlock();
7613 dout(10) << __func__ << " skip " << pgid
7614 << " because repairing is not explicitly requested on it" << dendl;
7615 return Scrub::schedule_result_t::preconditions;
b5b8bbf5
FG
7616 }
7617
20effc67
TL
7618 auto scrub_attempt = pg->sched_scrub();
7619 pg->unlock();
7620 return scrub_attempt;
7c673cae
FG
7621}
7622
494da23a
TL
7623void OSD::resched_all_scrubs()
7624{
7625 dout(10) << __func__ << ": start" << dendl;
20effc67
TL
7626 auto all_jobs = service.get_scrub_services().list_registered_jobs();
7627 for (auto& e : all_jobs) {
7628
7629 auto& job = *e;
7630 dout(20) << __func__ << ": examine " << job.pgid << dendl;
7631
7632 PGRef pg = _lookup_lock_pg(job.pgid);
7633 if (!pg)
7634 continue;
7635
7636 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7637 dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
7638 pg->reschedule_scrub();
7639 }
7640 pg->unlock();
494da23a
TL
7641 }
7642 dout(10) << __func__ << ": done" << dendl;
7643}
7644
11fdf7f2
TL
7645MPGStats* OSD::collect_pg_stats()
7646{
20effc67 7647 dout(15) << __func__ << dendl;
11fdf7f2
TL
7648 // This implementation unconditionally sends every is_primary PG's
7649 // stats every time we're called. This has equivalent cost to the
7650 // previous implementation's worst case where all PGs are busy and
7651 // their stats are always enqueued for sending.
9f95a23c 7652 std::shared_lock l{map_lock};
11fdf7f2 7653
11fdf7f2
TL
7654 osd_stat_t cur_stat = service.get_osd_stat();
7655 cur_stat.os_perf_stat = store->get_cur_stats();
7656
9f95a23c 7657 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
11fdf7f2
TL
7658 m->osd_stat = cur_stat;
7659
7660 std::lock_guard lec{min_last_epoch_clean_lock};
9f95a23c 7661 min_last_epoch_clean = get_osdmap_epoch();
11fdf7f2
TL
7662 min_last_epoch_clean_pgs.clear();
7663
7664 std::set<int64_t> pool_set;
7665 vector<PGRef> pgs;
7666 _get_pgs(&pgs);
7667 for (auto& pg : pgs) {
7668 auto pool = pg->pg_id.pgid.pool();
7669 pool_set.emplace((int64_t)pool);
7670 if (!pg->is_primary()) {
7671 continue;
7672 }
20effc67 7673 pg->with_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
11fdf7f2 7674 m->pg_stat[pg->pg_id.pgid] = s;
f67539c2 7675 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
11fdf7f2
TL
7676 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7677 });
7678 }
7679 store_statfs_t st;
81eedcae 7680 bool per_pool_stats = false;
9f95a23c 7681 bool per_pool_omap_stats = false;
11fdf7f2 7682 for (auto p : pool_set) {
9f95a23c 7683 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
11fdf7f2
TL
7684 if (r == -ENOTSUP) {
7685 break;
7686 } else {
7687 assert(r >= 0);
7688 m->pool_stat[p] = st;
81eedcae 7689 per_pool_stats = true;
11fdf7f2
TL
7690 }
7691 }
7c673cae 7692
81eedcae
TL
7693 // indicate whether we are reporting per-pool stats
7694 m->osd_stat.num_osds = 1;
7695 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
9f95a23c 7696 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
81eedcae 7697
11fdf7f2
TL
7698 return m;
7699}
7c673cae 7700
11fdf7f2 7701vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7702{
11fdf7f2
TL
7703 vector<DaemonHealthMetric> metrics;
7704 {
7705 utime_t oldest_secs;
7706 const utime_t now = ceph_clock_now();
7707 auto too_old = now;
7708 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7709 int slow = 0;
7710 TrackedOpRef oldest_op;
20effc67
TL
7711 OSDMapRef osdmap = get_osdmap();
7712 // map of slow op counts by slow op event type, for aggregated logging to
7713 // the cluster log.
7714 map<uint8_t, int> slow_op_types;
7715 // map of slow op counts by pool for reporting a pool name with highest
7716 // slow ops.
7717 map<uint64_t, int> slow_op_pools;
7718 bool log_aggregated_slow_op =
7719 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
11fdf7f2
TL
7720 auto count_slow_ops = [&](TrackedOp& op) {
7721 if (op.get_initiated() < too_old) {
9f95a23c
TL
7722 stringstream ss;
7723 ss << "slow request " << op.get_desc()
7724 << " initiated "
7725 << op.get_initiated()
7726 << " currently "
7727 << op.state_string();
7728 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
20effc67
TL
7729 if (log_aggregated_slow_op) {
7730 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7731 uint8_t op_type = req->state_flag();
7732 auto m = req->get_req<MOSDFastDispatchOp>();
7733 uint64_t poolid = m->get_spg().pgid.m_pool;
7734 slow_op_types[op_type]++;
7735 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7736 slow_op_pools[poolid]++;
7737 }
7738 }
7739 } else {
7740 clog->warn() << ss.str();
7741 }
11fdf7f2
TL
7742 slow++;
7743 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7744 oldest_op = &op;
7745 }
7746 return true;
7747 } else {
7748 return false;
7749 }
7750 };
7751 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7752 if (slow) {
7753 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7754 << oldest_op->get_desc() << dendl;
20effc67
TL
7755 if (log_aggregated_slow_op &&
7756 slow_op_types.size() > 0) {
7757 stringstream ss;
7758 ss << slow << " slow requests (by type [ ";
7759 for (const auto& [op_type, count] : slow_op_types) {
7760 ss << "'" << OpRequest::get_state_string(op_type)
7761 << "' : " << count
7762 << " ";
7763 }
7764 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7765 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7766 return p1.second < p2.second;
7767 });
7768 if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7769 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7770 ss << "] most affected pool [ '"
7771 << pool_name
7772 << "' : "
7773 << slow_pool_it->second
7774 << " ])";
7775 } else {
7776 ss << "])";
7777 }
7778 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7779 clog->warn() << ss.str();
7780 }
11fdf7f2
TL
7781 }
7782 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7783 } else {
7784 // no news is not good news.
7785 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7786 }
7787 }
7788 {
7789 std::lock_guard l(pending_creates_lock);
7790 auto n_primaries = pending_creates_from_mon;
7791 for (const auto& create : pending_creates_from_osd) {
7792 if (create.second) {
7793 n_primaries++;
7794 }
b32b8144 7795 }
11fdf7f2 7796 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7797 }
b32b8144
FG
7798 return metrics;
7799}
7800
7c673cae
FG
7801// =====================================================
7802// MAP
7803
7804void OSD::wait_for_new_map(OpRequestRef op)
7805{
7806 // ask?
7807 if (waiting_for_osdmap.empty()) {
9f95a23c 7808 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
7809 }
7810
7811 logger->inc(l_osd_waiting_for_map);
7812 waiting_for_osdmap.push_back(op);
7813 op->mark_delayed("wait for new map");
7814}
7815
7816
7817/** update_map
7818 * assimilate new OSDMap(s). scan pgs, etc.
7819 */
7820
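// a peer went down in a newly published map: close our cluster connections to
// it and drop any heartbeat state and pending failure reports we hold for it.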
7821void OSD::note_down_osd(int peer)
7822{
9f95a23c
TL
7823 ceph_assert(ceph_mutex_is_locked(osd_lock));
7824 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7825
9f95a23c 7826 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7827 failure_queue.erase(peer);
7828 failure_pending.erase(peer);
7829 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7830 if (p != heartbeat_peers.end()) {
9f95a23c 7831 p->second.clear_mark_down();
7c673cae
FG
7832 heartbeat_peers.erase(p);
7833 }
7c673cae
FG
7834}
7835
7836void OSD::note_up_osd(int peer)
7837{
7c673cae
FG
7838 heartbeat_set_peers_need_update();
7839}
7840
7841struct C_OnMapCommit : public Context {
7842 OSD *osd;
7843 epoch_t first, last;
7844 MOSDMap *msg;
7845 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7846 : osd(o), first(f), last(l), msg(m) {}
7847 void finish(int r) override {
7848 osd->_committed_osd_maps(first, last, msg);
7849 msg->put();
7850 }
7851};
7852
7c673cae
FG
7853void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7854{
11fdf7f2 7855 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7856 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7857 return;
7858
11fdf7f2 7859 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7860
7c673cae
FG
7861 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7862 force_request) {
7863 monc->renew_subs();
7864 }
7865}
7866
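// remove stored full and incremental osdmaps older than both 'oldest' and the
// map cache's lower bound, committing the removals in batches so a single
// transaction stays roughly within osd_target_transaction_size entries.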
7867void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7868{
7869 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7870 if (min <= superblock.oldest_map)
7871 return;
7872
7873 int num = 0;
7874 ObjectStore::Transaction t;
7875 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7876 dout(20) << " removing old osdmap epoch " << e << dendl;
7877 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7878 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7879 superblock.oldest_map = e + 1;
7880 num++;
7881 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7882 service.publish_superblock(superblock);
7883 write_superblock(t);
11fdf7f2
TL
7884 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7885 ceph_assert(tr == 0);
7c673cae
FG
7886 num = 0;
7887 if (!skip_maps) {
7888 // skip_maps leaves us with a range of old maps if we fail to remove all
7889 // of them before moving superblock.oldest_map forward to the first map
7890 // in the incoming MOSDMap msg. so we should continue removing them in
7891 // this case, even though it may mean a huge series of delete transactions
7892 // all at once.
7893 break;
7894 }
7895 }
7896 }
7897 if (num > 0) {
7898 service.publish_superblock(superblock);
7899 write_superblock(t);
11fdf7f2
TL
7900 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7901 ceph_assert(tr == 0);
7c673cae
FG
7902 }
7903 // we should not remove the cached maps
11fdf7f2 7904 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7c673cae
FG
7905}
7906
7907void OSD::handle_osd_map(MOSDMap *m)
7908{
11fdf7f2
TL
7909 // wait for pgs to catch up
7910 {
7911 // we extend the map cache pins to accommodate pgs slow to consume maps
7912 // for some period, until we hit the max_lag_factor bound, at which point
7913 // we block here to stop ingesting more maps than they are able to keep
7914 // up with.
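  // (for example, with osd_map_cache_size = 50 and a lag factor of 2.0,
  // max_lag would be 100 epochs; the actual defaults may differ, the numbers
  // only illustrate the arithmetic below)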
7915 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7916 m_osd_pg_epoch_max_lag_factor;
7917 ceph_assert(max_lag > 0);
7918 epoch_t osd_min = 0;
7919 for (auto shard : shards) {
7920 epoch_t min = shard->get_min_pg_epoch();
7921 if (osd_min == 0 || min < osd_min) {
7922 osd_min = min;
7923 }
7924 }
9f95a23c 7925 epoch_t osdmap_epoch = get_osdmap_epoch();
11fdf7f2 7926 if (osd_min > 0 &&
9f95a23c
TL
7927 osdmap_epoch > max_lag &&
7928 osdmap_epoch - max_lag > osd_min) {
7929 epoch_t need = osdmap_epoch - max_lag;
11fdf7f2
TL
7930 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7931 << " max_lag " << max_lag << ")" << dendl;
7932 for (auto shard : shards) {
7933 epoch_t min = shard->get_min_pg_epoch();
7934 if (need > min) {
7935 dout(10) << __func__ << " waiting for pgs to consume " << need
7936 << " (shard " << shard->shard_id << " min " << min
7937 << ", map cache is " << cct->_conf->osd_map_cache_size
7938 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7939 << ")" << dendl;
7940 unlock_guard unlock{osd_lock};
7941 shard->wait_min_pg_epoch(need);
7942 }
7943 }
7944 }
7945 }
7946
9f95a23c 7947 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
7948 map<epoch_t,OSDMapRef> added_maps;
7949 map<epoch_t,bufferlist> added_maps_bl;
7c673cae
FG
7950 if (m->fsid != monc->get_fsid()) {
7951 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7952 << monc->get_fsid() << dendl;
7953 m->put();
7954 return;
7955 }
7956 if (is_initializing()) {
7957 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7958 m->put();
7959 return;
7960 }
7961
9f95a23c
TL
7962 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7963 if (session && !(session->entity_name.is_mon() ||
7c673cae
FG
7964 session->entity_name.is_osd())) {
7965 //not enough perms!
7966 dout(10) << "got osd map from Session " << session
7967 << " which we can't take maps from (not a mon or osd)" << dendl;
7968 m->put();
7c673cae
FG
7969 return;
7970 }
7c673cae
FG
7971
7972 // share with the objecter
7973 if (!is_preboot())
7974 service.objecter->handle_osd_map(m);
7975
7976 epoch_t first = m->get_first();
7977 epoch_t last = m->get_last();
7978 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7979 << superblock.newest_map
7980 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7981 << dendl;
7982
7983 logger->inc(l_osd_map);
7984 logger->inc(l_osd_mape, last - first + 1);
7985 if (first <= superblock.newest_map)
7986 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7987 if (service.max_oldest_map < m->oldest_map) {
7988 service.max_oldest_map = m->oldest_map;
11fdf7f2 7989 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
7990 }
7991
7992 // make sure there is something new, here, before we bother flushing
7993 // the queues and such
7994 if (last <= superblock.newest_map) {
7995 dout(10) << " no new maps here, dropping" << dendl;
7996 m->put();
7997 return;
7998 }
7999
8000 // missing some?
8001 bool skip_maps = false;
8002 if (first > superblock.newest_map + 1) {
8003 dout(10) << "handle_osd_map message skips epochs "
8004 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8005 if (m->oldest_map <= superblock.newest_map + 1) {
8006 osdmap_subscribe(superblock.newest_map + 1, false);
8007 m->put();
8008 return;
8009 }
8010 // always try to get the full range of maps--as many as we can. this
8011 // 1- is good to have
8012 // 2- is at present the only way to ensure that we get a *full* map as
8013 // the first map!
8014 if (m->oldest_map < first) {
8015 osdmap_subscribe(m->oldest_map - 1, true);
8016 m->put();
8017 return;
8018 }
8019 skip_maps = true;
8020 }
8021
8022 ObjectStore::Transaction t;
8023 uint64_t txn_size = 0;
8024
9f95a23c
TL
8025 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8026
7c673cae 8027 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 8028 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
8029 for (epoch_t e = start; e <= last; e++) {
8030 if (txn_size >= t.get_num_bytes()) {
8031 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 8032 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
8033 }
8034 txn_size = t.get_num_bytes();
8035 map<epoch_t,bufferlist>::iterator p;
8036 p = m->maps.find(e);
8037 if (p != m->maps.end()) {
8038 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8039 OSDMap *o = new OSDMap;
8040 bufferlist& bl = p->second;
8041
8042 o->decode(bl);
8043
9f95a23c
TL
8044 purged_snaps[e] = o->get_new_purged_snaps();
8045
7c673cae
FG
8046 ghobject_t fulloid = get_osdmap_pobject_name(e);
8047 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
8048 added_maps[e] = add_map(o);
8049 added_maps_bl[e] = bl;
7c673cae
FG
8050 got_full_map(e);
8051 continue;
8052 }
8053
8054 p = m->incremental_maps.find(e);
8055 if (p != m->incremental_maps.end()) {
8056 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8057 bufferlist& bl = p->second;
8058 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8059 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
8060
8061 OSDMap *o = new OSDMap;
8062 if (e > 1) {
8063 bufferlist obl;
8064 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
8065 if (!got) {
8066 auto p = added_maps_bl.find(e - 1);
8067 ceph_assert(p != added_maps_bl.end());
8068 obl = p->second;
8069 }
7c673cae
FG
8070 o->decode(obl);
8071 }
8072
8073 OSDMap::Incremental inc;
11fdf7f2 8074 auto p = bl.cbegin();
7c673cae 8075 inc.decode(p);
494da23a 8076
7c673cae 8077 if (o->apply_incremental(inc) < 0) {
9f95a23c 8078 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 8079 ceph_abort_msg("bad fsid");
7c673cae
FG
8080 }
8081
8082 bufferlist fbl;
8083 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8084
8085 bool injected_failure = false;
8086 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8087 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8088 derr << __func__ << " injecting map crc failure" << dendl;
8089 injected_failure = true;
8090 }
8091
8092 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8093 dout(2) << "got incremental " << e
8094 << " but failed to encode full with correct crc; requesting"
8095 << dendl;
8096 clog->warn() << "failed to encode map e" << e << " with expected crc";
8097 dout(20) << "my encoded map was:\n";
8098 fbl.hexdump(*_dout);
8099 *_dout << dendl;
8100 delete o;
8101 request_full_map(e, last);
8102 last = e - 1;
f6b5b4d7
TL
8103
8104 // don't continue committing if we failed to enc the first inc map
8105 if (last < start) {
8106 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8107 m->put();
8108 return;
8109 }
7c673cae
FG
8110 break;
8111 }
8112 got_full_map(e);
9f95a23c 8113 purged_snaps[e] = o->get_new_purged_snaps();
7c673cae
FG
8114
8115 ghobject_t fulloid = get_osdmap_pobject_name(e);
8116 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
8117 added_maps[e] = add_map(o);
8118 added_maps_bl[e] = fbl;
7c673cae
FG
8119 continue;
8120 }
8121
11fdf7f2 8122 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
8123 }
8124
8125 // even if this map isn't from a mon, we may have satisfied our subscription
8126 monc->sub_got("osdmap", last);
8127
8128 if (!m->maps.empty() && requested_full_first) {
8129 dout(10) << __func__ << " still missing full maps " << requested_full_first
8130 << ".." << requested_full_last << dendl;
8131 rerequest_full_maps();
8132 }
8133
7c673cae
FG
8134 if (superblock.oldest_map) {
8135 // make sure we at least keep pace with incoming maps
8136 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 8137 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
8138 }
8139
8140 if (!superblock.oldest_map || skip_maps)
8141 superblock.oldest_map = first;
8142 superblock.newest_map = last;
8143 superblock.current_epoch = last;
8144
8145 // note in the superblock that we were clean thru the prior epoch
8146 epoch_t boot_epoch = service.get_boot_epoch();
8147 if (boot_epoch && boot_epoch >= superblock.mounted) {
8148 superblock.mounted = boot_epoch;
8149 superblock.clean_thru = last;
8150 }
8151
11fdf7f2
TL
8152 // check for pg_num changes and deleted pools
8153 OSDMapRef lastmap;
8154 for (auto& i : added_maps) {
8155 if (!lastmap) {
8156 if (!(lastmap = service.try_get_map(i.first - 1))) {
8157 dout(10) << __func__ << " can't get previous map " << i.first - 1
8158 << " probably first start of this osd" << dendl;
8159 continue;
8160 }
8161 }
8162 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8163 for (auto& j : lastmap->get_pools()) {
8164 if (!i.second->have_pg_pool(j.first)) {
8165 pg_num_history.log_pool_delete(i.first, j.first);
8166 dout(10) << __func__ << " recording final pg_pool_t for pool "
8167 << j.first << dendl;
8169 // this information is needed by _make_pg() if we have to restart before
8169 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8170 ghobject_t obj = make_final_pool_info_oid(j.first);
8171 bufferlist bl;
8172 encode(j.second, bl, CEPH_FEATURES_ALL);
8173 string name = lastmap->get_pool_name(j.first);
8174 encode(name, bl);
8175 map<string,string> profile;
8176 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8177 profile = lastmap->get_erasure_code_profile(
8178 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8179 }
8180 encode(profile, bl);
8181 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
11fdf7f2
TL
8182 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8183 new_pg_num != j.second.get_pg_num()) {
8184 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8185 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8186 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8187 }
8188 }
8189 for (auto& j : i.second->get_pools()) {
8190 if (!lastmap->have_pg_pool(j.first)) {
8191 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8192 << j.second.get_pg_num() << dendl;
8193 pg_num_history.log_pg_num_change(i.first, j.first,
8194 j.second.get_pg_num());
8195 }
8196 }
8197 lastmap = i.second;
8198 }
8199 pg_num_history.epoch = last;
8200 {
8201 bufferlist bl;
8202 ::encode(pg_num_history, bl);
8203 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8204 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8205 }
8206
9f95a23c
TL
8207 // record new purged_snaps
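  // (only when the incoming range is contiguous with what we already
  // recorded; a gap is presumably left for _get_purged_snaps() to backfill
  // during the next preboot)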
8208 if (superblock.purged_snaps_last == start - 1) {
20effc67 8209 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
9f95a23c
TL
8210 make_purged_snaps_oid(), &t,
8211 purged_snaps);
8212 superblock.purged_snaps_last = last;
8213 } else {
8214 dout(10) << __func__ << " superblock purged_snaps_last is "
8215 << superblock.purged_snaps_last
8216 << ", not recording new purged_snaps" << dendl;
8217 }
8218
7c673cae
FG
8219 // superblock and commit
8220 write_superblock(t);
11fdf7f2 8221 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8222 store->queue_transaction(
11fdf7f2
TL
8223 service.meta_ch,
8224 std::move(t));
7c673cae
FG
8225 service.publish_superblock(superblock);
8226}
8227
8228void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8229{
8230 dout(10) << __func__ << " " << first << ".." << last << dendl;
8231 if (is_stopping()) {
8232 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8233 return;
8234 }
11fdf7f2 8235 std::lock_guard l(osd_lock);
31f18b77
FG
8236 if (is_stopping()) {
8237 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8238 return;
8239 }
9f95a23c 8240 map_lock.lock();
7c673cae 8241
f6b5b4d7
TL
8242 ceph_assert(first <= last);
8243
7c673cae
FG
8244 bool do_shutdown = false;
8245 bool do_restart = false;
8246 bool network_error = false;
f6b5b4d7 8247 OSDMapRef osdmap = get_osdmap();
7c673cae
FG
8248
8249 // advance through the new maps
8250 for (epoch_t cur = first; cur <= last; cur++) {
8251 dout(10) << " advance to epoch " << cur
8252 << " (<= last " << last
8253 << " <= newest_map " << superblock.newest_map
8254 << ")" << dendl;
8255
8256 OSDMapRef newmap = get_map(cur);
11fdf7f2 8257 ceph_assert(newmap); // we just cached it above!
7c673cae 8258
f67539c2 8259 // start blocklisting messages sent to peers that go down.
7c673cae
FG
8260 service.pre_publish_map(newmap);
8261
8262 // kill connections to newly down osds
8263 bool waited_for_reservations = false;
8264 set<int> old;
9f95a23c 8265 osdmap = get_osdmap();
7c673cae
FG
8266 osdmap->get_all_osds(old);
8267 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8268 if (*p != whoami &&
8269 osdmap->is_up(*p) && // in old map
8270 newmap->is_down(*p)) { // but not the new one
8271 if (!waited_for_reservations) {
8272 service.await_reserved_maps();
8273 waited_for_reservations = true;
8274 }
8275 note_down_osd(*p);
8276 } else if (*p != whoami &&
8277 osdmap->is_down(*p) &&
8278 newmap->is_up(*p)) {
8279 note_up_osd(*p);
8280 }
8281 }
8282
81eedcae 8283 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
7c673cae
FG
8284 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8285 << dendl;
8286 if (is_booting()) {
8287 // this captures the case where we sent the boot message while
8288 // NOUP was being set on the mon and our boot request was
8289 // dropped, and then later it is cleared. it imperfectly
8290 // handles the case where our original boot message was not
8291 // dropped and we restart even though we might have booted, but
8292 // that is harmless (boot will just take slightly longer).
8293 do_restart = true;
8294 }
8295 }
8296
9f95a23c
TL
8297 osdmap = std::move(newmap);
8298 set_osdmap(osdmap);
7c673cae
FG
8299 epoch_t up_epoch;
8300 epoch_t boot_epoch;
8301 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8302 if (!up_epoch &&
8303 osdmap->is_up(whoami) &&
11fdf7f2 8304 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
7c673cae
FG
8305 up_epoch = osdmap->get_epoch();
8306 dout(10) << "up_epoch is " << up_epoch << dendl;
8307 if (!boot_epoch) {
8308 boot_epoch = osdmap->get_epoch();
8309 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8310 }
8311 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8312 }
8313 }
8314
7c673cae
FG
8315 epoch_t _bind_epoch = service.get_bind_epoch();
8316 if (osdmap->is_up(whoami) &&
11fdf7f2
TL
8317 osdmap->get_addrs(whoami).legacy_equals(
8318 client_messenger->get_myaddrs()) &&
7c673cae
FG
8319 _bind_epoch < osdmap->get_up_from(whoami)) {
8320
8321 if (is_booting()) {
8322 dout(1) << "state: booting -> active" << dendl;
8323 set_state(STATE_ACTIVE);
11fdf7f2 8324 do_restart = false;
7c673cae
FG
8325
8326 // set incarnation so that osd_reqid_t's we generate for our
8327 // objecter requests are unique across restarts.
8328 service.objecter->set_client_incarnation(osdmap->get_epoch());
11fdf7f2 8329 cancel_pending_failures();
7c673cae
FG
8330 }
8331 }
8332
8333 if (osdmap->get_epoch() > 0 &&
8334 is_active()) {
8335 if (!osdmap->exists(whoami)) {
9f95a23c 8336 derr << "map says i do not exist. shutting down." << dendl;
7c673cae
FG
8337 do_shutdown = true; // don't call shutdown() while we have
8338 // everything paused
9f95a23c
TL
8339 } else if (osdmap->is_stop(whoami)) {
8340 derr << "map says i am stopped by admin. shutting down." << dendl;
8341 do_shutdown = true;
7c673cae 8342 } else if (!osdmap->is_up(whoami) ||
11fdf7f2
TL
8343 !osdmap->get_addrs(whoami).legacy_equals(
8344 client_messenger->get_myaddrs()) ||
8345 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8346 cluster_messenger->get_myaddrs()) ||
8347 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8348 hb_back_server_messenger->get_myaddrs()) ||
8349 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8350 hb_front_server_messenger->get_myaddrs())) {
7c673cae
FG
8351 if (!osdmap->is_up(whoami)) {
8352 if (service.is_preparing_to_stop() || service.is_stopping()) {
8353 service.got_stop_ack();
8354 } else {
c07f9fc5
FG
8355 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8356 "but it is still running";
8357 clog->debug() << "map e" << osdmap->get_epoch()
8358 << " wrongly marked me down at e"
8359 << osdmap->get_down_at(whoami);
7c673cae 8360 }
9f95a23c
TL
8361 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8362 // note that this is best-effort...
8363 monc->send_mon_message(
8364 new MOSDMarkMeDead(
8365 monc->get_fsid(),
8366 whoami,
8367 osdmap->get_epoch()));
8368 }
11fdf7f2
TL
8369 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8370 client_messenger->get_myaddrs())) {
7c673cae 8371 clog->error() << "map e" << osdmap->get_epoch()
11fdf7f2
TL
8372 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8373 << " != my " << client_messenger->get_myaddrs() << ")";
8374 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8375 cluster_messenger->get_myaddrs())) {
7c673cae
FG
8376 clog->error() << "map e" << osdmap->get_epoch()
8377 << " had wrong cluster addr ("
11fdf7f2
TL
8378 << osdmap->get_cluster_addrs(whoami)
8379 << " != my " << cluster_messenger->get_myaddrs() << ")";
8380 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8381 hb_back_server_messenger->get_myaddrs())) {
7c673cae 8382 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8383 << " had wrong heartbeat back addr ("
11fdf7f2
TL
8384 << osdmap->get_hb_back_addrs(whoami)
8385 << " != my " << hb_back_server_messenger->get_myaddrs()
7c673cae 8386 << ")";
11fdf7f2
TL
8387 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8388 hb_front_server_messenger->get_myaddrs())) {
7c673cae 8389 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8390 << " had wrong heartbeat front addr ("
11fdf7f2
TL
8391 << osdmap->get_hb_front_addrs(whoami)
8392 << " != my " << hb_front_server_messenger->get_myaddrs()
7c673cae
FG
8393 << ")";
8394 }
8395
8396 if (!service.is_stopping()) {
8397 epoch_t up_epoch = 0;
8398 epoch_t bind_epoch = osdmap->get_epoch();
8399 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8400 do_restart = true;
8401
8402 //add markdown log
8403 utime_t now = ceph_clock_now();
8404 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8405 osd_markdown_log.push_back(now);
7c673cae 8406 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
9f95a23c
TL
8407 derr << __func__ << " marked down "
8408 << osd_markdown_log.size()
8409 << " > osd_max_markdown_count "
8410 << cct->_conf->osd_max_markdown_count
8411 << " in last " << grace << " seconds, shutting down"
8412 << dendl;
7c673cae
FG
8413 do_restart = false;
8414 do_shutdown = true;
8415 }
8416
8417 start_waiting_for_healthy();
8418
8419 set<int> avoid_ports;
8420#if defined(__FreeBSD__)
8421 // prevent FreeBSD from grabbing the client_messenger port during
f67539c2 8422 // rebinding; otherwise the cluster_messenger might also connect
7c673cae 8423 // to the same port
11fdf7f2 8424 client_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae 8425#endif
11fdf7f2 8426 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae
FG
8427
8428 int r = cluster_messenger->rebind(avoid_ports);
8429 if (r != 0) {
8430 do_shutdown = true; // FIXME: do_restart?
8431 network_error = true;
9f95a23c
TL
8432 derr << __func__ << " marked down:"
8433 << " rebind cluster_messenger failed" << dendl;
7c673cae
FG
8434 }
8435
9f95a23c
TL
8436 hb_back_server_messenger->mark_down_all();
8437 hb_front_server_messenger->mark_down_all();
7c673cae
FG
8438 hb_front_client_messenger->mark_down_all();
8439 hb_back_client_messenger->mark_down_all();
8440
494da23a 8441 reset_heartbeat_peers(true);
7c673cae
FG
8442 }
8443 }
20effc67
TL
8444 } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
8445 derr << "map says i am stopped by admin. shutting down." << dendl;
8446 do_shutdown = true;
7c673cae
FG
8447 }
8448
9f95a23c 8449 map_lock.unlock();
7c673cae 8450
11fdf7f2 8451 check_osdmap_features();
7c673cae
FG
8452
8453 // yay!
8454 consume_map();
8455
8456 if (is_active() || is_waiting_for_healthy())
8457 maybe_update_heartbeat_peers();
8458
11fdf7f2 8459 if (is_active()) {
7c673cae
FG
8460 activate_map();
8461 }
8462
31f18b77 8463 if (do_shutdown) {
7c673cae 8464 if (network_error) {
11fdf7f2 8465 cancel_pending_failures();
7c673cae
FG
8466 }
8467 // trigger shutdown in a different thread
8468 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8469 queue_async_signal(SIGINT);
8470 }
31f18b77
FG
8471 else if (m->newest_map && m->newest_map > last) {
8472 dout(10) << " msg says newest map is " << m->newest_map
8473 << ", requesting more" << dendl;
8474 osdmap_subscribe(osdmap->get_epoch()+1, false);
8475 }
7c673cae
FG
8476 else if (is_preboot()) {
8477 if (m->get_source().is_mon())
8478 _preboot(m->oldest_map, m->newest_map);
8479 else
8480 start_boot();
8481 }
8482 else if (do_restart)
8483 start_boot();
8484
8485}
8486
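// Bring the messenger feature requirements, the on-disk compat set, and
// the persisted require_osd_release value in line with the current OSDMap.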
11fdf7f2 8487void OSD::check_osdmap_features()
7c673cae
FG
8488{
8489 // adjust required feature bits?
8490
8491 // we have to be a bit careful here, because we are accessing the
8492 // Policy structures without taking any lock. in particular, only
8493 // modify integer values that can safely be read by a racing CPU.
8494 // since we are only accessing existing Policy structures at their
8495 // current memory location, and setting or clearing bits in integer
8496 // fields, and we are the only writer, this is not a problem.
8497
9f95a23c 8498 const auto osdmap = get_osdmap();
7c673cae
FG
8499 {
8500 Messenger::Policy p = client_messenger->get_default_policy();
8501 uint64_t mask;
8502 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8503 if ((p.features_required & mask) != features) {
8504 dout(0) << "crush map has features " << features
8505 << ", adjusting msgr requires for clients" << dendl;
8506 p.features_required = (p.features_required & ~mask) | features;
8507 client_messenger->set_default_policy(p);
8508 }
8509 }
8510 {
8511 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8512 uint64_t mask;
8513 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8514 if ((p.features_required & mask) != features) {
8515 dout(0) << "crush map has features " << features
8516 << " was " << p.features_required
8517 << ", adjusting msgr requires for mons" << dendl;
8518 p.features_required = (p.features_required & ~mask) | features;
8519 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8520 }
8521 }
8522 {
8523 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8524 uint64_t mask;
8525 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8526
8527 if ((p.features_required & mask) != features) {
8528 dout(0) << "crush map has features " << features
8529 << ", adjusting msgr requires for osds" << dendl;
8530 p.features_required = (p.features_required & ~mask) | features;
8531 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8532 }
8533
11fdf7f2 8534 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7c673cae
FG
8535 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8536 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8537 ObjectStore::Transaction t;
8538 write_superblock(t);
11fdf7f2
TL
8539 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8540 ceph_assert(err == 0);
7c673cae
FG
8541 }
8542 }
11fdf7f2 8543
9f95a23c
TL
8544 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8545 hb_front_server_messenger->set_require_authorizer(false);
8546 hb_back_server_messenger->set_require_authorizer(false);
8547 } else {
8548 hb_front_server_messenger->set_require_authorizer(true);
8549 hb_back_server_messenger->set_require_authorizer(true);
11fdf7f2
TL
8550 }
8551
8552 if (osdmap->require_osd_release != last_require_osd_release) {
8553 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8554 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8555 store->write_meta("require_osd_release",
8556 stringify((int)osdmap->require_osd_release));
8557 last_require_osd_release = osdmap->require_osd_release;
8558 }
7c673cae
FG
8559}
8560
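// C_FinishSplits fires once a split transaction has been applied; it hands
// the new child PGs to _finish_splits(), which initializes each child and
// registers it with its OSD shard so queued work can reach it.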
11fdf7f2
TL
8561struct C_FinishSplits : public Context {
8562 OSD *osd;
8563 set<PGRef> pgs;
8564 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8565 : osd(osd), pgs(in) {}
8566 void finish(int r) override {
8567 osd->_finish_splits(pgs);
8568 }
8569};
8570
8571void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8572{
11fdf7f2
TL
8573 dout(10) << __func__ << " " << pgs << dendl;
8574 if (is_stopping())
8575 return;
11fdf7f2
TL
8576 for (set<PGRef>::iterator i = pgs.begin();
8577 i != pgs.end();
8578 ++i) {
8579 PG *pg = i->get();
7c673cae 8580
20effc67 8581 PeeringCtx rctx;
11fdf7f2
TL
8582 pg->lock();
8583 dout(10) << __func__ << " " << *pg << dendl;
8584 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8585 pg->handle_initialize(rctx);
11fdf7f2 8586 pg->queue_null(e, e);
9f95a23c 8587 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8588 pg->unlock();
7c673cae 8589
11fdf7f2
TL
8590 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8591 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8592 }
11fdf7f2
TL
8593};
8594
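// Stash a merge source PG until its merge target is ready to consume it.
// Returns true once all 'need' sources for the target have been collected.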
8595bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8596 unsigned need)
8597{
8598 std::lock_guard l(merge_lock);
8599 auto& p = merge_waiters[nextmap->get_epoch()][target];
8600 p[src->pg_id] = src;
8601 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8602 << " for " << target << ", have " << p.size() << "/" << need
8603 << dendl;
8604 return p.size() == need;
8605}
8606
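// Advance a PG through OSDMaps one epoch at a time up to osd_epoch,
// handling any splits and merges encountered along the way. Returns false
// if the PG was consumed as a merge source or is still waiting for its
// merge sources; returns true once the PG has caught up.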
8607bool OSD::advance_pg(
8608 epoch_t osd_epoch,
8609 PG *pg,
8610 ThreadPool::TPHandle &handle,
9f95a23c 8611 PeeringCtx &rctx)
11fdf7f2
TL
8612{
8613 if (osd_epoch <= pg->get_osdmap_epoch()) {
8614 return true;
8615 }
8616 ceph_assert(pg->is_locked());
8617 OSDMapRef lastmap = pg->get_osdmap();
11fdf7f2
TL
8618 set<PGRef> new_pgs; // any split children
8619 bool ret = true;
8620
8621 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8622 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8623 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8624 next_epoch <= osd_epoch;
7c673cae
FG
8625 ++next_epoch) {
8626 OSDMapRef nextmap = service.try_get_map(next_epoch);
8627 if (!nextmap) {
8628 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7c673cae
FG
8629 continue;
8630 }
8631
11fdf7f2
TL
8632 unsigned new_pg_num =
8633 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8634 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8635 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8636 // check for merge
8637 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8638 spg_t parent;
8639 if (pg->pg_id.is_merge_source(
8640 old_pg_num,
8641 new_pg_num,
8642 &parent)) {
8643 // we are merge source
8644 PGRef spg = pg; // carry a ref
8645 dout(1) << __func__ << " " << pg->pg_id
8646 << " is merge source, target is " << parent
8647 << dendl;
8648 pg->write_if_dirty(rctx);
9f95a23c
TL
8649 if (!new_pgs.empty()) {
8650 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8651 new_pgs));
8652 new_pgs.clear();
8653 }
8654 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2 8655 pg->ch->flush();
eafe8130
TL
8656 // release backoffs explicitly, since the on_shutdown path
8657 // aggressively tears down backoff state.
8658 if (pg->is_primary()) {
8659 pg->release_pg_backoffs();
8660 }
11fdf7f2
TL
8661 pg->on_shutdown();
8662 OSDShard *sdata = pg->osd_shard;
8663 {
8664 std::lock_guard l(sdata->shard_lock);
8665 if (pg->pg_slot) {
8666 sdata->_detach_pg(pg->pg_slot);
8667 // update pg count now since we might not get an osdmap
8668 // any time soon.
8669 if (pg->is_primary())
8670 logger->dec(l_osd_pg_primary);
9f95a23c
TL
8671 else if (pg->is_nonprimary())
8672 logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
8673 else
8674 logger->dec(l_osd_pg_stray);
8675 }
8676 }
8677 pg->unlock();
8678
8679 set<spg_t> children;
8680 parent.is_split(new_pg_num, old_pg_num, &children);
8681 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8682 enqueue_peering_evt(
8683 parent,
8684 PGPeeringEventRef(
8685 std::make_shared<PGPeeringEvent>(
8686 nextmap->get_epoch(),
8687 nextmap->get_epoch(),
8688 NullEvt())));
8689 }
8690 ret = false;
8691 goto out;
8692 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8693 // we are merge target
8694 set<spg_t> children;
8695 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8696 dout(20) << __func__ << " " << pg->pg_id
8697 << " is merge target, sources are " << children
8698 << dendl;
8699 map<spg_t,PGRef> sources;
8700 {
8701 std::lock_guard l(merge_lock);
8702 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8703 unsigned need = children.size();
8704 dout(20) << __func__ << " have " << s.size() << "/"
8705 << need << dendl;
8706 if (s.size() == need) {
8707 sources.swap(s);
8708 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8709 if (merge_waiters[nextmap->get_epoch()].empty()) {
8710 merge_waiters.erase(nextmap->get_epoch());
8711 }
8712 }
8713 }
8714 if (!sources.empty()) {
8715 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8716 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8717 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8718 pg->merge_from(
8719 sources, rctx, split_bits,
8720 nextmap->get_pg_pool(
8721 pg->pg_id.pool())->last_pg_merge_meta);
8722 pg->pg_slot->waiting_for_merge_epoch = 0;
8723 } else {
8724 dout(20) << __func__ << " not ready to merge yet" << dendl;
8725 pg->write_if_dirty(rctx);
9f95a23c
TL
8726 if (!new_pgs.empty()) {
8727 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8728 new_pgs));
8729 new_pgs.clear();
8730 }
8731 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2
TL
8732 pg->unlock();
8733 // kick source(s) to get them ready
8734 for (auto& i : children) {
8735 dout(20) << __func__ << " kicking source " << i << dendl;
8736 enqueue_peering_evt(
8737 i,
8738 PGPeeringEventRef(
8739 std::make_shared<PGPeeringEvent>(
8740 nextmap->get_epoch(),
8741 nextmap->get_epoch(),
8742 NullEvt())));
8743 }
8744 ret = false;
8745 goto out;
8746 }
8747 }
8748 }
8749 }
8750
7c673cae
FG
8751 vector<int> newup, newacting;
8752 int up_primary, acting_primary;
8753 nextmap->pg_to_up_acting_osds(
11fdf7f2 8754 pg->pg_id.pgid,
7c673cae
FG
8755 &newup, &up_primary,
8756 &newacting, &acting_primary);
8757 pg->handle_advance_map(
8758 nextmap, lastmap, newup, up_primary,
8759 newacting, acting_primary, rctx);
8760
494da23a
TL
8761 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8762 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8763 if (oldpool != lastmap->get_pools().end()
8764 && newpool != nextmap->get_pools().end()) {
8765 dout(20) << __func__
8766 << " new pool opts " << newpool->second.opts
8767 << " old pool opts " << oldpool->second.opts
8768 << dendl;
8769
8770 double old_min_interval = 0, new_min_interval = 0;
8771 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8772 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8773
8774 double old_max_interval = 0, new_max_interval = 0;
8775 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8776 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8777
8778 // Assume that if an interval changes from set to unset or vice versa, the actual config
8779 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8780 // unnecessarily.
8781 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8782 pg->on_info_history_change();
8783 }
8784 }
8785
11fdf7f2
TL
8786 if (new_pg_num && old_pg_num != new_pg_num) {
8787 // check for split
8788 set<spg_t> children;
8789 if (pg->pg_id.is_split(
8790 old_pg_num,
8791 new_pg_num,
8792 &children)) {
8793 split_pgs(
8794 pg, children, &new_pgs, lastmap, nextmap,
8795 rctx);
8796 }
7c673cae
FG
8797 }
8798
8799 lastmap = nextmap;
11fdf7f2 8800 old_pg_num = new_pg_num;
7c673cae
FG
8801 handle.reset_tp_timeout();
8802 }
7c673cae 8803 pg->handle_activate_map(rctx);
11fdf7f2
TL
8804
8805 ret = true;
8806 out:
8807 if (!new_pgs.empty()) {
9f95a23c 8808 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
7c673cae 8809 }
11fdf7f2 8810 return ret;
7c673cae
FG
8811}
8812
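// Publish a newly committed OSDMap to the rest of the OSD: prime pending
// splits and merges on the shards, refresh the per-PG counters, wake
// sessions waiting on the map, and queue a null peering event so every PG
// advances to the new epoch.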
8813void OSD::consume_map()
8814{
9f95a23c
TL
8815 ceph_assert(ceph_mutex_is_locked(osd_lock));
8816 auto osdmap = get_osdmap();
7c673cae
FG
8817 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8818
3efd9988
FG
8819 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8820 * speak the older sorting version any more. Be careful not to force
8821 * a shutdown if we are merely processing old maps, though.
8822 */
8823 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8824 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8825 ceph_abort();
8826 }
8827
11fdf7f2
TL
8828 service.pre_publish_map(osdmap);
8829 service.await_reserved_maps();
8830 service.publish_map(osdmap);
7c673cae 8831
11fdf7f2
TL
8832 // prime splits and merges
8833 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8834 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8835 for (auto& shard : shards) {
8836 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8837 }
8838 if (!newly_split.empty()) {
8839 for (auto& shard : shards) {
8840 shard->prime_splits(osdmap, &newly_split);
8841 }
8842 ceph_assert(newly_split.empty());
8843 }
7c673cae 8844
11fdf7f2
TL
8845 // prune sent_ready_to_merge
8846 service.prune_sent_ready_to_merge(osdmap);
7c673cae 8847
11fdf7f2
TL
8848 // FIXME, maybe: We could race against an incoming peering message
8849 // that instantiates a merge PG after identify_merges() below and
8850 // never set up its peer to complete the merge. An OSD restart
8851 // would clear it up. This is a hard race to resolve,
8852 // extraordinarily rare (we only merge PGs that are stable and
8853 // clean, so it'd have to be an imported PG to an OSD with a
8854 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8855 // replace all of this with seastar-based code soon anyway.
8856 if (!merge_pgs.empty()) {
8857 // mark the pgs we already have, or create new and empty merge
8858 // participants for those we are missing. do this all under the
8859 // shard lock so we don't have to worry about racing pg creates
8860 // via _process.
8861 for (auto& shard : shards) {
8862 shard->prime_merges(osdmap, &merge_pgs);
7c673cae 8863 }
11fdf7f2
TL
8864 ceph_assert(merge_pgs.empty());
8865 }
8866
8867 service.prune_pg_created();
8868
8869 unsigned pushes_to_free = 0;
8870 for (auto& shard : shards) {
8871 shard->consume_map(osdmap, &pushes_to_free);
8872 }
8873
8874 vector<spg_t> pgids;
8875 _get_pgids(&pgids);
8876
8877 // count (FIXME, probably during seastar rewrite)
8878 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8879 vector<PGRef> pgs;
8880 _get_pgs(&pgs);
8881 for (auto& pg : pgs) {
8882 // FIXME (probably during seastar rewrite): this is lockless and
8883 // racy, but we don't want to take pg lock here.
8884 if (pg->is_primary())
8885 num_pg_primary++;
9f95a23c
TL
8886 else if (pg->is_nonprimary())
8887 num_pg_replica++; // misnomer
11fdf7f2
TL
8888 else
8889 num_pg_stray++;
8890 }
3efd9988 8891
11fdf7f2
TL
8892 {
8893 // FIXME (as part of seastar rewrite): move to OSDShard
8894 std::lock_guard l(pending_creates_lock);
8895 for (auto pg = pending_creates_from_osd.begin();
8896 pg != pending_creates_from_osd.end();) {
9f95a23c 8897 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
11fdf7f2
TL
8898 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8899 << "discarding pending_create_from_osd" << dendl;
3efd9988
FG
8900 pg = pending_creates_from_osd.erase(pg);
8901 } else {
8902 ++pg;
8903 }
8904 }
7c673cae
FG
8905 }
8906
7c673cae
FG
8907 service.maybe_inject_dispatch_delay();
8908
8909 dispatch_sessions_waiting_on_map();
8910
8911 service.maybe_inject_dispatch_delay();
8912
11fdf7f2 8913 service.release_reserved_pushes(pushes_to_free);
7c673cae 8914
11fdf7f2
TL
8915 // queue null events to push maps down to individual PGs
8916 for (auto pgid : pgids) {
8917 enqueue_peering_evt(
8918 pgid,
8919 PGPeeringEventRef(
8920 std::make_shared<PGPeeringEvent>(
8921 osdmap->get_epoch(),
8922 osdmap->get_epoch(),
8923 NullEvt())));
7c673cae 8924 }
11fdf7f2 8925 logger->set(l_osd_pg, pgids.size());
7c673cae
FG
8926 logger->set(l_osd_pg_primary, num_pg_primary);
8927 logger->set(l_osd_pg_replica, num_pg_replica);
8928 logger->set(l_osd_pg_stray, num_pg_stray);
8929}
8930
8931void OSD::activate_map()
8932{
9f95a23c
TL
8933 ceph_assert(ceph_mutex_is_locked(osd_lock));
8934 auto osdmap = get_osdmap();
7c673cae
FG
8935
8936 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8937
7c673cae
FG
8938 // norecover?
8939 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8940 if (!service.recovery_is_paused()) {
8941 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8942 service.pause_recovery();
8943 }
8944 } else {
8945 if (service.recovery_is_paused()) {
8946 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8947 service.unpause_recovery();
8948 }
8949 }
8950
8951 service.activate_map();
8952
8953 // process waiters
8954 take_waiters(waiting_for_osdmap);
8955}
8956
8957bool OSD::require_mon_peer(const Message *m)
8958{
8959 if (!m->get_connection()->peer_is_mon()) {
8960 dout(0) << "require_mon_peer received from non-mon "
8961 << m->get_connection()->get_peer_addr()
8962 << " " << *m << dendl;
8963 return false;
8964 }
8965 return true;
8966}
8967
8968bool OSD::require_mon_or_mgr_peer(const Message *m)
8969{
8970 if (!m->get_connection()->peer_is_mon() &&
8971 !m->get_connection()->peer_is_mgr()) {
8972 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8973 << m->get_connection()->get_peer_addr()
8974 << " " << *m << dendl;
8975 return false;
8976 }
8977 return true;
8978}
8979
8980bool OSD::require_osd_peer(const Message *m)
8981{
8982 if (!m->get_connection()->peer_is_osd()) {
8983 dout(0) << "require_osd_peer received from non-osd "
8984 << m->get_connection()->get_peer_addr()
8985 << " " << *m << dendl;
8986 return false;
8987 }
8988 return true;
8989}
8990
8991bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8992{
8993 epoch_t up_epoch = service.get_up_epoch();
8994 if (epoch < up_epoch) {
8995 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8996 return false;
8997 }
8998
8999 if (!is_active()) {
9000 dout(7) << "still in boot state, dropping message " << *m << dendl;
9001 return false;
9002 }
9003
9004 return true;
9005}
9006
9f95a23c 9007bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
7c673cae
FG
9008 bool is_fast_dispatch)
9009{
9010 int from = m->get_source().num();
9011
9012 if (map->is_down(from) ||
11fdf7f2 9013 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
9014 dout(5) << "from dead osd." << from << ", marking down, "
9015 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
9016 << " expected "
9017 << (map->is_up(from) ?
9018 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
9019 << dendl;
9020 ConnectionRef con = m->get_connection();
9021 con->mark_down();
9f95a23c 9022 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
7c673cae 9023 if (!is_fast_dispatch)
9f95a23c 9024 s->session_dispatch_lock.lock();
7c673cae 9025 clear_session_waiting_on_map(s);
11fdf7f2
TL
9026 con->set_priv(nullptr); // break ref <-> session cycle, if any
9027 s->con.reset();
7c673cae 9028 if (!is_fast_dispatch)
9f95a23c 9029 s->session_dispatch_lock.unlock();
7c673cae
FG
9030 }
9031 return false;
9032 }
9033 return true;
9034}
9035
9036
9037/*
9038 * require that we have same (or newer) map, and that
9039 * the source is the pg primary.
9040 */
9041bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9042 bool is_fast_dispatch)
9043{
9044 const Message *m = op->get_req();
9f95a23c 9045 const auto osdmap = get_osdmap();
7c673cae
FG
9046 dout(15) << "require_same_or_newer_map " << epoch
9047 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9048
9f95a23c 9049 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
9050
9051 // do they have a newer map?
9052 if (epoch > osdmap->get_epoch()) {
9053 dout(7) << "waiting for newer map epoch " << epoch
9054 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9055 wait_for_new_map(op);
9056 return false;
9057 }
9058
9059 if (!require_self_aliveness(op->get_req(), epoch)) {
9060 return false;
9061 }
9062
9063 // ok, our map is same or newer.. do they still exist?
9064 if (m->get_connection()->get_messenger() == cluster_messenger &&
9065 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9066 return false;
9067 }
9068
9069 return true;
9070}
9071
9072
9073
9074
9075
9076// ----------------------------------------
9077// pg creation
9078
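// Materialize the child PGs produced by a split: each child gets its own
// collection and shard context queue, and the parent's objects and stats
// are divided among the children.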
9079void OSD::split_pgs(
9080 PG *parent,
31f18b77 9081 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
9082 OSDMapRef curmap,
9083 OSDMapRef nextmap,
9f95a23c 9084 PeeringCtx &rctx)
7c673cae 9085{
11fdf7f2
TL
9086 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9087 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 9088
11fdf7f2
TL
9089 vector<object_stat_sum_t> updated_stats;
9090 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
9091
9092 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9093 for (set<spg_t>::const_iterator i = childpgids.begin();
9094 i != childpgids.end();
9095 ++i, ++stat_iter) {
11fdf7f2
TL
9096 ceph_assert(stat_iter != updated_stats.end());
9097 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
9098 PG* child = _make_pg(nextmap, *i);
9099 child->lock(true);
9100 out_pgs->insert(child);
11fdf7f2 9101 child->ch = store->create_new_collection(child->coll);
7c673cae 9102
11fdf7f2
TL
9103 {
9104 uint32_t shard_index = i->hash_to_shard(shards.size());
9105 assert(NULL != shards[shard_index]);
9106 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9107 }
7c673cae 9108
11fdf7f2
TL
9109 unsigned split_bits = i->get_split_bits(pg_num);
9110 dout(10) << " pg_num is " << pg_num
9111 << ", m_seed " << i->ps()
9112 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
9113 parent->split_colls(
9114 *i,
9115 split_bits,
9116 i->ps(),
11fdf7f2 9117 &child->get_pool().info,
9f95a23c 9118 rctx.transaction);
7c673cae
FG
9119 parent->split_into(
9120 i->pgid,
9121 child,
9122 split_bits);
7c673cae 9123
92f5a8d4
TL
9124 child->init_collection_pool_opts();
9125
9f95a23c 9126 child->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
9127 child->unlock();
9128 }
11fdf7f2 9129 ceph_assert(stat_iter != updated_stats.end());
9f95a23c 9130 parent->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
9131}
9132
9133/*
9134 * holding osd_lock
9135 */
9136void OSD::handle_pg_create(OpRequestRef op)
9137{
9f95a23c
TL
9138 // NOTE: this can be removed in P release (mimic is the last version to
9139 // send MOSDPGCreate messages).
9140
9141 auto m = op->get_req<MOSDPGCreate>();
11fdf7f2 9142 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
9143
9144 dout(10) << "handle_pg_create " << *m << dendl;
9145
9146 if (!require_mon_peer(op->get_req())) {
9147 return;
9148 }
9149
9150 if (!require_same_or_newer_map(op, m->epoch, false))
9151 return;
9152
9153 op->mark_started();
9154
9f95a23c 9155 const auto osdmap = get_osdmap();
7c673cae
FG
9156 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9157 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9158 p != m->mkpg.end();
9159 ++p, ++ci) {
11fdf7f2 9160 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
9161 epoch_t created = p->second.created;
9162 if (p->second.split_bits) // Skip split pgs
9163 continue;
9164 pg_t on = p->first;
9165
7c673cae
FG
9166 if (!osdmap->have_pg_pool(on.pool())) {
9167 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9168 continue;
9169 }
9170
9171 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9172
9f95a23c
TL
9173 spg_t pgid;
9174 bool mapped = osdmap->get_primary_shard(on, &pgid);
9175 ceph_assert(mapped);
9176
7c673cae
FG
9177 // is it still ours?
9178 vector<int> up, acting;
9179 int up_primary = -1;
9180 int acting_primary = -1;
9181 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9f95a23c 9182 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae
FG
9183
9184 if (acting_primary != whoami) {
9185 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9186 << "), my role=" << role << ", skipping" << dendl;
9187 continue;
9188 }
9189
7c673cae 9190
11fdf7f2 9191 PastIntervals pi;
7c673cae
FG
9192 pg_history_t history;
9193 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9194
11fdf7f2
TL
9195 // The mon won't resend unless the primary changed, so we ignore
9196 // same_interval_since. We'll pass this history with the current
9197 // epoch as the event.
7c673cae
FG
9198 if (history.same_primary_since > m->epoch) {
9199 dout(10) << __func__ << ": got obsolete pg create on pgid "
9200 << pgid << " from epoch " << m->epoch
9201 << ", primary changed in " << history.same_primary_since
9202 << dendl;
9203 continue;
9204 }
11fdf7f2
TL
9205 enqueue_peering_evt(
9206 pgid,
9207 PGPeeringEventRef(
9208 std::make_shared<PGPeeringEvent>(
9209 osdmap->get_epoch(),
9210 osdmap->get_epoch(),
9211 NullEvt(),
9212 true,
9213 new PGCreateInfo(
9214 pgid,
9215 osdmap->get_epoch(),
9216 history,
9217 pi,
9218 true)
9219 )));
7c673cae 9220 }
7c673cae 9221
3efd9988 9222 {
11fdf7f2 9223 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9224 if (pending_creates_from_mon == 0) {
9225 last_pg_create_epoch = m->epoch;
9226 }
9227 }
11fdf7f2 9228
7c673cae
FG
9229 maybe_update_heartbeat_peers();
9230}
9231
9232
9233// ----------------------------------------
9234// peering and recovery
9235
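// Send the peering messages accumulated in ctx to peers that are still up
// (sharing the current map with them as needed) and, when a PG is given,
// queue the accumulated transaction on its collection.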
9f95a23c 9236void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
7c673cae
FG
9237 ThreadPool::TPHandle *handle)
9238{
11fdf7f2
TL
9239 if (!service.get_osdmap()->is_up(whoami)) {
9240 dout(20) << __func__ << " not up in osdmap" << dendl;
9241 } else if (!is_active()) {
9242 dout(20) << __func__ << " not active" << dendl;
9243 } else {
9f95a23c
TL
9244 for (auto& [osd, ls] : ctx.message_map) {
9245 if (!curmap->is_up(osd)) {
9246 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9247 continue;
9248 }
9249 ConnectionRef con = service.get_con_osd_cluster(
9250 osd, curmap->get_epoch());
9251 if (!con) {
9252 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9253 << dendl;
9254 continue;
9255 }
9256 service.maybe_share_map(con.get(), curmap);
9257 for (auto m : ls) {
9258 con->send_message2(m);
9259 }
9260 ls.clear();
9261 }
7c673cae 9262 }
9f95a23c 9263 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
7c673cae 9264 int tr = store->queue_transaction(
11fdf7f2 9265 pg->ch,
9f95a23c 9266 std::move(ctx.transaction), TrackedOpRef(),
7c673cae 9267 handle);
11fdf7f2 9268 ceph_assert(tr == 0);
7c673cae 9269 }
7c673cae
FG
9270}
9271
11fdf7f2 9272void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9273{
11fdf7f2
TL
9274 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9275 if (!require_mon_peer(m)) {
9276 m->put();
7c673cae 9277 return;
7c673cae 9278 }
11fdf7f2
TL
9279 for (auto& p : m->pgs) {
9280 spg_t pgid = p.first;
9281 epoch_t created = p.second.first;
9282 utime_t created_stamp = p.second.second;
9f95a23c
TL
9283 auto q = m->pg_extra.find(pgid);
9284 if (q == m->pg_extra.end()) {
9285 dout(20) << __func__ << " " << pgid << " e" << created
9286 << "@" << created_stamp
9287 << " (no history or past_intervals)" << dendl;
9288 // pre-octopus ... no pg history. this can be removed in Q release.
9289 enqueue_peering_evt(
9290 pgid,
9291 PGPeeringEventRef(
9292 std::make_shared<PGPeeringEvent>(
9293 m->epoch,
9294 m->epoch,
9295 NullEvt(),
9296 true,
9297 new PGCreateInfo(
9298 pgid,
9299 created,
9300 pg_history_t(created, created_stamp),
9301 PastIntervals(),
9302 true)
9303 )));
9304 } else {
9305 dout(20) << __func__ << " " << pgid << " e" << created
9306 << "@" << created_stamp
9307 << " history " << q->second.first
9308 << " pi " << q->second.second << dendl;
9309 if (!q->second.second.empty() &&
9310 m->epoch < q->second.second.get_bounds().second) {
9311 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9312 << " and unmatched past_intervals " << q->second.second
9313 << " (history " << q->second.first << ")";
9314 } else {
9315 enqueue_peering_evt(
9316 pgid,
9317 PGPeeringEventRef(
9318 std::make_shared<PGPeeringEvent>(
9319 m->epoch,
9320 m->epoch,
9321 NullEvt(),
9322 true,
9323 new PGCreateInfo(
9324 pgid,
9325 m->epoch,
9326 q->second.first,
9327 q->second.second,
9328 true)
9329 )));
9330 }
9331 }
11fdf7f2 9332 }
7c673cae 9333
11fdf7f2
TL
9334 {
9335 std::lock_guard l(pending_creates_lock);
9336 if (pending_creates_from_mon == 0) {
9337 last_pg_create_epoch = m->epoch;
9338 }
7c673cae
FG
9339 }
9340
11fdf7f2 9341 m->put();
7c673cae
FG
9342}
9343
11fdf7f2 9344void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9345{
11fdf7f2
TL
9346 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9347 if (!require_osd_peer(m)) {
9348 m->put();
7c673cae
FG
9349 return;
9350 }
11fdf7f2
TL
9351 int from = m->get_source().num();
9352 for (auto& p : m->get_pg_list()) {
9f95a23c 9353 spg_t pgid(p.info.pgid.pgid, p.to);
11fdf7f2
TL
9354 enqueue_peering_evt(
9355 pgid,
9356 PGPeeringEventRef(
9357 std::make_shared<PGPeeringEvent>(
9f95a23c
TL
9358 p.epoch_sent,
9359 p.query_epoch,
11fdf7f2 9360 MNotifyRec(
9f95a23c
TL
9361 pgid, pg_shard_t(from, p.from),
9362 p,
9363 m->get_connection()->get_features()),
11fdf7f2
TL
9364 true,
9365 new PGCreateInfo(
9366 pgid,
9f95a23c
TL
9367 p.query_epoch,
9368 p.info.history,
9369 p.past_intervals,
11fdf7f2
TL
9370 false)
9371 )));
7c673cae 9372 }
11fdf7f2 9373 m->put();
7c673cae
FG
9374}
9375
11fdf7f2 9376void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9377{
11fdf7f2
TL
9378 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9379 if (!require_osd_peer(m)) {
9380 m->put();
7c673cae
FG
9381 return;
9382 }
11fdf7f2
TL
9383 int from = m->get_source().num();
9384 for (auto& p : m->pg_list) {
9385 enqueue_peering_evt(
9f95a23c 9386 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2 9387 PGPeeringEventRef(
20effc67
TL
9388 std::make_shared<PGPeeringEvent>(
9389 p.epoch_sent, p.query_epoch,
9390 MInfoRec(
9391 pg_shard_t(from, p.from),
9392 p.info,
9393 p.epoch_sent)))
11fdf7f2 9394 );
7c673cae 9395 }
11fdf7f2 9396 m->put();
7c673cae
FG
9397}
9398
11fdf7f2 9399void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9400{
11fdf7f2
TL
9401 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9402 if (!require_osd_peer(m)) {
9403 m->put();
7c673cae
FG
9404 return;
9405 }
11fdf7f2
TL
9406 for (auto& pgid : m->pg_list) {
9407 enqueue_peering_evt(
9408 pgid,
9409 PGPeeringEventRef(
9410 std::make_shared<PGPeeringEvent>(
9411 m->get_epoch(), m->get_epoch(),
9f95a23c 9412 PeeringState::DeleteStart())));
7c673cae 9413 }
11fdf7f2 9414 m->put();
7c673cae
FG
9415}
9416
11fdf7f2 9417void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9418{
11fdf7f2
TL
9419 dout(10) << __func__ << " " << *m << dendl;
9420 if (!require_mon_or_mgr_peer(m)) {
9421 m->put();
9422 return;
9423 }
9424 epoch_t epoch = get_osdmap_epoch();
9425 for (auto pgid : m->forced_pgs) {
9426 if (m->options & OFR_BACKFILL) {
9427 if (m->options & OFR_CANCEL) {
9428 enqueue_peering_evt(
9429 pgid,
9430 PGPeeringEventRef(
9431 std::make_shared<PGPeeringEvent>(
9432 epoch, epoch,
9f95a23c 9433 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9434 } else {
9435 enqueue_peering_evt(
9436 pgid,
9437 PGPeeringEventRef(
9438 std::make_shared<PGPeeringEvent>(
9439 epoch, epoch,
9f95a23c 9440 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9441 }
9442 } else if (m->options & OFR_RECOVERY) {
9443 if (m->options & OFR_CANCEL) {
9444 enqueue_peering_evt(
9445 pgid,
9446 PGPeeringEventRef(
9447 std::make_shared<PGPeeringEvent>(
9448 epoch, epoch,
9f95a23c 9449 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9450 } else {
9451 enqueue_peering_evt(
9452 pgid,
9453 PGPeeringEventRef(
9454 std::make_shared<PGPeeringEvent>(
9455 epoch, epoch,
9f95a23c 9456 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9457 }
9458 }
9459 }
11fdf7f2 9460 m->put();
c07f9fc5 9461}
7c673cae 9462
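// Answer a query for a PG this OSD does not have: reply with an empty info
// (or an empty log for LOG/FULLLOG queries) so the querying peer can make
// progress.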
11fdf7f2 9463void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9464{
11fdf7f2
TL
9465 spg_t pgid = q.pgid;
9466 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9467
11fdf7f2
TL
9468 OSDMapRef osdmap = get_osdmap();
9469 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9470 return;
9471
11fdf7f2
TL
9472 dout(10) << " pg " << pgid << " dne" << dendl;
9473 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9474 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9475 if (con) {
9476 Message *m;
9477 if (q.query.type == pg_query_t::LOG ||
9478 q.query.type == pg_query_t::FULLLOG) {
9479 m = new MOSDPGLog(
9480 q.query.from, q.query.to,
9481 osdmap->get_epoch(), empty,
9482 q.query.epoch_sent);
7c673cae 9483 } else {
20effc67
TL
9484 pg_notify_t notify{q.query.from, q.query.to,
9485 q.query.epoch_sent,
9486 osdmap->get_epoch(),
9487 empty,
9488 PastIntervals()};
9489 m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
9490 std::move(notify));
7c673cae 9491 }
9f95a23c 9492 service.maybe_share_map(con.get(), osdmap);
11fdf7f2 9493 con->send_message(m);
7c673cae
FG
9494 }
9495}
9496
9f95a23c
TL
9497void OSDService::queue_check_readable(spg_t spgid,
9498 epoch_t lpr,
9499 ceph::signedspan delay)
9500{
9501 if (delay == ceph::signedspan::zero()) {
9502 osd->enqueue_peering_evt(
9503 spgid,
9504 PGPeeringEventRef(
9505 std::make_shared<PGPeeringEvent>(
9506 lpr, lpr,
9507 PeeringState::CheckReadable())));
9508 } else {
9509 mono_timer.add_event(
9510 delay,
9511 [this, spgid, lpr]() {
9512 queue_check_readable(spgid, lpr);
9513 });
9514 }
9515}
9516
7c673cae 9517
7c673cae
FG
9518// =========================================================
9519// RECOVERY
9520
9521void OSDService::_maybe_queue_recovery() {
9f95a23c 9522 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
7c673cae
FG
9523 uint64_t available_pushes;
9524 while (!awaiting_throttle.empty() &&
9525 _recover_now(&available_pushes)) {
11fdf7f2 9526 uint64_t to_start = std::min(
7c673cae
FG
9527 available_pushes,
9528 cct->_conf->osd_recovery_max_single_start);
9529 _queue_for_recovery(awaiting_throttle.front(), to_start);
9530 awaiting_throttle.pop_front();
11fdf7f2
TL
9531 dout(10) << __func__ << " starting " << to_start
9532 << ", recovery_ops_reserved " << recovery_ops_reserved
9533 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9534 recovery_ops_reserved += to_start;
9535 }
9536}
9537
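// Return true if new recovery ops may start now: recovery is neither
// deferred nor paused, and active + reserved ops are below the configured
// maximum. On success, *available_pushes is set to the remaining headroom.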
9538bool OSDService::_recover_now(uint64_t *available_pushes)
9539{
9540 if (available_pushes)
9541 *available_pushes = 0;
9542
9543 if (ceph_clock_now() < defer_recovery_until) {
9544 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9545 return false;
9546 }
9547
9548 if (recovery_paused) {
9549 dout(15) << __func__ << " paused" << dendl;
9550 return false;
9551 }
9552
9f95a23c 9553 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9554 if (max <= recovery_ops_active + recovery_ops_reserved) {
9555 dout(15) << __func__ << " active " << recovery_ops_active
9556 << " + reserved " << recovery_ops_reserved
9557 << " >= max " << max << dendl;
9558 return false;
9559 }
9560
9561 if (available_pushes)
9562 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9563
9564 return true;
9565}
9566
9f95a23c
TL
9567unsigned OSDService::get_target_pg_log_entries() const
9568{
9569 auto num_pgs = osd->get_num_pgs();
9570 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9571 if (num_pgs > 0 && target > 0) {
9572 // target an even spread of our budgeted log entries across all
9573 // PGs. note that while we only get to control the entry count
9574 // for primary PGs, we'll normally be responsible for a mix of
9575 // primary and replica PGs (for the same pool(s) even), so this
9576 // will work out.
9577 return std::max<unsigned>(
9578 std::min<unsigned>(target / num_pgs,
9579 cct->_conf->osd_max_pg_log_entries),
9580 cct->_conf->osd_min_pg_log_entries);
9581 } else {
9582 // fall back to a per-pg value.
9583 return cct->_conf->osd_min_pg_log_entries;
9584 }
9585}
9586
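// Run up to reserved_pushes recovery operations on this PG. When
// osd_recovery_sleep is non-zero the work is re-queued through a timer so
// consecutive recovery batches are spaced out; if start_recovery_ops()
// reports unfound objects, find_unfound() is called to look for them.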
7c673cae
FG
9587void OSD::do_recovery(
9588 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9589 ThreadPool::TPHandle &handle)
9590{
9591 uint64_t started = 0;
31f18b77
FG
9592
9593 /*
9594 * When the value of osd_recovery_sleep is set greater than zero, recovery
9595 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9596 * recovery event's schedule time. This is done by adding a
9597 * recovery_requeue_callback event, which re-queues the recovery op using
9598 * queue_recovery_after_sleep.
9599 */
c07f9fc5 9600 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9601 {
11fdf7f2 9602 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9603 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9604 PGRef pgref(pg);
9f95a23c 9605 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
b32b8144
FG
9606 dout(20) << "do_recovery wake up at "
9607 << ceph_clock_now()
9608 << ", re-queuing recovery" << dendl;
11fdf7f2 9609 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9610 service.recovery_needs_sleep = false;
9611 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9612 });
9613
9614 // This is true for the first recovery op and when the previous recovery op
9615 // has been scheduled in the past. The next recovery op is scheduled after
9616 // completing the sleep from now.
f67539c2 9617
9f95a23c
TL
9618 if (auto now = ceph::real_clock::now();
9619 service.recovery_schedule_time < now) {
9620 service.recovery_schedule_time = now;
b32b8144 9621 }
9f95a23c 9622 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
11fdf7f2 9623 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9f95a23c 9624 recovery_requeue_callback);
b32b8144
FG
9625 dout(20) << "Recovery event scheduled at "
9626 << service.recovery_schedule_time << dendl;
9627 return;
9628 }
7c673cae
FG
9629 }
9630
9631 {
b32b8144 9632 {
11fdf7f2 9633 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9634 service.recovery_needs_sleep = true;
9635 }
9636
7c673cae
FG
9637 if (pg->pg_has_reset_since(queued)) {
9638 goto out;
9639 }
9640
7c673cae
FG
9641 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9642#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9643 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9644#endif
9645
11fdf7f2 9646 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
f67539c2 9647 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
7c673cae
FG
9648 << " on " << *pg << dendl;
9649
11fdf7f2 9650 if (do_unfound) {
20effc67 9651 PeeringCtx rctx;
11fdf7f2 9652 rctx.handle = &handle;
9f95a23c 9653 pg->find_unfound(queued, rctx);
11fdf7f2 9654 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9655 }
7c673cae
FG
9656 }
9657
9658 out:
11fdf7f2 9659 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9660 service.release_reserved_pushes(reserved_pushes);
9661}
9662
9663void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9664{
11fdf7f2 9665 std::lock_guard l(recovery_lock);
7c673cae
FG
9666 dout(10) << "start_recovery_op " << *pg << " " << soid
9667 << " (" << recovery_ops_active << "/"
9f95a23c 9668 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9669 << dendl;
9670 recovery_ops_active++;
9671
9672#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9673 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9674 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9675 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9676#endif
9677}
9678
9679void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9680{
11fdf7f2 9681 std::lock_guard l(recovery_lock);
7c673cae
FG
9682 dout(10) << "finish_recovery_op " << *pg << " " << soid
9683 << " dequeue=" << dequeue
9f95a23c
TL
9684 << " (" << recovery_ops_active << "/"
9685 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9686 << dendl;
9687
9688 // adjust count
11fdf7f2 9689 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9690 recovery_ops_active--;
9691
9692#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9693 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9694 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9695 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9696#endif
9697
9698 _maybe_queue_recovery();
9699}
9700
9701bool OSDService::is_recovery_active()
9702{
eafe8130
TL
9703 if (cct->_conf->osd_debug_pretend_recovery_active) {
9704 return true;
9705 }
b5b8bbf5 9706 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9707}
9708
11fdf7f2
TL
9709void OSDService::release_reserved_pushes(uint64_t pushes)
9710{
9711 std::lock_guard l(recovery_lock);
9712 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9713 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9714 << dendl;
9715 ceph_assert(recovery_ops_reserved >= pushes);
9716 recovery_ops_reserved -= pushes;
9717 _maybe_queue_recovery();
9718}
9719
7c673cae
FG
9720// =========================================================
9721// OPS
9722
9723bool OSD::op_is_discardable(const MOSDOp *op)
9724{
9725 // drop the client request if the client is no longer connected and
9726 // can't get the reply anyway.
9727 if (!op->get_connection()->is_connected()) {
9728 return true;
9729 }
9730 return false;
9731}
9732
11fdf7f2 9733void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9734{
11fdf7f2
TL
9735 const utime_t stamp = op->get_req()->get_recv_stamp();
9736 const utime_t latency = ceph_clock_now() - stamp;
9737 const unsigned priority = op->get_req()->get_priority();
9738 const int cost = op->get_req()->get_cost();
9739 const uint64_t owner = op->get_req()->get_source().num();
f67539c2 9740 const int type = op->get_req()->get_type();
11fdf7f2
TL
9741
9742 dout(15) << "enqueue_op " << op << " prio " << priority
f67539c2 9743 << " type " << type
11fdf7f2 9744 << " cost " << cost
7c673cae
FG
9745 << " latency " << latency
9746 << " epoch " << epoch
9747 << " " << *(op->get_req()) << dendl;
9748 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9749 op->osd_trace.keyval("priority", priority);
9750 op->osd_trace.keyval("cost", cost);
20effc67
TL
9751
9752 auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
9753 enqueue_span->AddEvent(__func__, {
9754 {"priority", priority},
9755 {"cost", cost},
9756 {"epoch", epoch},
9757 {"owner", owner},
9758 {"type", type}
9759 });
9760
7c673cae 9761 op->mark_queued_for_pg();
224ce89b 9762 logger->tinc(l_osd_op_before_queue_op_lat, latency);
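// Recovery pushes and push replies are queued as PGRecoveryMsg items, which
// lets the op scheduler account for them separately from client ops; all
// other requests are queued as regular PGOpItems.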
f67539c2
TL
9763 if (type == MSG_OSD_PG_PUSH ||
9764 type == MSG_OSD_PG_PUSH_REPLY) {
9765 op_shardedwq.queue(
9766 OpSchedulerItem(
9767 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9768 cost, priority, stamp, owner, epoch));
9769 } else {
9770 op_shardedwq.queue(
9771 OpSchedulerItem(
9772 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9773 cost, priority, stamp, owner, epoch));
9774 }
7c673cae
FG
9775}
9776
11fdf7f2
TL
9777void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9778{
9779 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9780 op_shardedwq.queue(
9f95a23c
TL
9781 OpSchedulerItem(
9782 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
11fdf7f2
TL
9783 10,
9784 cct->_conf->osd_peering_op_priority,
9785 utime_t(),
9786 0,
9787 evt->get_epoch_sent()));
9788}
7c673cae
FG
9789
9790/*
9791 * NOTE: dequeue called in worker thread, with pg lock
9792 */
9793void OSD::dequeue_op(
9794 PGRef pg, OpRequestRef op,
9795 ThreadPool::TPHandle &handle)
9796{
9f95a23c
TL
9797 const Message *m = op->get_req();
9798
11fdf7f2 9799 FUNCTRACE(cct);
9f95a23c 9800 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
7c673cae
FG
9801
9802 utime_t now = ceph_clock_now();
9803 op->set_dequeued_time(now);
9f95a23c
TL
9804
9805 utime_t latency = now - m->get_recv_stamp();
9806 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9807 << " cost " << m->get_cost()
7c673cae 9808 << " latency " << latency
9f95a23c 9809 << " " << *m
7c673cae
FG
9810 << " pg " << *pg << dendl;
9811
224ce89b
WB
9812 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9813
9f95a23c
TL
9814 service.maybe_share_map(m->get_connection().get(),
9815 pg->get_osdmap(),
9816 op->sent_epoch);
7c673cae 9817
11fdf7f2 9818 if (pg->is_deleting())
7c673cae
FG
9819 return;
9820
9821 op->mark_reached_pg();
9822 op->osd_trace.event("dequeue_op");
9823
9824 pg->do_request(op, handle);
9825
9826 // finish
9827 dout(10) << "dequeue_op " << op << " finish" << dendl;
9f95a23c 9828 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
7c673cae
FG
9829}
9830
9831
11fdf7f2
TL
9832void OSD::dequeue_peering_evt(
9833 OSDShard *sdata,
9834 PG *pg,
9835 PGPeeringEventRef evt,
9836 ThreadPool::TPHandle& handle)
7c673cae 9837{
11fdf7f2 9838 auto curmap = sdata->get_osdmap();
9f95a23c
TL
9839 bool need_up_thru = false;
9840 epoch_t same_interval_since = 0;
11fdf7f2
TL
9841 if (!pg) {
9842 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9843 handle_pg_query_nopg(*q);
7c673cae 9844 } else {
11fdf7f2
TL
9845 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9846 ceph_abort();
9847 }
20effc67
TL
9848 } else if (PeeringCtx rctx;
9849 advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9f95a23c 9850 pg->do_peering_event(evt, rctx);
11fdf7f2 9851 if (pg->is_deleted()) {
11fdf7f2
TL
9852 pg->unlock();
9853 return;
7c673cae 9854 }
9f95a23c 9855 dispatch_context(rctx, pg, curmap, &handle);
11fdf7f2
TL
9856 need_up_thru = pg->get_need_up_thru();
9857 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9858 pg->unlock();
9859 }
11fdf7f2
TL
9860
9861 if (need_up_thru) {
7c673cae 9862 queue_want_up_thru(same_interval_since);
11fdf7f2 9863 }
7c673cae
FG
9864
9865 service.send_pg_temp();
9866}
9867
11fdf7f2
TL
9868void OSD::dequeue_delete(
9869 OSDShard *sdata,
9870 PG *pg,
9871 epoch_t e,
9872 ThreadPool::TPHandle& handle)
9873{
9874 dequeue_peering_evt(
9875 sdata,
9876 pg,
9877 PGPeeringEventRef(
9878 std::make_shared<PGPeeringEvent>(
9879 e, e,
9f95a23c 9880 PeeringState::DeleteSome())),
11fdf7f2
TL
9881 handle);
9882}
9883
9884
9885
7c673cae
FG
9886// --------------------------------
9887
9888const char** OSD::get_tracked_conf_keys() const
9889{
9890 static const char* KEYS[] = {
9891 "osd_max_backfills",
9892 "osd_min_recovery_priority",
224ce89b
WB
9893 "osd_max_trimming_pgs",
9894 "osd_op_complaint_time",
9895 "osd_op_log_threshold",
9896 "osd_op_history_size",
9897 "osd_op_history_duration",
9898 "osd_op_history_slow_op_size",
9899 "osd_op_history_slow_op_threshold",
7c673cae
FG
9900 "osd_enable_op_tracker",
9901 "osd_map_cache_size",
11fdf7f2 9902 "osd_pg_epoch_max_lag_factor",
7c673cae 9903 "osd_pg_epoch_persisted_max_stale",
f67539c2
TL
9904 "osd_recovery_sleep",
9905 "osd_recovery_sleep_hdd",
9906 "osd_recovery_sleep_ssd",
9907 "osd_recovery_sleep_hybrid",
b3b6e05e
TL
9908 "osd_delete_sleep",
9909 "osd_delete_sleep_hdd",
9910 "osd_delete_sleep_ssd",
9911 "osd_delete_sleep_hybrid",
9912 "osd_snap_trim_sleep",
9913 "osd_snap_trim_sleep_hdd",
9914 "osd_snap_trim_sleep_ssd",
20effc67 9915 "osd_snap_trim_sleep_hybrid",
b3b6e05e 9916 "osd_scrub_sleep",
f67539c2
TL
9917 "osd_recovery_max_active",
9918 "osd_recovery_max_active_hdd",
9919 "osd_recovery_max_active_ssd",
7c673cae
FG
9920 // clog & admin clog
9921 "clog_to_monitors",
9922 "clog_to_syslog",
9923 "clog_to_syslog_facility",
9924 "clog_to_syslog_level",
9925 "osd_objectstore_fuse",
9926 "clog_to_graylog",
9927 "clog_to_graylog_host",
9928 "clog_to_graylog_port",
9929 "host",
9930 "fsid",
9931 "osd_recovery_delay_start",
9932 "osd_client_message_size_cap",
9933 "osd_client_message_cap",
31f18b77
FG
9934 "osd_heartbeat_min_size",
9935 "osd_heartbeat_interval",
9f95a23c 9936 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9937 "osd_scrub_min_interval",
9938 "osd_scrub_max_interval",
7c673cae
FG
9939 NULL
9940 };
9941 return KEYS;
9942}
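// The keys above are the options this OSD tracks at runtime; a change to any
// of them is applied by handle_conf_change() below without a restart. A
// minimal illustration (commands assume an osd.0 and are not part of the
// original source):
//
//   ceph config set osd.0 osd_recovery_sleep 0.1
//   ceph daemon osd.0 config get osd_recovery_sleep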
9943
11fdf7f2 9944void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9945 const std::set <std::string> &changed)
9946{
9f95a23c 9947 std::lock_guard l{osd_lock};
f67539c2
TL
9948
9949 if (changed.count("osd_max_backfills") ||
9950 changed.count("osd_delete_sleep") ||
9951 changed.count("osd_delete_sleep_hdd") ||
9952 changed.count("osd_delete_sleep_ssd") ||
9953 changed.count("osd_delete_sleep_hybrid") ||
9954 changed.count("osd_snap_trim_sleep") ||
9955 changed.count("osd_snap_trim_sleep_hdd") ||
9956 changed.count("osd_snap_trim_sleep_ssd") ||
9957 changed.count("osd_snap_trim_sleep_hybrid") ||
9958 changed.count("osd_scrub_sleep") ||
9959 changed.count("osd_recovery_sleep") ||
9960 changed.count("osd_recovery_sleep_hdd") ||
9961 changed.count("osd_recovery_sleep_ssd") ||
9962 changed.count("osd_recovery_sleep_hybrid") ||
9963 changed.count("osd_recovery_max_active") ||
9964 changed.count("osd_recovery_max_active_hdd") ||
9965 changed.count("osd_recovery_max_active_ssd")) {
b3b6e05e
TL
9966 if (!maybe_override_options_for_qos() &&
9967 changed.count("osd_max_backfills")) {
9968 // Scheduler is not "mclock". Fall back to the earlier behavior.
f67539c2
TL
9969 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9970 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9971 }
7c673cae
FG
9972 }
9973 if (changed.count("osd_min_recovery_priority")) {
9974 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9975 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9976 }
9977 if (changed.count("osd_max_trimming_pgs")) {
9978 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9979 }
9980 if (changed.count("osd_op_complaint_time") ||
9981 changed.count("osd_op_log_threshold")) {
9982 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9983 cct->_conf->osd_op_log_threshold);
9984 }
9985 if (changed.count("osd_op_history_size") ||
9986 changed.count("osd_op_history_duration")) {
9987 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9988 cct->_conf->osd_op_history_duration);
9989 }
9990 if (changed.count("osd_op_history_slow_op_size") ||
9991 changed.count("osd_op_history_slow_op_threshold")) {
9992 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9993 cct->_conf->osd_op_history_slow_op_threshold);
9994 }
9995 if (changed.count("osd_enable_op_tracker")) {
9996 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9997 }
7c673cae
FG
9998 if (changed.count("osd_map_cache_size")) {
9999 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10000 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10001 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10002 }
10003 if (changed.count("clog_to_monitors") ||
10004 changed.count("clog_to_syslog") ||
10005 changed.count("clog_to_syslog_level") ||
10006 changed.count("clog_to_syslog_facility") ||
10007 changed.count("clog_to_graylog") ||
10008 changed.count("clog_to_graylog_host") ||
10009 changed.count("clog_to_graylog_port") ||
10010 changed.count("host") ||
10011 changed.count("fsid")) {
10012 update_log_config();
10013 }
11fdf7f2
TL
10014 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10015 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10016 "osd_pg_epoch_max_lag_factor");
10017 }
7c673cae
FG
10018
10019#ifdef HAVE_LIBFUSE
10020 if (changed.count("osd_objectstore_fuse")) {
10021 if (store) {
10022 enable_disable_fuse(false);
10023 }
10024 }
10025#endif
10026
10027 if (changed.count("osd_recovery_delay_start")) {
10028 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10029 service.kick_recovery_queue();
10030 }
10031
10032 if (changed.count("osd_client_message_cap")) {
10033 uint64_t newval = cct->_conf->osd_client_message_cap;
10034 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
20effc67 10035 if (pol.throttler_messages) {
7c673cae
FG
10036 pol.throttler_messages->reset_max(newval);
10037 }
10038 }
10039 if (changed.count("osd_client_message_size_cap")) {
10040 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10041 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
20effc67 10042 if (pol.throttler_bytes) {
7c673cae
FG
10043 pol.throttler_bytes->reset_max(newval);
10044 }
10045 }
9f95a23c
TL
10046 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10047 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10048 }
7c673cae 10049
494da23a
TL
10050 if (changed.count("osd_scrub_min_interval") ||
10051 changed.count("osd_scrub_max_interval")) {
10052 resched_all_scrubs();
10053 dout(0) << __func__ << ": scrub interval change" << dendl;
10054 }
7c673cae 10055 check_config();
f67539c2
TL
10056 if (changed.count("osd_asio_thread_count")) {
10057 service.poolctx.stop();
10058 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10059 }
7c673cae
FG
10060}
10061
a4b75251
TL
10062void OSD::maybe_override_max_osd_capacity_for_qos()
10063{
10064 // If the enabled scheduler is mclock, override the default
10065 // osd capacity with the value obtained from running the
10066 // osd bench test. This is later used to set up mclock.
10067 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
20effc67
TL
10068 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
10069 (!unsupported_objstore_for_qos())) {
a4b75251
TL
10070 std::string max_capacity_iops_config;
10071 bool force_run_benchmark =
10072 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
10073
10074 if (store_is_rotational) {
10075 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
10076 } else {
10077 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
10078 }
10079
10080 if (!force_run_benchmark) {
10081 double default_iops = 0.0;
10082
10083 // Get the current osd iops capacity
10084 double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
10085
10086 // Get the default max iops capacity
10087 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
10088 if (!val.has_value()) {
10089 derr << __func__ << " Unable to determine default value of "
10090 << max_capacity_iops_config << dendl;
10091 // Cannot determine default iops. Force a run of the OSD benchmark.
10092 force_run_benchmark = true;
10093 } else {
10094 // Default iops
10095 default_iops = std::stod(val.value());
10096 }
10097
10098 // Determine if we really need to run the osd benchmark
10099 if (!force_run_benchmark && (default_iops != cur_iops)) {
10100 dout(1) << __func__ << std::fixed << std::setprecision(2)
10101 << " default_iops: " << default_iops
10102 << " cur_iops: " << cur_iops
10103 << ". Skip OSD benchmark test." << dendl;
10104 return;
10105 }
10106 }
10107
10108 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
10109 int64_t count = 12288000; // Count of bytes to write
10110 int64_t bsize = 4096; // Block size
10111 int64_t osize = 4194304; // Object size
10112 int64_t onum = 100; // Count of objects to write
10113 double elapsed = 0.0; // Time taken to complete the test
10114 double iops = 0.0;
10115 stringstream ss;
10116 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
10117 if (ret != 0) {
10118 derr << __func__
10119 << " osd bench err: " << ret
10120 << " osd bench errstr: " << ss.str()
10121 << dendl;
10122 return;
10123 }
10124
10125 double rate = count / elapsed;
10126 iops = rate / bsize;
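// Worked example (illustrative numbers only): if the 12288000-byte write
// completes in elapsed = 2.0 s, then rate = 12288000 / 2.0 = 6144000 B/s
// (~5.86 MiB/s) and iops = 6144000 / 4096 = 1500 4 KiB writes per second.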
10127 dout(1) << __func__
10128 << " osd bench result -"
10129 << std::fixed << std::setprecision(3)
10130 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
10131 << " iops: " << iops
10132 << " elapsed_sec: " << elapsed
10133 << dendl;
10134
10135 // Persist iops to the MON store
10136 ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
10137 if (ret < 0) {
10138 // Fall back to setting the config within the in-memory "values" map.
10139 cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
10140 }
10141
10142 // Override the max osd capacity for all shards
10143 for (auto& shard : shards) {
10144 shard->update_scheduler_config();
10145 }
10146 }
10147}
10148
b3b6e05e
TL
10149bool OSD::maybe_override_options_for_qos()
10150{
10151 // If the enabled scheduler is mclock, override the recovery, backfill
10152 // and sleep options so that mclock can meet its QoS goals.
20effc67
TL
10153 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10154 !unsupported_objstore_for_qos()) {
b3b6e05e
TL
10155 dout(1) << __func__
10156 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10157
10158 // Set a high value for recovery max active
10159 uint32_t rec_max_active = 1000;
10160 cct->_conf.set_val(
10161 "osd_recovery_max_active", std::to_string(rec_max_active));
10162 cct->_conf.set_val(
10163 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10164 cct->_conf.set_val(
10165 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10166
10167 // Set a high value for osd_max_backfills
10168 uint32_t max_backfills = 1000;
10169 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10170 service.local_reserver.set_max(max_backfills);
10171 service.remote_reserver.set_max(max_backfills);
10172
10173 // Disable recovery sleep
10174 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10175 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10176 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10177 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10178
10179 // Disable delete sleep
10180 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10181 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10182 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10183 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10184
10185 // Disable snap trim sleep
10186 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10187 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10188 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10189 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10190
10191 // Disable scrub sleep
10192 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10193 return true;
10194 }
10195 return false;
10196}
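// When mclock is active, the values set above can be inspected on a running
// OSD; an illustrative check (assuming an osd.0, not part of this source):
//
//   ceph daemon osd.0 config get osd_max_backfills   # expected: 1000
//   ceph daemon osd.0 config get osd_recovery_sleep  # expected: 0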
10197
a4b75251
TL
10198int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10199{
10200 std::string cmd =
10201 "{"
10202 "\"prefix\": \"config set\", "
10203 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10204 "\"name\": \"" + key + "\", "
10205 "\"value\": \"" + val + "\""
10206 "}";
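// For illustration only (values are hypothetical): with whoami = 3,
// key = "osd_mclock_max_capacity_iops_ssd" and val = "21500.0", the
// resulting monitor command is:
//
//   {"prefix": "config set", "who": "osd.3",
//    "name": "osd_mclock_max_capacity_iops_ssd", "value": "21500.0"}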
10207
10208 vector<std::string> vcmd{cmd};
10209 bufferlist inbl;
10210 std::string outs;
10211 C_SaferCond cond;
10212 monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
10213 int r = cond.wait();
10214 if (r < 0) {
10215 derr << __func__ << " Failed to set config key " << key
10216 << " err: " << cpp_strerror(r)
10217 << " errstr: " << outs << dendl;
10218 return r;
10219 }
10220
10221 return 0;
10222}
10223
20effc67
TL
10224bool OSD::unsupported_objstore_for_qos()
10225{
10226 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10227 return std::find(unsupported_objstores.begin(),
10228 unsupported_objstores.end(),
10229 store->get_type()) != unsupported_objstores.end();
10230}
10231
7c673cae
FG
10232void OSD::update_log_config()
10233{
20effc67
TL
10234 auto parsed_options = clog->parse_client_options(cct);
10235 derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
7c673cae
FG
10236}
10237
10238void OSD::check_config()
10239{
10240 // some sanity checks
7c673cae
FG
10241 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10242 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10243 << " is not > osd_pg_epoch_persisted_max_stale ("
10244 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10245 }
9f95a23c 10246 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
f67539c2 10247 clog->warn() << "osd_object_clean_region_max_num_intervals ("
9f95a23c
TL
10248 << cct->_conf->osd_object_clean_region_max_num_intervals
10249 << ") is < 0";
10250 }
7c673cae
FG
10251}
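// Example of the first check (illustrative numbers): with
// osd_pg_epoch_persisted_max_stale = 40, an osd_map_cache_size of 42 or
// less triggers the warning; it must be at least 43 to satisfy the check.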
10252
7c673cae
FG
10253// --------------------------------
10254
10255void OSD::get_latest_osdmap()
10256{
10257 dout(10) << __func__ << " -- start" << dendl;
10258
f67539c2
TL
10259 boost::system::error_code ec;
10260 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
7c673cae
FG
10261
10262 dout(10) << __func__ << " -- finish" << dendl;
10263}
10264
10265// --------------------------------
10266
9f95a23c
TL
10267void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10268 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10269 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
11fdf7f2
TL
10270 dout(10) << "setting " << queries.size() << " queries" << dendl;
10271
10272 std::list<OSDPerfMetricQuery> supported_queries;
10273 for (auto &it : queries) {
10274 auto &query = it.first;
10275 if (!query.key_descriptor.empty()) {
10276 supported_queries.push_back(query);
10277 }
10278 }
10279 if (supported_queries.size() < queries.size()) {
10280 dout(1) << queries.size() - supported_queries.size()
10281 << " unsupported queries" << dendl;
10282 }
11fdf7f2 10283 {
9f95a23c 10284 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
10285 m_perf_queries = supported_queries;
10286 m_perf_limits = queries;
10287 }
11fdf7f2
TL
10288 std::vector<PGRef> pgs;
10289 _get_pgs(&pgs);
10290 for (auto& pg : pgs) {
9f95a23c 10291 std::scoped_lock l{*pg};
eafe8130 10292 pg->set_dynamic_perf_stats_queries(supported_queries);
7c673cae 10293 }
7c673cae
FG
10294}
10295
9f95a23c
TL
10296MetricPayload OSD::get_perf_reports() {
10297 OSDMetricPayload payload;
10298 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10299
11fdf7f2
TL
10300 std::vector<PGRef> pgs;
10301 _get_pgs(&pgs);
10302 DynamicPerfStats dps;
10303 for (auto& pg : pgs) {
eafe8130
TL
10304 // m_perf_queries can be modified only in set_perf_queries by an mgr client
10305 // request, and it is protected by the mgr client's lock, which is held
10306 // when set_perf_queries/get_perf_reports are called, so we need not hold
10307 // m_perf_queries_lock here.
10308 DynamicPerfStats pg_dps(m_perf_queries);
10309 pg->lock();
10310 pg->get_dynamic_perf_stats(&pg_dps);
10311 pg->unlock();
10312 dps.merge(pg_dps);
11fdf7f2 10313 }
9f95a23c
TL
10314 dps.add_to_reports(m_perf_limits, &reports);
10315 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10316
10317 return payload;
11fdf7f2 10318}
224ce89b 10319
7c673cae
FG
10320// =============================================================
10321
10322#undef dout_context
11fdf7f2 10323#define dout_context cct
7c673cae 10324#undef dout_prefix
11fdf7f2 10325#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10326
11fdf7f2 10327void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 10328{
11fdf7f2
TL
10329 dout(10) << pg->pg_id << " " << pg << dendl;
10330 slot->pg = pg;
10331 pg->osd_shard = this;
10332 pg->pg_slot = slot;
10333 osd->inc_num_pgs();
10334
10335 slot->epoch = pg->get_osdmap_epoch();
10336 pg_slots_by_epoch.insert(*slot);
10337}
10338
10339void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10340{
10341 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10342 slot->pg->osd_shard = nullptr;
10343 slot->pg->pg_slot = nullptr;
10344 slot->pg = nullptr;
10345 osd->dec_num_pgs();
10346
10347 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10348 slot->epoch = 0;
10349 if (waiting_for_min_pg_epoch) {
10350 min_pg_epoch_cond.notify_all();
10351 }
10352}
10353
10354void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10355{
10356 std::lock_guard l(shard_lock);
10357 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10358 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10359 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10360 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10361 slot->epoch = e;
10362 pg_slots_by_epoch.insert(*slot);
10363 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10364 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10365 if (waiting_for_min_pg_epoch) {
10366 min_pg_epoch_cond.notify_all();
10367 }
10368}
10369
10370epoch_t OSDShard::get_min_pg_epoch()
10371{
10372 std::lock_guard l(shard_lock);
10373 auto p = pg_slots_by_epoch.begin();
10374 if (p == pg_slots_by_epoch.end()) {
10375 return 0;
10376 }
10377 return p->epoch;
10378}
10379
10380void OSDShard::wait_min_pg_epoch(epoch_t need)
10381{
10382 std::unique_lock l{shard_lock};
10383 ++waiting_for_min_pg_epoch;
10384 min_pg_epoch_cond.wait(l, [need, this] {
10385 if (pg_slots_by_epoch.empty()) {
10386 return true;
10387 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10388 return true;
10389 } else {
10390 dout(10) << need << " waiting on "
10391 << pg_slots_by_epoch.begin()->epoch << dendl;
10392 return false;
10393 }
10394 });
10395 --waiting_for_min_pg_epoch;
10396}
10397
10398epoch_t OSDShard::get_max_waiting_epoch()
10399{
10400 std::lock_guard l(shard_lock);
10401 epoch_t r = 0;
10402 for (auto& i : pg_slots) {
10403 if (!i.second->waiting_peering.empty()) {
10404 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10405 }
10406 }
10407 return r;
10408}
10409
10410void OSDShard::consume_map(
9f95a23c 10411 const OSDMapRef& new_osdmap,
11fdf7f2
TL
10412 unsigned *pushes_to_free)
10413{
10414 std::lock_guard l(shard_lock);
10415 OSDMapRef old_osdmap;
7c673cae 10416 {
11fdf7f2
TL
10417 std::lock_guard l(osdmap_lock);
10418 old_osdmap = std::move(shard_osdmap);
10419 shard_osdmap = new_osdmap;
10420 }
10421 dout(10) << new_osdmap->get_epoch()
10422 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10423 << dendl;
20effc67 10424 int queued = 0;
11fdf7f2
TL
10425
10426 // check slots
10427 auto p = pg_slots.begin();
10428 while (p != pg_slots.end()) {
10429 OSDShardPGSlot *slot = p->second.get();
10430 const spg_t& pgid = p->first;
10431 dout(20) << __func__ << " " << pgid << dendl;
10432 if (!slot->waiting_for_split.empty()) {
10433 dout(20) << __func__ << " " << pgid
10434 << " waiting for split " << slot->waiting_for_split << dendl;
10435 ++p;
10436 continue;
10437 }
10438 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10439 dout(20) << __func__ << " " << pgid
10440 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10441 << dendl;
10442 ++p;
10443 continue;
10444 }
10445 if (!slot->waiting_peering.empty()) {
10446 epoch_t first = slot->waiting_peering.begin()->first;
10447 if (first <= new_osdmap->get_epoch()) {
10448 dout(20) << __func__ << " " << pgid
10449 << " pending_peering first epoch " << first
10450 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
20effc67 10451 queued += _wake_pg_slot(pgid, slot);
11fdf7f2
TL
10452 }
10453 ++p;
10454 continue;
10455 }
10456 if (!slot->waiting.empty()) {
10457 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10458 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10459 << dendl;
10460 ++p;
10461 continue;
7c673cae 10462 }
11fdf7f2
TL
10463 while (!slot->waiting.empty() &&
10464 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10465 auto& qi = slot->waiting.front();
10466 dout(20) << __func__ << " " << pgid
10467 << " waiting item " << qi
10468 << " epoch " << qi.get_map_epoch()
10469 << " <= " << new_osdmap->get_epoch()
10470 << ", "
10471 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10472 "misdirected")
10473 << ", dropping" << dendl;
10474 *pushes_to_free += qi.get_reserved_pushes();
10475 slot->waiting.pop_front();
10476 }
10477 }
10478 if (slot->waiting.empty() &&
10479 slot->num_running == 0 &&
10480 slot->waiting_for_split.empty() &&
10481 !slot->pg) {
10482 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10483 p = pg_slots.erase(p);
10484 continue;
7c673cae 10485 }
11fdf7f2
TL
10486
10487 ++p;
7c673cae 10488 }
7c673cae 10489 if (queued) {
11fdf7f2 10490 std::lock_guard l{sdata_wait_lock};
20effc67
TL
10491 if (queued == 1)
10492 sdata_cond.notify_one();
10493 else
10494 sdata_cond.notify_all();
7c673cae
FG
10495 }
10496}
10497
20effc67 10498int OSDShard::_wake_pg_slot(
11fdf7f2
TL
10499 spg_t pgid,
10500 OSDShardPGSlot *slot)
10501{
20effc67 10502 int count = 0;
11fdf7f2
TL
10503 dout(20) << __func__ << " " << pgid
10504 << " to_process " << slot->to_process
10505 << " waiting " << slot->waiting
10506 << " waiting_peering " << slot->waiting_peering << dendl;
10507 for (auto i = slot->to_process.rbegin();
10508 i != slot->to_process.rend();
10509 ++i) {
9f95a23c 10510 scheduler->enqueue_front(std::move(*i));
20effc67 10511 count++;
11fdf7f2
TL
10512 }
10513 slot->to_process.clear();
10514 for (auto i = slot->waiting.rbegin();
10515 i != slot->waiting.rend();
10516 ++i) {
9f95a23c 10517 scheduler->enqueue_front(std::move(*i));
20effc67 10518 count++;
11fdf7f2
TL
10519 }
10520 slot->waiting.clear();
10521 for (auto i = slot->waiting_peering.rbegin();
10522 i != slot->waiting_peering.rend();
10523 ++i) {
10524 // this is overkill; we requeue everything, even if some of these
10525 // items are waiting for maps we don't have yet. FIXME, maybe,
10526 // someday, if we decide this inefficiency matters
10527 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
9f95a23c 10528 scheduler->enqueue_front(std::move(*j));
20effc67 10529 count++;
11fdf7f2
TL
10530 }
10531 }
10532 slot->waiting_peering.clear();
10533 ++slot->requeue_seq;
20effc67 10534 return count;
11fdf7f2
TL
10535}
10536
10537void OSDShard::identify_splits_and_merges(
10538 const OSDMapRef& as_of_osdmap,
10539 set<pair<spg_t,epoch_t>> *split_pgs,
10540 set<pair<spg_t,epoch_t>> *merge_pgs)
10541{
10542 std::lock_guard l(shard_lock);
10543 if (shard_osdmap) {
10544 for (auto& i : pg_slots) {
10545 const spg_t& pgid = i.first;
10546 auto *slot = i.second.get();
10547 if (slot->pg) {
10548 osd->service.identify_splits_and_merges(
10549 shard_osdmap, as_of_osdmap, pgid,
10550 split_pgs, merge_pgs);
10551 } else if (!slot->waiting_for_split.empty()) {
10552 osd->service.identify_splits_and_merges(
10553 shard_osdmap, as_of_osdmap, pgid,
10554 split_pgs, nullptr);
10555 } else {
10556 dout(20) << __func__ << " slot " << pgid
9f95a23c 10557 << " has no pg and waiting_for_split " << dendl;
7c673cae 10558 }
11fdf7f2
TL
10559 }
10560 }
10561}
10562
10563void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10564 set<pair<spg_t,epoch_t>> *pgids)
10565{
10566 std::lock_guard l(shard_lock);
10567 _prime_splits(pgids);
10568 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10569 set<pair<spg_t,epoch_t>> newer_children;
10570 for (auto i : *pgids) {
10571 osd->service.identify_splits_and_merges(
10572 as_of_osdmap, shard_osdmap, i.first,
10573 &newer_children, nullptr);
10574 }
10575 newer_children.insert(pgids->begin(), pgids->end());
10576 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10577 << shard_osdmap->get_epoch() << ", new children " << newer_children
10578 << dendl;
10579 _prime_splits(&newer_children);
10580 // note: we don't care what is left over here for other shards.
10581 // if this shard is ahead of us and one isn't, e.g., one thread is
10582 // calling into prime_splits via _process (due to a newly created
10583 // pg) and this shard has a newer map due to a racing consume_map,
10584 // then any grandchildren left here will be identified (or were
10585 // identified) when the slower shard's osdmap is advanced.
10586 // _prime_splits() will tolerate the case where the pgid is
10587 // already primed.
10588 }
10589}
10590
10591void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10592{
10593 dout(10) << *pgids << dendl;
10594 auto p = pgids->begin();
10595 while (p != pgids->end()) {
10596 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10597 if (shard_index == shard_id) {
10598 auto r = pg_slots.emplace(p->first, nullptr);
10599 if (r.second) {
10600 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10601 r.first->second = make_unique<OSDShardPGSlot>();
10602 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10603 } else {
11fdf7f2
TL
10604 auto q = r.first;
10605 ceph_assert(q != pg_slots.end());
10606 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10607 << dendl;
10608 q->second->waiting_for_split.insert(p->second);
7c673cae 10609 }
11fdf7f2
TL
10610 p = pgids->erase(p);
10611 } else {
10612 ++p;
7c673cae
FG
10613 }
10614 }
11fdf7f2
TL
10615}
10616
10617void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10618 set<pair<spg_t,epoch_t>> *merge_pgs)
10619{
10620 std::lock_guard l(shard_lock);
10621 dout(20) << __func__ << " checking shard " << shard_id
10622 << " for remaining merge pgs " << merge_pgs << dendl;
10623 auto p = merge_pgs->begin();
10624 while (p != merge_pgs->end()) {
10625 spg_t pgid = p->first;
10626 epoch_t epoch = p->second;
10627 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10628 if (shard_index != shard_id) {
10629 ++p;
10630 continue;
10631 }
10632 OSDShardPGSlot *slot;
10633 auto r = pg_slots.emplace(pgid, nullptr);
10634 if (r.second) {
10635 r.first->second = make_unique<OSDShardPGSlot>();
10636 }
10637 slot = r.first->second.get();
10638 if (slot->pg) {
10639 // already have pg
10640 dout(20) << __func__ << " have merge participant pg " << pgid
10641 << " " << slot->pg << dendl;
10642 } else if (!slot->waiting_for_split.empty() &&
10643 *slot->waiting_for_split.begin() < epoch) {
10644 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10645 << " " << slot->waiting_for_split << dendl;
10646 } else {
10647 dout(20) << __func__ << " creating empty merge participant " << pgid
10648 << " for merge in " << epoch << dendl;
10649 // leave history zeroed; PG::merge_from() will fill it in.
10650 pg_history_t history;
10651 PGCreateInfo cinfo(pgid, epoch - 1,
10652 history, PastIntervals(), false);
10653 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10654 _attach_pg(r.first->second.get(), pg.get());
10655 _wake_pg_slot(pgid, slot);
10656 pg->unlock();
10657 }
10658 // mark slot for merge
10659 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10660 slot->waiting_for_merge_epoch = epoch;
10661 p = merge_pgs->erase(p);
7c673cae
FG
10662 }
10663}
10664
11fdf7f2 10665void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10666{
20effc67 10667 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
11fdf7f2
TL
10668 epoch_t epoch;
10669 {
10670 std::lock_guard l(shard_lock);
20effc67 10671 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
11fdf7f2
TL
10672 auto p = pg_slots.find(pg->pg_id);
10673 ceph_assert(p != pg_slots.end());
10674 auto *slot = p->second.get();
20effc67
TL
10675 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10676 << slot->waiting_for_split << dendl;
11fdf7f2
TL
10677 ceph_assert(!slot->pg);
10678 ceph_assert(!slot->waiting_for_split.empty());
10679 _attach_pg(slot, pg);
10680
10681 epoch = pg->get_osdmap_epoch();
10682 ceph_assert(slot->waiting_for_split.count(epoch));
10683 slot->waiting_for_split.erase(epoch);
10684 if (slot->waiting_for_split.empty()) {
10685 _wake_pg_slot(pg->pg_id, slot);
10686 } else {
10687 dout(10) << __func__ << " still waiting for split on "
10688 << slot->waiting_for_split << dendl;
10689 }
7c673cae 10690 }
11fdf7f2
TL
10691
10692 // kick child to ensure it pulls up to the latest osdmap
10693 osd->enqueue_peering_evt(
10694 pg->pg_id,
10695 PGPeeringEventRef(
10696 std::make_shared<PGPeeringEvent>(
10697 epoch,
10698 epoch,
10699 NullEvt())));
10700
10701 std::lock_guard l{sdata_wait_lock};
10702 sdata_cond.notify_one();
7c673cae
FG
10703}
10704
11fdf7f2 10705void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10706{
11fdf7f2
TL
10707 std::lock_guard l(shard_lock);
10708 vector<spg_t> to_delete;
10709 for (auto& i : pg_slots) {
10710 if (i.first != parent &&
10711 i.first.get_ancestor(old_pg_num) == parent) {
10712 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10713 << dendl;
10714 _wake_pg_slot(i.first, i.second.get());
10715 to_delete.push_back(i.first);
10716 }
10717 }
10718 for (auto pgid : to_delete) {
10719 pg_slots.erase(pgid);
10720 }
10721}
10722
a4b75251
TL
10723void OSDShard::update_scheduler_config()
10724{
10725 std::lock_guard l(shard_lock);
10726 scheduler->update_configuration();
10727}
10728
20effc67
TL
10729std::string OSDShard::get_scheduler_type()
10730{
10731 std::ostringstream scheduler_type;
10732 scheduler_type << *scheduler;
10733 return scheduler_type.str();
10734}
10735
9f95a23c
TL
10736OSDShard::OSDShard(
10737 int id,
10738 CephContext *cct,
10739 OSD *osd)
10740 : shard_id(id),
10741 cct(cct),
10742 osd(osd),
10743 shard_name(string("OSDShard.") + stringify(id)),
10744 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10745 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10746 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10747 shard_lock_name(shard_name + "::shard_lock"),
10748 shard_lock{make_mutex(shard_lock_name)},
f67539c2 10749 scheduler(ceph::osd::scheduler::make_scheduler(
20effc67
TL
10750 cct, osd->num_shards, osd->store->is_rotational(),
10751 osd->store->get_type())),
9f95a23c
TL
10752 context_queue(sdata_wait_lock, sdata_cond)
10753{
10754 dout(0) << "using op scheduler " << *scheduler << dendl;
10755}
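// Sketch of the wiring (option values assumed, not verified here):
// make_scheduler() selects the queue implementation from the osd_op_queue
// option (e.g. "wpq" or "mclock_scheduler") together with the rotational
// hint and store type passed above; _enqueue() below checks the resulting
// type string ("mClockScheduler") to decide whether to tag QoS items.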
10756
11fdf7f2
TL
10757
10758// =============================================================
10759
10760#undef dout_context
10761#define dout_context osd->cct
10762#undef dout_prefix
10763#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10764
10765void OSD::ShardedOpWQ::_add_slot_waiter(
10766 spg_t pgid,
10767 OSDShardPGSlot *slot,
9f95a23c 10768 OpSchedulerItem&& qi)
11fdf7f2
TL
10769{
10770 if (qi.is_peering()) {
10771 dout(20) << __func__ << " " << pgid
10772 << " peering, item epoch is "
10773 << qi.get_map_epoch()
10774 << ", will wait on " << qi << dendl;
10775 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10776 } else {
10777 dout(20) << __func__ << " " << pgid
10778 << " item epoch is "
10779 << qi.get_map_epoch()
10780 << ", will wait on " << qi << dendl;
10781 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10782 }
10783}
10784
10785#undef dout_prefix
10786#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10787
10788void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10789{
11fdf7f2
TL
10790 uint32_t shard_index = thread_index % osd->num_shards;
10791 auto& sdata = osd->shards[shard_index];
10792 ceph_assert(sdata);
10793
10794 // If every thread of a shard handled oncommits, the callbacks could run
10795 // out of order. So, for each shard, we choose the thread with the smallest
10796 // thread_index (thread_index < num_shards) to run the oncommit
10797 // callbacks.
10798 bool is_smallest_thread_index = thread_index < osd->num_shards;
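// Illustration (numbers are hypothetical): with osd->num_shards = 5 and two
// worker threads per shard (thread_index 0..9), shard_index cycles 0..4 and
// only threads 0..4 satisfy thread_index < num_shards, so exactly one
// thread per shard drains context_queue and runs the oncommit callbacks.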
7c673cae
FG
10799
10800 // peek at spg_t
11fdf7f2 10801 sdata->shard_lock.lock();
9f95a23c 10802 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10803 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10804 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10805 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10806 // we raced with a context_queue addition, don't wait
10807 wait_lock.unlock();
10808 } else if (!sdata->stop_waiting) {
10809 dout(20) << __func__ << " empty q, waiting" << dendl;
10810 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10811 sdata->shard_lock.unlock();
10812 sdata->sdata_cond.wait(wait_lock);
10813 wait_lock.unlock();
10814 sdata->shard_lock.lock();
9f95a23c 10815 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10816 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10817 sdata->shard_lock.unlock();
10818 return;
10819 }
e306af50 10820 // found a work item; reapply default wq timeouts
11fdf7f2 10821 osd->cct->get_heartbeat_map()->reset_timeout(hb,
e306af50 10822 timeout_interval, suicide_interval);
11fdf7f2
TL
10823 } else {
10824 dout(20) << __func__ << " need return immediately" << dendl;
10825 wait_lock.unlock();
10826 sdata->shard_lock.unlock();
7c673cae
FG
10827 return;
10828 }
10829 }
11fdf7f2
TL
10830
10831 list<Context *> oncommits;
9f95a23c
TL
10832 if (is_smallest_thread_index) {
10833 sdata->context_queue.move_to(oncommits);
7c673cae 10834 }
11fdf7f2 10835
f67539c2
TL
10836 WorkItem work_item;
10837 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10838 if (sdata->scheduler->empty()) {
10839 if (osd->is_stopping()) {
10840 sdata->shard_lock.unlock();
10841 for (auto c : oncommits) {
10842 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10843 delete c;
10844 }
10845 return; // OSD shutdown, discard.
10846 }
10847 sdata->shard_lock.unlock();
10848 handle_oncommits(oncommits);
10849 return;
10850 }
10851
10852 work_item = sdata->scheduler->dequeue();
11fdf7f2
TL
10853 if (osd->is_stopping()) {
10854 sdata->shard_lock.unlock();
10855 for (auto c : oncommits) {
f67539c2
TL
10856 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10857 delete c;
11fdf7f2
TL
10858 }
10859 return; // OSD shutdown, discard.
7c673cae 10860 }
7c673cae 10861
f67539c2
TL
10862 // If the work item is scheduled in the future, wait until
10863 // the time returned in the dequeue response before retrying.
10864 if (auto when_ready = std::get_if<double>(&work_item)) {
10865 if (is_smallest_thread_index) {
10866 sdata->shard_lock.unlock();
10867 handle_oncommits(oncommits);
10868 return;
10869 }
10870 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10871 auto future_time = ceph::real_clock::from_double(*when_ready);
10872 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
a4b75251
TL
10873 // Disable heartbeat timeout until we find a non-future work item to process.
10874 osd->cct->get_heartbeat_map()->clear_timeout(hb);
f67539c2
TL
10875 sdata->shard_lock.unlock();
10876 ++sdata->waiting_threads;
10877 sdata->sdata_cond.wait_until(wait_lock, future_time);
10878 --sdata->waiting_threads;
10879 wait_lock.unlock();
10880 sdata->shard_lock.lock();
a4b75251
TL
10881 // Reapply default wq timeouts
10882 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10883 timeout_interval, suicide_interval);
f67539c2
TL
10884 }
10885 } // while
10886
10887 // Access the stored item
10888 auto item = std::move(std::get<OpSchedulerItem>(work_item));
11fdf7f2
TL
10889 if (osd->is_stopping()) {
10890 sdata->shard_lock.unlock();
10891 for (auto c : oncommits) {
10892 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10893 delete c;
10894 }
10895 return; // OSD shutdown, discard.
10896 }
7c673cae 10897
11fdf7f2
TL
10898 const auto token = item.get_ordering_token();
10899 auto r = sdata->pg_slots.emplace(token, nullptr);
10900 if (r.second) {
10901 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10902 }
11fdf7f2
TL
10903 OSDShardPGSlot *slot = r.first->second.get();
10904 dout(20) << __func__ << " " << token
10905 << (r.second ? " (new)" : "")
10906 << " to_process " << slot->to_process
10907 << " waiting " << slot->waiting
10908 << " waiting_peering " << slot->waiting_peering
10909 << dendl;
10910 slot->to_process.push_back(std::move(item));
10911 dout(20) << __func__ << " " << slot->to_process.back()
10912 << " queued" << dendl;
7c673cae 10913
11fdf7f2
TL
10914 retry_pg:
10915 PGRef pg = slot->pg;
7c673cae 10916
11fdf7f2
TL
10917 // lock pg (if we have it)
10918 if (pg) {
10919 // note the requeue seq now...
10920 uint64_t requeue_seq = slot->requeue_seq;
10921 ++slot->num_running;
7c673cae 10922
11fdf7f2
TL
10923 sdata->shard_lock.unlock();
10924 osd->service.maybe_inject_dispatch_delay();
10925 pg->lock();
10926 osd->service.maybe_inject_dispatch_delay();
10927 sdata->shard_lock.lock();
7c673cae 10928
11fdf7f2
TL
10929 auto q = sdata->pg_slots.find(token);
10930 if (q == sdata->pg_slots.end()) {
10931 // this can happen if we race with pg removal.
10932 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10933 pg->unlock();
10934 sdata->shard_lock.unlock();
10935 handle_oncommits(oncommits);
10936 return;
10937 }
10938 slot = q->second.get();
10939 --slot->num_running;
7c673cae 10940
11fdf7f2
TL
10941 if (slot->to_process.empty()) {
10942 // raced with _wake_pg_slot or consume_map
10943 dout(20) << __func__ << " " << token
10944 << " nothing queued" << dendl;
7c673cae 10945 pg->unlock();
11fdf7f2
TL
10946 sdata->shard_lock.unlock();
10947 handle_oncommits(oncommits);
10948 return;
7c673cae 10949 }
11fdf7f2
TL
10950 if (requeue_seq != slot->requeue_seq) {
10951 dout(20) << __func__ << " " << token
10952 << " requeue_seq " << slot->requeue_seq << " > our "
10953 << requeue_seq << ", we raced with _wake_pg_slot"
10954 << dendl;
7c673cae 10955 pg->unlock();
11fdf7f2
TL
10956 sdata->shard_lock.unlock();
10957 handle_oncommits(oncommits);
10958 return;
7c673cae 10959 }
11fdf7f2
TL
10960 if (slot->pg != pg) {
10961 // this can happen if we race with pg removal.
10962 dout(20) << __func__ << " slot " << token << " no longer attached to "
10963 << pg << dendl;
7c673cae 10964 pg->unlock();
11fdf7f2 10965 goto retry_pg;
7c673cae 10966 }
7c673cae
FG
10967 }
10968
11fdf7f2
TL
10969 dout(20) << __func__ << " " << token
10970 << " to_process " << slot->to_process
10971 << " waiting " << slot->waiting
10972 << " waiting_peering " << slot->waiting_peering << dendl;
10973
10974 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10975 suicide_interval);
10976
7c673cae 10977 // take next item
11fdf7f2
TL
10978 auto qi = std::move(slot->to_process.front());
10979 slot->to_process.pop_front();
10980 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10981 set<pair<spg_t,epoch_t>> new_children;
10982 OSDMapRef osdmap;
7c673cae 10983
11fdf7f2 10984 while (!pg) {
7c673cae 10985 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10986 osdmap = sdata->shard_osdmap;
10987 const PGCreateInfo *create_info = qi.creates_pg();
10988 if (!slot->waiting_for_split.empty()) {
10989 dout(20) << __func__ << " " << token
10990 << " splitting " << slot->waiting_for_split << dendl;
10991 _add_slot_waiter(token, slot, std::move(qi));
10992 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10993 dout(20) << __func__ << " " << token
10994 << " map " << qi.get_map_epoch() << " > "
10995 << osdmap->get_epoch() << dendl;
10996 _add_slot_waiter(token, slot, std::move(qi));
10997 } else if (qi.is_peering()) {
10998 if (!qi.peering_requires_pg()) {
10999 // for pg-less events, we run them under the ordering lock, since
11000 // we don't have the pg lock to keep them ordered.
11001 qi.run(osd, sdata, pg, tp_handle);
11002 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11003 if (create_info) {
11004 if (create_info->by_mon &&
11005 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
11006 dout(20) << __func__ << " " << token
11007 << " no pg, no longer primary, ignoring mon create on "
11008 << qi << dendl;
11009 } else {
11010 dout(20) << __func__ << " " << token
11011 << " no pg, should create on " << qi << dendl;
11012 pg = osd->handle_pg_create_info(osdmap, create_info);
11013 if (pg) {
11014 // we created the pg! drop out and continue "normally"!
11015 sdata->_attach_pg(slot, pg.get());
11016 sdata->_wake_pg_slot(token, slot);
11017
11018 // identify split children between create epoch and shard epoch.
11019 osd->service.identify_splits_and_merges(
11020 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
11021 sdata->_prime_splits(&new_children);
11022 // distribute remaining split children to other shards below!
11023 break;
11024 }
11025 dout(20) << __func__ << " ignored create on " << qi << dendl;
11026 }
11027 } else {
11028 dout(20) << __func__ << " " << token
11029 << " no pg, peering, !create, discarding " << qi << dendl;
11030 }
11031 } else {
11032 dout(20) << __func__ << " " << token
11033 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11034 << ", discarding " << qi
11035 << dendl;
11036 }
11037 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11038 dout(20) << __func__ << " " << token
11039 << " no pg, should exist e" << osdmap->get_epoch()
11040 << ", will wait on " << qi << dendl;
11041 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 11042 } else {
11fdf7f2
TL
11043 dout(20) << __func__ << " " << token
11044 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11045 << ", dropping " << qi << dendl;
7c673cae 11046 // share map with client?
9f95a23c
TL
11047 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11048 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11049 sdata->shard_osdmap,
11050 (*_op)->sent_epoch);
7c673cae 11051 }
11fdf7f2 11052 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 11053 if (pushes_to_free > 0) {
11fdf7f2 11054 sdata->shard_lock.unlock();
7c673cae 11055 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 11056 handle_oncommits(oncommits);
7c673cae
FG
11057 return;
11058 }
11059 }
11fdf7f2
TL
11060 sdata->shard_lock.unlock();
11061 handle_oncommits(oncommits);
7c673cae
FG
11062 return;
11063 }
11fdf7f2
TL
11064 if (qi.is_peering()) {
11065 OSDMapRef osdmap = sdata->shard_osdmap;
11066 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11067 _add_slot_waiter(token, slot, std::move(qi));
11068 sdata->shard_lock.unlock();
11069 pg->unlock();
11070 handle_oncommits(oncommits);
11071 return;
11072 }
11073 }
11074 sdata->shard_lock.unlock();
7c673cae 11075
11fdf7f2
TL
11076 if (!new_children.empty()) {
11077 for (auto shard : osd->shards) {
11078 shard->prime_splits(osdmap, &new_children);
11079 }
11080 ceph_assert(new_children.empty());
11081 }
7c673cae
FG
11082
11083 // osd_opwq_process marks the point at which an operation has been dequeued
11084 // and will begin to be handled by a worker thread.
11085 {
11086#ifdef WITH_LTTNG
11087 osd_reqid_t reqid;
9f95a23c 11088 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11089 reqid = (*_op)->get_reqid();
11090 }
11091#endif
11092 tracepoint(osd, opwq_process_start, reqid.name._type,
11093 reqid.name._num, reqid.tid, reqid.inc);
11094 }
11095
11096 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11097 Formatter *f = Formatter::create("json");
11098 f->open_object_section("q");
11099 dump(f);
11100 f->close_section();
11101 f->flush(*_dout);
11102 delete f;
11103 *_dout << dendl;
11104
11fdf7f2 11105 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
11106
11107 {
11108#ifdef WITH_LTTNG
11109 osd_reqid_t reqid;
9f95a23c 11110 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11111 reqid = (*_op)->get_reqid();
11112 }
11113#endif
11114 tracepoint(osd, opwq_process_finish, reqid.name._type,
11115 reqid.name._num, reqid.tid, reqid.inc);
11116 }
11117
11fdf7f2 11118 handle_oncommits(oncommits);
7c673cae
FG
11119}
11120
9f95a23c 11121void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
1d09f67e
TL
11122 if (unlikely(m_fast_shutdown)) {
11123 // stop enqueueing when we are in the middle of a fast shutdown
11124 return;
11125 }
11126
7c673cae 11127 uint32_t shard_index =
11fdf7f2 11128 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 11129
11fdf7f2 11130 OSDShard* sdata = osd->shards[shard_index];
7c673cae 11131 assert (NULL != sdata);
20effc67
TL
11132 if (sdata->get_scheduler_type() == "mClockScheduler") {
11133 item.maybe_set_is_qos_item();
11134 }
11135
11136 dout(20) << __func__ << " " << item << dendl;
7c673cae 11137
9f95a23c
TL
11138 bool empty = true;
11139 {
11140 std::lock_guard l{sdata->shard_lock};
11141 empty = sdata->scheduler->empty();
11142 sdata->scheduler->enqueue(std::move(item));
11143 }
7c673cae 11144
f67539c2 11145 {
9f95a23c 11146 std::lock_guard l{sdata->sdata_wait_lock};
f67539c2
TL
11147 if (empty) {
11148 sdata->sdata_cond.notify_all();
11149 } else if (sdata->waiting_threads) {
11150 sdata->sdata_cond.notify_one();
11151 }
9f95a23c 11152 }
7c673cae
FG
11153}
11154
9f95a23c 11155void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
7c673cae 11156{
1d09f67e
TL
11157 if (unlikely(m_fast_shutdown)) {
11158 // stop enqueueing when we are in the middle of a fast shutdown
11159 return;
11160 }
11161
11fdf7f2
TL
11162 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11163 auto& sdata = osd->shards[shard_index];
11164 ceph_assert(sdata);
11165 sdata->shard_lock.lock();
11166 auto p = sdata->pg_slots.find(item.get_ordering_token());
11167 if (p != sdata->pg_slots.end() &&
11168 !p->second->to_process.empty()) {
7c673cae 11169 // we may be racing with _process, which has dequeued a new item
9f95a23c 11170 // from scheduler, put it on to_process, and is now busy taking the
7c673cae
FG
11171 // pg lock. ensure this old requeued item is ordered before any
11172 // such newer item in to_process.
11fdf7f2
TL
11173 p->second->to_process.push_front(std::move(item));
11174 item = std::move(p->second->to_process.back());
11175 p->second->to_process.pop_back();
11176 dout(20) << __func__
11177 << " " << p->second->to_process.front()
11178 << " shuffled w/ " << item << dendl;
7c673cae 11179 } else {
11fdf7f2 11180 dout(20) << __func__ << " " << item << dendl;
7c673cae 11181 }
9f95a23c 11182 sdata->scheduler->enqueue_front(std::move(item));
11fdf7f2
TL
11183 sdata->shard_lock.unlock();
11184 std::lock_guard l{sdata->sdata_wait_lock};
11185 sdata->sdata_cond.notify_one();
7c673cae
FG
11186}
11187
1d09f67e
TL
11188void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11189{
11190 uint32_t shard_index = 0;
11191 m_fast_shutdown = true;
11192
11193 for (; shard_index < osd->num_shards; shard_index++) {
11194 auto& sdata = osd->shards[shard_index];
11195 ceph_assert(sdata);
11196 sdata->shard_lock.lock();
11197 int work_count = 0;
11198 while (!sdata->scheduler->empty()) {
11199 auto work_item = sdata->scheduler->dequeue();
11200 work_count++;
11201 }
11202 sdata->shard_lock.unlock();
11203 }
11204}
11205
f67539c2 11206namespace ceph::osd_cmds {
7c673cae 11207
11fdf7f2
TL
11208int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11209 std::ostream& os)
7c673cae
FG
11210{
11211 if (!ceph_using_tcmalloc()) {
11212 os << "could not issue heap profiler command -- not using tcmalloc!";
11213 return -EOPNOTSUPP;
11214 }
f67539c2 11215
7c673cae 11216 string cmd;
9f95a23c 11217 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
7c673cae
FG
11218 os << "unable to get value for command \"" << cmd << "\"";
11219 return -EINVAL;
11fdf7f2 11220 }
f67539c2 11221
7c673cae
FG
11222 std::vector<std::string> cmd_vec;
11223 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11224
11225 string val;
9f95a23c 11226 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
11227 cmd_vec.push_back(val);
11228 }
f67539c2 11229
7c673cae 11230 ceph_heap_profiler_handle_command(cmd_vec, os);
f67539c2 11231
7c673cae
FG
11232 return 0;
11233}
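// This handler backs the OSD "heap" command; a usage sketch (assuming an
// osd.0 built against tcmalloc; commands are not part of this source):
//
//   ceph tell osd.0 heap stats
//   ceph tell osd.0 heap start_profiler
//   ceph tell osd.0 heap stop_profiler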
f67539c2
TL
11234
11235} // namespace ceph::osd_cmds