]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
11fdf7f2 15
7c673cae 16#include "acconfig.h"
11fdf7f2
TL
17
18#include <cctype>
7c673cae
FG
19#include <fstream>
20#include <iostream>
11fdf7f2
TL
21#include <iterator>
22
23#include <unistd.h>
7c673cae
FG
24#include <sys/stat.h>
25#include <signal.h>
eafe8130 26#include <time.h>
eafe8130 27#include <boost/range/adaptor/reversed.hpp>
7c673cae
FG
28
29#ifdef HAVE_SYS_PARAM_H
30#include <sys/param.h>
31#endif
32
33#ifdef HAVE_SYS_MOUNT_H
34#include <sys/mount.h>
35#endif
36
37#include "osd/PG.h"
20effc67
TL
38#include "osd/scrubber/scrub_machine.h"
39#include "osd/scrubber/pg_scrubber.h"
7c673cae
FG
40
41#include "include/types.h"
42#include "include/compat.h"
11fdf7f2 43#include "include/random.h"
20effc67 44#include "include/scope_guard.h"
7c673cae
FG
45
46#include "OSD.h"
47#include "OSDMap.h"
48#include "Watch.h"
49#include "osdc/Objecter.h"
50
51#include "common/errno.h"
52#include "common/ceph_argparse.h"
9f95a23c 53#include "common/ceph_releases.h"
224ce89b 54#include "common/ceph_time.h"
7c673cae 55#include "common/version.h"
f67539c2 56#include "common/async/blocked_completion.h"
b5b8bbf5 57#include "common/pick_address.h"
11fdf7f2
TL
58#include "common/blkdev.h"
59#include "common/numa.h"
7c673cae
FG
60
61#include "os/ObjectStore.h"
62#ifdef HAVE_LIBFUSE
63#include "os/FuseStore.h"
64#endif
65
66#include "PrimaryLogPG.h"
67
7c673cae
FG
68#include "msg/Messenger.h"
69#include "msg/Message.h"
70
71#include "mon/MonClient.h"
72
73#include "messages/MLog.h"
74
75#include "messages/MGenericMessage.h"
7c673cae
FG
76#include "messages/MOSDPing.h"
77#include "messages/MOSDFailure.h"
78#include "messages/MOSDMarkMeDown.h"
9f95a23c 79#include "messages/MOSDMarkMeDead.h"
7c673cae
FG
80#include "messages/MOSDFull.h"
81#include "messages/MOSDOp.h"
82#include "messages/MOSDOpReply.h"
83#include "messages/MOSDBackoff.h"
84#include "messages/MOSDBeacon.h"
85#include "messages/MOSDRepOp.h"
86#include "messages/MOSDRepOpReply.h"
87#include "messages/MOSDBoot.h"
88#include "messages/MOSDPGTemp.h"
11fdf7f2 89#include "messages/MOSDPGReadyToMerge.h"
7c673cae
FG
90
91#include "messages/MOSDMap.h"
92#include "messages/MMonGetOSDMap.h"
93#include "messages/MOSDPGNotify.h"
9f95a23c 94#include "messages/MOSDPGNotify2.h"
9f95a23c 95#include "messages/MOSDPGQuery2.h"
7c673cae
FG
96#include "messages/MOSDPGLog.h"
97#include "messages/MOSDPGRemove.h"
98#include "messages/MOSDPGInfo.h"
9f95a23c 99#include "messages/MOSDPGInfo2.h"
11fdf7f2 100#include "messages/MOSDPGCreate2.h"
7c673cae
FG
101#include "messages/MBackfillReserve.h"
102#include "messages/MRecoveryReserve.h"
c07f9fc5 103#include "messages/MOSDForceRecovery.h"
7c673cae
FG
104#include "messages/MOSDECSubOpWrite.h"
105#include "messages/MOSDECSubOpWriteReply.h"
106#include "messages/MOSDECSubOpRead.h"
107#include "messages/MOSDECSubOpReadReply.h"
108#include "messages/MOSDPGCreated.h"
109#include "messages/MOSDPGUpdateLogMissing.h"
110#include "messages/MOSDPGUpdateLogMissingReply.h"
111
11fdf7f2
TL
112#include "messages/MOSDPeeringOp.h"
113
7c673cae
FG
114#include "messages/MOSDAlive.h"
115
11fdf7f2 116#include "messages/MOSDScrub2.h"
7c673cae 117
7c673cae
FG
118#include "messages/MCommand.h"
119#include "messages/MCommandReply.h"
120
121#include "messages/MPGStats.h"
7c673cae 122
9f95a23c
TL
123#include "messages/MMonGetPurgedSnaps.h"
124#include "messages/MMonGetPurgedSnapsReply.h"
125
7c673cae
FG
126#include "common/perf_counters.h"
127#include "common/Timer.h"
128#include "common/LogClient.h"
129#include "common/AsyncReserver.h"
130#include "common/HeartbeatMap.h"
131#include "common/admin_socket.h"
132#include "common/ceph_context.h"
133
134#include "global/signal_handler.h"
135#include "global/pidfile.h"
136
137#include "include/color.h"
138#include "perfglue/cpu_profiler.h"
139#include "perfglue/heap_profiler.h"
140
f67539c2 141#include "osd/ClassHandler.h"
7c673cae
FG
142#include "osd/OpRequest.h"
143
144#include "auth/AuthAuthorizeHandler.h"
145#include "auth/RotatingKeyRing.h"
7c673cae
FG
146
147#include "objclass/objclass.h"
148
149#include "common/cmdparse.h"
150#include "include/str_list.h"
151#include "include/util.h"
152
11fdf7f2 153#include "include/ceph_assert.h"
7c673cae
FG
154#include "common/config.h"
155#include "common/EventTrace.h"
156
11fdf7f2
TL
157#include "json_spirit/json_spirit_reader.h"
158#include "json_spirit/json_spirit_writer.h"
159
7c673cae
FG
160#ifdef WITH_LTTNG
161#define TRACEPOINT_DEFINE
162#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
163#include "tracing/osd.h"
164#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
165#undef TRACEPOINT_DEFINE
166#else
167#define tracepoint(...)
168#endif
20effc67
TL
169
170#include "osd_tracer.h"
171
7c673cae
FG
172
173#define dout_context cct
174#define dout_subsys ceph_subsys_osd
175#undef dout_prefix
176#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
177
f67539c2
TL
178using std::deque;
179using std::list;
180using std::lock_guard;
181using std::make_pair;
182using std::make_tuple;
183using std::make_unique;
184using std::map;
185using std::ostream;
186using std::ostringstream;
187using std::pair;
188using std::set;
189using std::string;
190using std::stringstream;
191using std::to_string;
192using std::unique_ptr;
193using std::vector;
194
195using ceph::bufferlist;
196using ceph::bufferptr;
197using ceph::decode;
198using ceph::encode;
199using ceph::fixed_u_to_string;
200using ceph::Formatter;
201using ceph::heartbeat_handle_d;
202using ceph::make_mutex;
203
9f95a23c
TL
204using namespace ceph::osd::scheduler;
205using TOPNSPC::common::cmd_getval;
20effc67 206using TOPNSPC::common::cmd_getval_or;
224ce89b 207
7c673cae
FG
208static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
209 return *_dout << "osd." << whoami << " " << epoch << " ";
210}
211
20effc67 212
7c673cae
FG
213//Initial features in new superblock.
214//Features here are also automatically upgraded
215CompatSet OSD::get_osd_initial_compat_set() {
216 CompatSet::FeatureSet ceph_osd_feature_compat;
217 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
218 CompatSet::FeatureSet ceph_osd_feature_incompat;
219 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
220 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
221 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
222 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
223 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
224 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
225 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
226 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
227 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
228 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
229 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
230 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
231 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
232 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 233 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
9f95a23c 234 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
7c673cae
FG
235 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
236 ceph_osd_feature_incompat);
237}
238
239//Features are added here that this OSD supports.
240CompatSet OSD::get_osd_compat_set() {
241 CompatSet compat = get_osd_initial_compat_set();
242 //Any features here can be set in code, but not in initial superblock
243 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
244 return compat;
245}
246
// OSDService: per-OSD helper object; all members are initialized from the
// owning OSD instance and its configuration.  Initialization order follows
// member declaration order in OSDService.
OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // config-tracked values; updated when the named option changes
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  m_scrub_queue{cct, *this},
  // cache-tiering agent state
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
			  osd->objecter_messenger,
			  osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // backfill/recovery reservers share the same finisher and limits
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // one finisher thread per configured objecter finisher shard
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
308
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: count outstanding refs per pgid and
// remember the live PG instance so leaked refs can be dumped.
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard guard(pgid_lock);
  // first reference for this pgid: remember the PG instance
  auto inserted = pgid_tracker.emplace(pgid, 0);
  if (inserted.second) {
    live_pgs[pgid] = pg;
  }
  ++inserted.first->second;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard guard(pgid_lock);
  auto it = pgid_tracker.find(pgid);
  ceph_assert(it != pgid_tracker.end());
  ceph_assert(it->second > 0);
  // drop one reference; forget the pgid entirely when none remain
  if (--it->second == 0) {
    pgid_tracker.erase(it);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard guard(pgid_lock);
  derr << "live pgids:" << dendl;
  for (const auto& tracked : pgid_tracker) {
    derr << "\t" << tracked << dendl;
    live_pgs[tracked.first]->dump_live_ids();
  }
}
#endif
340
341
// Monotonic "now" relative to OSD startup; used for lease/heartbeat
// timestamps that must not jump with wall-clock changes.
ceph::signedspan OSDService::get_mnow() const
{
  return ceph::mono_clock::now() - osd->startup_time;
}
7c673cae 346
11fdf7f2
TL
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map and report every PG split child (and, if merge_pgs is non-null,
// every merge source/target) that this pgid participates in, tagged with
// the epoch at which the pg_num change took effect.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch() << dendl;
  if (!old_map->have_pg_pool(pgid.pool())) {
    dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
	     << " does not exist in old map" << dendl;
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  // pg_num_history records (epoch -> pg_num) changes per pool
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    dout(20) << __func__ << " " << pgid << " pool " << pgid.pool()
	     << " has no history" << dendl;
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // BFS over PGs discovered along the way (children of splits, merge
  // parents, ...); 'did' guards against re-queueing a PG already visited.
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // replay every pg_num change in (old_epoch, new_epoch] for this pool
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in the merge; record the target parent and
	    // every sibling source of that parent
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge: it is the target that absorbs sources
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
459
7c673cae
FG
// Forward the request to the owning OSD, which maintains the heartbeat
// peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
464
9f95a23c
TL
465HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
466{
467 std::lock_guard l(hb_stamp_lock);
468 if (peer >= hb_stamps.size()) {
469 hb_stamps.resize(peer + 1);
470 }
471 if (!hb_stamps[peer]) {
472 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
473 }
474 return hb_stamps[peer];
475}
476
// Enqueue a RenewLease peering event for the given PG at the given epoch.
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch, epoch,
	RenewLease())));
}
486
7c673cae
FG
487void OSDService::start_shutdown()
488{
489 {
11fdf7f2 490 std::lock_guard l(agent_timer_lock);
7c673cae
FG
491 agent_timer.shutdown();
492 }
31f18b77
FG
493
494 {
11fdf7f2
TL
495 std::lock_guard l(sleep_lock);
496 sleep_timer.shutdown();
31f18b77 497 }
81eedcae
TL
498
499 {
500 std::lock_guard l(recovery_request_lock);
501 recovery_request_timer.shutdown();
502 }
7c673cae
FG
503}
504
// Drain pending reserver callbacks before stopping the finisher thread.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
510
// Final OSDService teardown: stop timers, the objecter and its finishers,
// then drop the published osdmap references.  Order matters: the objecter
// is shut down before its finishers are drained.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // release our references to the current and next maps
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
529
// Start the service threads and timers used during normal operation.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured grace period after boot
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
550
// Late initialization: start the objecter once the initial osdmap is known.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
555
// Called when a new osdmap becomes active.
void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  // agent runs only when the map allows it and the OSD itself is active
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}
565
1e59de90
TL
566OSDMapRef OSDService::get_nextmap_reserved() {
567 std::lock_guard l(pre_publish_lock);
568
569 epoch_t e = next_osdmap->get_epoch();
570
571 std::map<epoch_t, unsigned>::iterator i =
572 map_reservations.insert(std::make_pair(e, 0)).first;
573 i->second++;
574 dout(20) << __func__ << " map_reservations: " << map_reservations << dendl;
575 return next_osdmap;
576}
577
578/// releases reservation on map
579void OSDService::release_map(OSDMapRef osdmap) {
580 std::lock_guard l(pre_publish_lock);
581 dout(20) << __func__ << " epoch: " << osdmap->get_epoch() << dendl;
582 std::map<epoch_t, unsigned>::iterator i =
583 map_reservations.find(osdmap->get_epoch());
584 ceph_assert(i != map_reservations.end());
585 ceph_assert(i->second > 0);
586 if (--(i->second) == 0) {
587 map_reservations.erase(i);
588 }
589 if (pre_publish_waiter) {
590 dout(20) << __func__ << " notify all." << dendl;
591 pre_publish_cond.notify_all();
592 }
593}
594
595/// blocks until there are no reserved maps prior to next_osdmap
596void OSDService::await_reserved_maps() {
597 std::unique_lock l{pre_publish_lock};
598 dout(20) << __func__ << " epoch:" << next_osdmap->get_epoch() << dendl;
599
600 ceph_assert(next_osdmap);
601 pre_publish_waiter++;
602 pre_publish_cond.wait(l, [this] {
603 auto i = map_reservations.cbegin();
604 return (i == map_reservations.cend() ||
605 i->first >= next_osdmap->get_epoch());
606 });
607 pre_publish_waiter--;
608 dout(20) << __func__ << " done " << pre_publish_waiter << dendl;
609}
610
181888fb
FG
// Ask the monitor for osdmaps starting at epoch e (non-continuous sub).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
615
9f95a23c 616
7c673cae
FG
617class AgentTimeoutCB : public Context {
618 PGRef pg;
619public:
620 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
621 void finish(int) override {
622 pg->agent_choose_mode_restart();
623 }
624};
625
// Cache-tiering agent thread body.  Repeatedly picks the highest-priority
// tier in agent_queue and runs agent_work() on one of its PGs, dropping
// agent_lock around the (potentially slow) per-PG work.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // agent_queue is ordered by priority level; work on the highest
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; use the low-rate limit unless some PG is in
    // high-speed flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // round-robin across the PGs of the top tier; the iterator is
    // invalidated whenever the queue is modified elsewhere
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while doing per-PG work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	       << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	       << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
681
// Stop the tiering agent thread and join it.  Callers must have already
// cancelled all agent ops and dequeued all PGs.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}
701
702// -------------------------------------
703
// Periodically recompute promote_probability_millis (probability, in
// 1/1000ths, of promoting an object into the cache tier) so that the
// observed promote rate tracks the configured object/s and byte/s targets.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // counters since the last recalibration (attenuated moving sample)
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: probability (in millis) that would hit the object/byte target
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed probability, clamped to
  // [min_prob, 1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
774
775// -------------------------------------
776
777float OSDService::get_failsafe_full_ratio()
778{
779 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
780 if (full_ratio > 1.0) full_ratio /= 100.0;
781 return full_ratio;
782}
783
// Classify the given usage ratios into NONE/NEARFULL/BACKFILLFULL/FULL/
// FAILSAFE.  'ratio' is the (possibly adjusted) usage, 'pratio' the raw
// physical usage; 'inject' is set when an injected state is returned.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // enforce nearfull <= backfillfull <= full <= failsafe ordering
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // injected state wins; otherwise check thresholds from most to least
  // severe.  FAILSAFE and NEARFULL use the physical ratio; FULL and
  // BACKFILLFULL use the adjusted ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
830
// Record the latest usage ratios, recompute the fullness state, and log a
// clog error whenever the FAILSAFE state is entered or left.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
862
863bool OSDService::need_fullness_update()
864{
865 OSDMapRef osdmap = get_osdmap();
866 s_names cur = NONE;
867 if (osdmap->exists(whoami)) {
868 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
869 cur = FULL;
870 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
871 cur = BACKFILLFULL;
872 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
873 cur = NEARFULL;
874 }
875 }
876 s_names want = NONE;
877 if (is_full())
878 want = FULL;
879 else if (is_backfillfull())
880 want = BACKFILLFULL;
881 else if (is_nearfull())
882 want = NEARFULL;
883 return want != cur;
884}
885
// Test hook: report an injected fullness state of at least 'type'.
// Caller must hold full_status_lock (mutates the injectfull countdown).
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
900
// True when the current fullness state is at least 'type' (or injected).
bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
		       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}
914
11fdf7f2
TL
// Like _check_full(), but evaluates a hypothetical state: would adding
// adjust_used bytes (on top of adjusted_stat) push us to at least 'type'?
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // lock only for the injection check; recalc below takes no lock
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
936
937bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
938{
939 return _check_full(dpp, FAILSAFE);
940}
941
942bool OSDService::check_full(DoutPrefixProvider *dpp) const
7c673cae 943{
11fdf7f2 944 return _check_full(dpp, FULL);
7c673cae
FG
945}
946
11fdf7f2 947bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
7c673cae 948{
11fdf7f2 949 return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
7c673cae
FG
950}
951
11fdf7f2 952bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
7c673cae 953{
11fdf7f2 954 return _check_full(dpp, BACKFILLFULL);
7c673cae
FG
955}
956
11fdf7f2 957bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
7c673cae 958{
11fdf7f2 959 return _check_full(dpp, NEARFULL);
7c673cae
FG
960}
961
962bool OSDService::is_failsafe_full() const
963{
11fdf7f2 964 std::lock_guard l(full_status_lock);
7c673cae
FG
965 return cur_state == FAILSAFE;
966}
967
968bool OSDService::is_full() const
969{
11fdf7f2 970 std::lock_guard l(full_status_lock);
7c673cae
FG
971 return cur_state >= FULL;
972}
973
974bool OSDService::is_backfillfull() const
975{
11fdf7f2 976 std::lock_guard l(full_status_lock);
7c673cae
FG
977 return cur_state >= BACKFILLFULL;
978}
979
980bool OSDService::is_nearfull() const
981{
11fdf7f2 982 std::lock_guard l(full_status_lock);
7c673cae
FG
983 return cur_state >= NEARFULL;
984}
985
986void OSDService::set_injectfull(s_names type, int64_t count)
987{
11fdf7f2 988 std::lock_guard l(full_status_lock);
7c673cae
FG
989 injectfull_state = type;
990 injectfull = count;
991}
992
11fdf7f2
TL
993void OSDService::set_statfs(const struct store_statfs_t &stbuf,
994 osd_alert_list_t& alerts)
7c673cae 995{
224ce89b 996 uint64_t bytes = stbuf.total;
224ce89b 997 uint64_t avail = stbuf.available;
11fdf7f2
TL
998 uint64_t used = stbuf.get_used_raw();
999
1000 // For testing fake statfs values so it doesn't matter if all
1001 // OSDs are using the same partition.
1002 if (cct->_conf->fake_statfs_for_testing) {
1003 uint64_t total_num_bytes = 0;
1004 vector<PGRef> pgs;
1005 osd->_get_pgs(&pgs);
1006 for (auto p : pgs) {
1007 total_num_bytes += p->get_stats_num_bytes();
1008 }
1009 bytes = cct->_conf->fake_statfs_for_testing;
1010 if (total_num_bytes < bytes)
1011 avail = bytes - total_num_bytes;
1012 else
1013 avail = 0;
1014 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
1015 << " adjust available " << avail
1016 << dendl;
1017 used = bytes - avail;
1018 }
7c673cae 1019
f67539c2
TL
1020 logger->set(l_osd_stat_bytes, bytes);
1021 logger->set(l_osd_stat_bytes_used, used);
1022 logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 1023
11fdf7f2
TL
1024 std::lock_guard l(stat_lock);
1025 osd_stat.statfs = stbuf;
1026 osd_stat.os_alerts.clear();
1027 osd_stat.os_alerts[whoami].swap(alerts);
1028 if (cct->_conf->fake_statfs_for_testing) {
1029 osd_stat.statfs.total = bytes;
1030 osd_stat.statfs.available = avail;
1031 // For testing don't want used to go negative, so clear reserved
1032 osd_stat.statfs.internally_reserved = 0;
224ce89b
WB
1033 }
1034}
7c673cae 1035
11fdf7f2
TL
1036osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
1037 int num_pgs)
224ce89b 1038{
eafe8130
TL
1039 utime_t now = ceph_clock_now();
1040 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
11fdf7f2
TL
1041 std::lock_guard l(stat_lock);
1042 osd_stat.hb_peers.swap(hb_peers);
1043 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
1044 osd_stat.num_pgs = num_pgs;
eafe8130
TL
1045 // Clean entries that aren't updated
1046 // This is called often enough that we can just remove 1 at a time
1047 for (auto i: osd_stat.hb_pingtime) {
1048 if (i.second.last_update == 0)
1049 continue;
1050 if (stale_time && now.sec() - i.second.last_update > stale_time) {
1051 dout(20) << __func__ << " time out heartbeat for osd " << i.first
1052 << " last_update " << i.second.last_update << dendl;
1053 osd_stat.hb_pingtime.erase(i.first);
1054 break;
1055 }
1056 }
11fdf7f2
TL
1057 return osd_stat;
1058}
1059
1060void OSDService::inc_osd_stat_repaired()
1061{
1062 std::lock_guard l(stat_lock);
1063 osd_stat.num_shards_repaired++;
1064 return;
1065}
1066
1067float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
1068 uint64_t adjust_used)
1069{
1070 *pratio =
b3b6e05e 1071 ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
11fdf7f2
TL
1072
1073 if (adjust_used) {
1074 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1075 if (new_stat.statfs.available > adjust_used)
1076 new_stat.statfs.available -= adjust_used;
1077 else
1078 new_stat.statfs.available = 0;
1079 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
7c673cae
FG
1080 }
1081
11fdf7f2
TL
1082 // Check all pgs and adjust kb_used to include all pending backfill data
1083 int backfill_adjusted = 0;
1084 vector<PGRef> pgs;
1085 osd->_get_pgs(&pgs);
1086 for (auto p : pgs) {
1087 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1088 }
1089 if (backfill_adjusted) {
1090 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1091 }
b3b6e05e 1092 return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
7c673cae
FG
1093}
1094
7c673cae
FG
1095void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1096{
1e59de90
TL
1097 dout(20) << __func__ << " " << m->get_type_name() << " to osd." << peer
1098 << " from_epoch " << from_epoch << dendl;
7c673cae
FG
1099 OSDMapRef next_map = get_nextmap_reserved();
1100 // service map is always newer/newest
11fdf7f2 1101 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1102
1103 if (next_map->is_down(peer) ||
1104 next_map->get_info(peer).up_from > from_epoch) {
1105 m->put();
1106 release_map(next_map);
1107 return;
1108 }
9f95a23c
TL
1109 ConnectionRef peer_con;
1110 if (peer == whoami) {
1111 peer_con = osd->cluster_messenger->get_loopback_connection();
1112 } else {
1113 peer_con = osd->cluster_messenger->connect_to_osd(
1114 next_map->get_cluster_addrs(peer), false, true);
1115 }
1116 maybe_share_map(peer_con.get(), next_map);
7c673cae
FG
1117 peer_con->send_message(m);
1118 release_map(next_map);
1119}
1120
9f95a23c
TL
1121void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1122{
1e59de90 1123 dout(20) << __func__ << " from_epoch " << from_epoch << dendl;
9f95a23c
TL
1124 OSDMapRef next_map = get_nextmap_reserved();
1125 // service map is always newer/newest
1126 ceph_assert(from_epoch <= next_map->get_epoch());
1127
1128 for (auto& iter : messages) {
1129 if (next_map->is_down(iter.first) ||
1130 next_map->get_info(iter.first).up_from > from_epoch) {
1131 iter.second->put();
1132 continue;
1133 }
1134 ConnectionRef peer_con;
1135 if (iter.first == whoami) {
1136 peer_con = osd->cluster_messenger->get_loopback_connection();
1137 } else {
1138 peer_con = osd->cluster_messenger->connect_to_osd(
1139 next_map->get_cluster_addrs(iter.first), false, true);
1140 }
1141 maybe_share_map(peer_con.get(), next_map);
1142 peer_con->send_message(iter.second);
1143 }
1144 release_map(next_map);
1145}
7c673cae
FG
1146ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1147{
1e59de90
TL
1148 dout(20) << __func__ << " to osd." << peer
1149 << " from_epoch " << from_epoch << dendl;
7c673cae
FG
1150 OSDMapRef next_map = get_nextmap_reserved();
1151 // service map is always newer/newest
11fdf7f2 1152 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1153
1154 if (next_map->is_down(peer) ||
1155 next_map->get_info(peer).up_from > from_epoch) {
1156 release_map(next_map);
1157 return NULL;
1158 }
9f95a23c
TL
1159 ConnectionRef con;
1160 if (peer == whoami) {
1161 con = osd->cluster_messenger->get_loopback_connection();
1162 } else {
1163 con = osd->cluster_messenger->connect_to_osd(
1164 next_map->get_cluster_addrs(peer), false, true);
1165 }
7c673cae
FG
1166 release_map(next_map);
1167 return con;
1168}
1169
1170pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1171{
1e59de90
TL
1172 dout(20) << __func__ << " to osd." << peer
1173 << " from_epoch " << from_epoch << dendl;
7c673cae
FG
1174 OSDMapRef next_map = get_nextmap_reserved();
1175 // service map is always newer/newest
11fdf7f2 1176 ceph_assert(from_epoch <= next_map->get_epoch());
7c673cae
FG
1177
1178 pair<ConnectionRef,ConnectionRef> ret;
1179 if (next_map->is_down(peer) ||
1180 next_map->get_info(peer).up_from > from_epoch) {
1181 release_map(next_map);
1182 return ret;
1183 }
11fdf7f2
TL
1184 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1185 next_map->get_hb_back_addrs(peer));
1186 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1187 next_map->get_hb_front_addrs(peer));
7c673cae
FG
1188 release_map(next_map);
1189 return ret;
1190}
1191
11fdf7f2
TL
1192entity_name_t OSDService::get_cluster_msgr_name() const
1193{
1194 return cluster_messenger->get_myname();
1195}
7c673cae 1196
94b18763
FG
1197void OSDService::queue_want_pg_temp(pg_t pgid,
1198 const vector<int>& want,
1199 bool forced)
7c673cae 1200{
11fdf7f2 1201 std::lock_guard l(pg_temp_lock);
94b18763 1202 auto p = pg_temp_pending.find(pgid);
7c673cae 1203 if (p == pg_temp_pending.end() ||
94b18763
FG
1204 p->second.acting != want ||
1205 forced) {
11fdf7f2 1206 pg_temp_wanted[pgid] = {want, forced};
7c673cae
FG
1207 }
1208}
1209
1210void OSDService::remove_want_pg_temp(pg_t pgid)
1211{
11fdf7f2 1212 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1213 pg_temp_wanted.erase(pgid);
1214 pg_temp_pending.erase(pgid);
1215}
1216
1217void OSDService::_sent_pg_temp()
1218{
11fdf7f2
TL
1219#ifdef HAVE_STDLIB_MAP_SPLICING
1220 pg_temp_pending.merge(pg_temp_wanted);
1221#else
94b18763
FG
1222 pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
1223 make_move_iterator(end(pg_temp_wanted)));
11fdf7f2 1224#endif
7c673cae
FG
1225 pg_temp_wanted.clear();
1226}
1227
1228void OSDService::requeue_pg_temp()
1229{
11fdf7f2 1230 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1231 // wanted overrides pending. note that remove_want_pg_temp
1232 // clears the item out of both.
1233 unsigned old_wanted = pg_temp_wanted.size();
1234 unsigned old_pending = pg_temp_pending.size();
1235 _sent_pg_temp();
1236 pg_temp_wanted.swap(pg_temp_pending);
1237 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1238 << pg_temp_wanted.size() << dendl;
1239}
1240
94b18763
FG
1241std::ostream& operator<<(std::ostream& out,
1242 const OSDService::pg_temp_t& pg_temp)
1243{
1244 out << pg_temp.acting;
1245 if (pg_temp.forced) {
1246 out << " (forced)";
1247 }
1248 return out;
1249}
1250
7c673cae
FG
1251void OSDService::send_pg_temp()
1252{
11fdf7f2 1253 std::lock_guard l(pg_temp_lock);
7c673cae
FG
1254 if (pg_temp_wanted.empty())
1255 return;
1256 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763 1257 MOSDPGTemp *ms[2] = {nullptr, nullptr};
11fdf7f2
TL
1258 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1259 auto& m = ms[pg_temp.forced];
94b18763
FG
1260 if (!m) {
1261 m = new MOSDPGTemp(osdmap->get_epoch());
11fdf7f2 1262 m->forced = pg_temp.forced;
94b18763 1263 }
11fdf7f2 1264 m->pg_temp.emplace(pgid, pg_temp.acting);
94b18763
FG
1265 }
1266 for (auto m : ms) {
1267 if (m) {
1268 monc->send_mon_message(m);
1269 }
1270 }
7c673cae
FG
1271 _sent_pg_temp();
1272}
1273
1274void OSDService::send_pg_created(pg_t pgid)
1275{
11fdf7f2 1276 std::lock_guard l(pg_created_lock);
7c673cae 1277 dout(20) << __func__ << dendl;
11fdf7f2 1278 auto o = get_osdmap();
9f95a23c 1279 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2 1280 pg_created.insert(pgid);
c07f9fc5
FG
1281 monc->send_mon_message(new MOSDPGCreated(pgid));
1282 }
7c673cae
FG
1283}
1284
11fdf7f2
TL
1285void OSDService::send_pg_created()
1286{
1287 std::lock_guard l(pg_created_lock);
1288 dout(20) << __func__ << dendl;
1289 auto o = get_osdmap();
9f95a23c 1290 if (o->require_osd_release >= ceph_release_t::luminous) {
11fdf7f2
TL
1291 for (auto pgid : pg_created) {
1292 monc->send_mon_message(new MOSDPGCreated(pgid));
1293 }
1294 }
1295}
1296
1297void OSDService::prune_pg_created()
1298{
1299 std::lock_guard l(pg_created_lock);
1300 dout(20) << __func__ << dendl;
1301 auto o = get_osdmap();
1302 auto i = pg_created.begin();
1303 while (i != pg_created.end()) {
1304 auto p = o->get_pg_pool(i->pool());
1305 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1306 dout(20) << __func__ << " pruning " << *i << dendl;
1307 i = pg_created.erase(i);
1308 } else {
1309 dout(20) << __func__ << " keeping " << *i << dendl;
1310 ++i;
1311 }
1312 }
1313}
1314
1315
7c673cae
FG
1316// --------------------------------------
1317// dispatch
1318
7c673cae
FG
1319void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1320 epoch_t *_bind_epoch) const
1321{
11fdf7f2 1322 std::lock_guard l(epoch_lock);
7c673cae
FG
1323 if (_boot_epoch)
1324 *_boot_epoch = boot_epoch;
1325 if (_up_epoch)
1326 *_up_epoch = up_epoch;
1327 if (_bind_epoch)
1328 *_bind_epoch = bind_epoch;
1329}
1330
1331void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1332 const epoch_t *_bind_epoch)
1333{
11fdf7f2 1334 std::lock_guard l(epoch_lock);
7c673cae 1335 if (_boot_epoch) {
11fdf7f2 1336 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
7c673cae
FG
1337 boot_epoch = *_boot_epoch;
1338 }
1339 if (_up_epoch) {
11fdf7f2 1340 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
7c673cae
FG
1341 up_epoch = *_up_epoch;
1342 }
1343 if (_bind_epoch) {
11fdf7f2 1344 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
7c673cae
FG
1345 bind_epoch = *_bind_epoch;
1346 }
1347}
1348
1349bool OSDService::prepare_to_stop()
1350{
9f95a23c 1351 std::unique_lock l(is_stopping_lock);
7c673cae
FG
1352 if (get_state() != NOT_STOPPING)
1353 return false;
1354
1355 OSDMapRef osdmap = get_osdmap();
1356 if (osdmap && osdmap->is_up(whoami)) {
1d09f67e 1357 dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
7c673cae 1358 set_state(PREPARING_TO_STOP);
11fdf7f2
TL
1359 monc->send_mon_message(
1360 new MOSDMarkMeDown(
1361 monc->get_fsid(),
1362 whoami,
1363 osdmap->get_addrs(whoami),
1364 osdmap->get_epoch(),
1d09f67e
TL
1365 true, // request ack
1366 true // mark as down and dead
11fdf7f2 1367 ));
9f95a23c
TL
1368 const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
1369 is_stopping_cond.wait_for(l, timeout,
1370 [this] { return get_state() == STOPPING; });
7c673cae 1371 }
1d09f67e 1372
7c673cae
FG
1373 dout(0) << __func__ << " starting shutdown" << dendl;
1374 set_state(STOPPING);
1375 return true;
1376}
1377
1378void OSDService::got_stop_ack()
1379{
9f95a23c 1380 std::scoped_lock l(is_stopping_lock);
7c673cae
FG
1381 if (get_state() == PREPARING_TO_STOP) {
1382 dout(0) << __func__ << " starting shutdown" << dendl;
1383 set_state(STOPPING);
9f95a23c 1384 is_stopping_cond.notify_all();
7c673cae
FG
1385 } else {
1386 dout(10) << __func__ << " ignoring msg" << dendl;
1387 }
1388}
1389
1390MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1391 OSDSuperblock& sblock)
1392{
28e407b8
AA
1393 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1394 osdmap->get_encoding_features());
1e59de90 1395 m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
7c673cae
FG
1396 m->newest_map = sblock.newest_map;
1397
11fdf7f2
TL
1398 int max = cct->_conf->osd_map_message_max;
1399 ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;
1400
1e59de90 1401 if (since < m->cluster_osdmap_trim_lower_bound) {
11fdf7f2
TL
1402 // we don't have the next map the target wants, so start with a
1403 // full map.
1404 bufferlist bl;
1e59de90
TL
1405 dout(10) << __func__ << " cluster osdmap lower bound "
1406 << sblock.cluster_osdmap_trim_lower_bound
1407 << " > since " << since << ", starting with full map"
1408 << dendl;
1409 since = m->cluster_osdmap_trim_lower_bound;
11fdf7f2
TL
1410 if (!get_map_bl(since, bl)) {
1411 derr << __func__ << " missing full map " << since << dendl;
1412 goto panic;
1413 }
1414 max--;
1415 max_bytes -= bl.length();
f67539c2 1416 m->maps[since] = std::move(bl);
11fdf7f2
TL
1417 }
1418 for (epoch_t e = since + 1; e <= to; ++e) {
7c673cae 1419 bufferlist bl;
11fdf7f2 1420 if (get_inc_map_bl(e, bl)) {
aee94f69 1421 m->incremental_maps[e] = bl;
11fdf7f2 1422 } else {
e306af50 1423 dout(10) << __func__ << " missing incremental map " << e << dendl;
11fdf7f2
TL
1424 if (!get_map_bl(e, bl)) {
1425 derr << __func__ << " also missing full map " << e << dendl;
1426 goto panic;
1427 }
aee94f69 1428 m->maps[e] = bl;
11fdf7f2
TL
1429 }
1430 max--;
1431 max_bytes -= bl.length();
1432 if (max <= 0 || max_bytes <= 0) {
7c673cae 1433 break;
11fdf7f2
TL
1434 }
1435 }
1436 return m;
1437
1438 panic:
1439 if (!m->maps.empty() ||
1440 !m->incremental_maps.empty()) {
1441 // send what we have so far
1442 return m;
1443 }
1444 // send something
1445 bufferlist bl;
1446 if (get_inc_map_bl(m->newest_map, bl)) {
f67539c2 1447 m->incremental_maps[m->newest_map] = std::move(bl);
11fdf7f2
TL
1448 } else {
1449 derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
1450 if (!get_map_bl(m->newest_map, bl)) {
1451 derr << __func__ << " unable to load latest full map " << m->newest_map
7c673cae 1452 << dendl;
11fdf7f2 1453 ceph_abort();
7c673cae 1454 }
f67539c2 1455 m->maps[m->newest_map] = std::move(bl);
7c673cae
FG
1456 }
1457 return m;
1458}
1459
1460void OSDService::send_map(MOSDMap *m, Connection *con)
1461{
1462 con->send_message(m);
1463}
1464
1465void OSDService::send_incremental_map(epoch_t since, Connection *con,
9f95a23c 1466 const OSDMapRef& osdmap)
7c673cae
FG
1467{
1468 epoch_t to = osdmap->get_epoch();
1469 dout(10) << "send_incremental_map " << since << " -> " << to
1470 << " to " << con << " " << con->get_peer_addr() << dendl;
1471
1472 MOSDMap *m = NULL;
1473 while (!m) {
1474 OSDSuperblock sblock(get_superblock());
1475 if (since < sblock.oldest_map) {
1476 // just send latest full map
28e407b8
AA
1477 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1478 osdmap->get_encoding_features());
1e59de90 1479 m->cluster_osdmap_trim_lower_bound = sblock.cluster_osdmap_trim_lower_bound;
7c673cae
FG
1480 m->newest_map = sblock.newest_map;
1481 get_map_bl(to, m->maps[to]);
1482 send_map(m, con);
1483 return;
1484 }
1485
1486 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1487 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1488 << ", only sending most recent" << dendl;
1489 since = to - cct->_conf->osd_map_share_max_epochs;
1490 }
1491
7c673cae
FG
1492 m = build_incremental_map_msg(since, to, sblock);
1493 }
1494 send_map(m, con);
1495}
1496
1497bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1498{
1499 bool found = map_bl_cache.lookup(e, &bl);
31f18b77 1500 if (found) {
f67539c2 1501 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1502 return true;
31f18b77 1503 }
f67539c2 1504 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1505 found = store->read(meta_ch,
31f18b77
FG
1506 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1507 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1508 if (found) {
7c673cae 1509 _add_map_bl(e, bl);
31f18b77 1510 }
7c673cae
FG
1511 return found;
1512}
1513
1514bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1515{
11fdf7f2 1516 std::lock_guard l(map_cache_lock);
7c673cae 1517 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77 1518 if (found) {
f67539c2 1519 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1520 return true;
31f18b77 1521 }
f67539c2 1522 logger->inc(l_osd_map_bl_cache_miss);
11fdf7f2 1523 found = store->read(meta_ch,
31f18b77
FG
1524 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1525 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1526 if (found) {
7c673cae 1527 _add_map_inc_bl(e, bl);
31f18b77 1528 }
7c673cae
FG
1529 return found;
1530}
1531
1532void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1533{
1534 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1535 // cache a contiguous buffer
1536 if (bl.get_num_buffers() > 1) {
1537 bl.rebuild();
1538 }
1539 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1540 map_bl_cache.add(e, bl);
1541}
1542
1543void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1544{
1545 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1546 // cache a contiguous buffer
1547 if (bl.get_num_buffers() > 1) {
1548 bl.rebuild();
1549 }
1550 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1551 map_bl_inc_cache.add(e, bl);
1552}
1553
7c673cae
FG
1554OSDMapRef OSDService::_add_map(OSDMap *o)
1555{
1556 epoch_t e = o->get_epoch();
1557
1558 if (cct->_conf->osd_map_dedup) {
1559 // Dedup against an existing map at a nearby epoch
1560 OSDMapRef for_dedup = map_cache.lower_bound(e);
1561 if (for_dedup) {
1562 OSDMap::dedup(for_dedup.get(), o);
1563 }
1564 }
1565 bool existed;
1566 OSDMapRef l = map_cache.add(e, o, &existed);
1567 if (existed) {
1568 delete o;
1569 }
1570 return l;
1571}
1572
1573OSDMapRef OSDService::try_get_map(epoch_t epoch)
1574{
11fdf7f2 1575 std::lock_guard l(map_cache_lock);
7c673cae
FG
1576 OSDMapRef retval = map_cache.lookup(epoch);
1577 if (retval) {
1578 dout(30) << "get_map " << epoch << " -cached" << dendl;
f67539c2 1579 logger->inc(l_osd_map_cache_hit);
7c673cae
FG
1580 return retval;
1581 }
f67539c2 1582 {
7c673cae
FG
1583 logger->inc(l_osd_map_cache_miss);
1584 epoch_t lb = map_cache.cached_key_lower_bound();
1585 if (epoch < lb) {
1586 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1587 logger->inc(l_osd_map_cache_miss_low);
1588 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1589 }
1590 }
1591
1592 OSDMap *map = new OSDMap;
1593 if (epoch > 0) {
1594 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1595 bufferlist bl;
1596 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1597 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1598 delete map;
1599 return OSDMapRef();
1600 }
1601 map->decode(bl);
1602 } else {
1603 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1604 }
1605 return _add_map(map);
1606}
1607
1608// ops
1609
1610
1611void OSDService::reply_op_error(OpRequestRef op, int err)
1612{
9f95a23c 1613 reply_op_error(op, err, eversion_t(), 0, {});
7c673cae
FG
1614}
1615
1616void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
9f95a23c
TL
1617 version_t uv,
1618 vector<pg_log_op_return_item_t> op_returns)
7c673cae 1619{
9f95a23c 1620 auto m = op->get_req<MOSDOp>();
11fdf7f2 1621 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1622 int flags;
1623 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1624
9f95a23c
TL
1625 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1626 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
7c673cae 1627 reply->set_reply_versions(v, uv);
9f95a23c 1628 reply->set_op_returns(op_returns);
7c673cae
FG
1629 m->get_connection()->send_message(reply);
1630}
1631
1632void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1633{
31f18b77
FG
1634 if (!cct->_conf->osd_debug_misdirected_ops) {
1635 return;
1636 }
1637
9f95a23c 1638 auto m = op->get_req<MOSDOp>();
11fdf7f2 1639 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae 1640
11fdf7f2 1641 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
7c673cae
FG
1642
1643 if (pg->is_ec_pg()) {
1644 /**
1645 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1646 * can get this result:
1647 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1648 * [CRUSH_ITEM_NONE, 2, 3]/3
1649 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1650 * [3, 2, 3]/3
1651 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1652 * -- misdirected op
1653 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1654 * it and fulfils it
1655 *
1656 * We can't compute the op target based on the sending map epoch due to
1657 * splitting. The simplest thing is to detect such cases here and drop
1658 * them without an error (the client will resend anyway).
1659 */
11fdf7f2 1660 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
7c673cae
FG
1661 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1662 if (!opmap) {
1663 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1664 << m->get_map_epoch() << ", dropping" << dendl;
1665 return;
1666 }
1667 pg_t _pgid = m->get_raw_pg();
1668 spg_t pgid;
1669 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1670 _pgid = opmap->raw_pg_to_pg(_pgid);
1671 if (opmap->get_primary_shard(_pgid, &pgid) &&
11fdf7f2 1672 pgid.shard != pg->pg_id.shard) {
7c673cae
FG
1673 dout(7) << __func__ << ": " << *pg << " primary changed since "
1674 << m->get_map_epoch() << ", dropping" << dendl;
1675 return;
1676 }
1677 }
1678
1679 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1680 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1681 << " pg " << m->get_raw_pg()
1682 << " to osd." << whoami
11fdf7f2 1683 << " not " << pg->get_acting()
7c673cae 1684 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
1685}
1686
9f95a23c 1687void OSDService::enqueue_back(OpSchedulerItem&& qi)
7c673cae 1688{
11fdf7f2 1689 osd->op_shardedwq.queue(std::move(qi));
7c673cae
FG
1690}
1691
9f95a23c 1692void OSDService::enqueue_front(OpSchedulerItem&& qi)
7c673cae 1693{
11fdf7f2 1694 osd->op_shardedwq.queue_front(std::move(qi));
7c673cae
FG
1695}
1696
11fdf7f2
TL
1697void OSDService::queue_recovery_context(
1698 PG *pg,
1e59de90
TL
1699 GenContext<ThreadPool::TPHandle&> *c,
1700 uint64_t cost,
1701 int priority)
7c673cae 1702{
11fdf7f2 1703 epoch_t e = get_osdmap_epoch();
1e59de90
TL
1704
1705 uint64_t cost_for_queue = [this, cost] {
1706 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
1707 return cost;
1708 } else {
1709 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
1710 * require very large costs for several messages in order to do any
1711 * meaningful amount of throttling. This branch should be removed after
1712 * Reef.
1713 */
1714 return cct->_conf->osd_recovery_cost;
1715 }
1716 }();
1717
11fdf7f2 1718 enqueue_back(
9f95a23c
TL
1719 OpSchedulerItem(
1720 unique_ptr<OpSchedulerItem::OpQueueable>(
1e59de90
TL
1721 new PGRecoveryContext(pg->get_pgid(), c, e, priority)),
1722 cost_for_queue,
11fdf7f2
TL
1723 cct->_conf->osd_recovery_priority,
1724 ceph_clock_now(),
1725 0,
1726 e));
7c673cae
FG
1727}
1728
1729void OSDService::queue_for_snap_trim(PG *pg)
1730{
1731 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
11fdf7f2 1732 enqueue_back(
9f95a23c
TL
1733 OpSchedulerItem(
1734 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1735 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1736 cct->_conf->osd_snap_trim_cost,
1737 cct->_conf->osd_snap_trim_priority,
1738 ceph_clock_now(),
1739 0,
1740 pg->get_osdmap_epoch()));
1741}
1742
f67539c2
TL
1743template <class MSG_TYPE>
1744void OSDService::queue_scrub_event_msg(PG* pg,
1745 Scrub::scrub_prio_t with_priority,
20effc67
TL
1746 unsigned int qu_priority,
1747 Scrub::act_token_t act_token)
11fdf7f2 1748{
11fdf7f2 1749 const auto epoch = pg->get_osdmap_epoch();
20effc67
TL
1750 auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
1751 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
1752 << ". Epoch: " << epoch << " token: " << act_token << dendl;
f67539c2 1753 enqueue_back(OpSchedulerItem(
1e59de90 1754 unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
f67539c2
TL
1755 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1756}
1757
1758template <class MSG_TYPE>
20effc67
TL
1759void OSDService::queue_scrub_event_msg(PG* pg,
1760 Scrub::scrub_prio_t with_priority)
f67539c2
TL
1761{
1762 const auto epoch = pg->get_osdmap_epoch();
1763 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1764 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
f67539c2 1765 enqueue_back(OpSchedulerItem(
1e59de90 1766 unique_ptr<OpSchedulerItem::OpQueueable>(msg), get_scrub_cost(),
f67539c2
TL
1767 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1768}
1769
1e59de90
TL
1770int64_t OSDService::get_scrub_cost()
1771{
1772
1773 int64_t cost_for_queue = cct->_conf->osd_scrub_cost;
1774 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
1775 cost_for_queue = cct->_conf->osd_scrub_event_cost *
1776 cct->_conf->osd_shallow_scrub_chunk_max;
1777 }
1778 return cost_for_queue;
1779}
1780
f67539c2
TL
1781void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1782{
1783 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1784}
1785
1786void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1787{
1788 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1789}
1790
1791void OSDService::queue_for_rep_scrub(PG* pg,
1792 Scrub::scrub_prio_t with_priority,
20effc67
TL
1793 unsigned int qu_priority,
1794 Scrub::act_token_t act_token)
f67539c2 1795{
20effc67 1796 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
f67539c2
TL
1797}
1798
1799void OSDService::queue_for_rep_scrub_resched(PG* pg,
1800 Scrub::scrub_prio_t with_priority,
20effc67
TL
1801 unsigned int qu_priority,
1802 Scrub::act_token_t act_token)
f67539c2
TL
1803{
1804 // Resulting scrub event: 'SchedReplica'
20effc67
TL
1805 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1806 act_token);
f67539c2
TL
1807}
1808
1809void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1810{
1811 // Resulting scrub event: 'RemotesReserved'
1812 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1813}
1814
1815void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1816{
1817 // Resulting scrub event: 'ReservationFailure'
1818 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1819}
1820
1821void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1822{
1823 // Resulting scrub event: 'InternalSchedScrub'
1824 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1825}
1826
1827void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1828{
1829 // Resulting scrub event: 'ActivePushesUpd'
1830 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1831}
1832
20effc67
TL
1833void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1834{
1835 // Resulting scrub event: 'SelectedChunkFree'
1836 queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1837}
1838
1839void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
1840{
1841 // Resulting scrub event: 'ChunkIsBusy'
1842 queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1843}
1844
f67539c2
TL
1845void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1846{
1847 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1848}
1849
1850void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1851{
1852 // Resulting scrub event: 'Unblocked'
1853 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1854}
1855
1856void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1857{
1858 // Resulting scrub event: 'DigestUpdate'
1859 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1860}
1861
20effc67
TL
1862void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
1863{
1864 // Resulting scrub event: 'IntLocalMapDone'
1865 queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
1866}
1867
f67539c2
TL
1868void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1869{
1870 // Resulting scrub event: 'GotReplicas'
1871 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1872}
1873
1874void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1875{
1876 // Resulting scrub event: 'ReplicaPushesUpd'
1877 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
11fdf7f2
TL
1878}
1879
20effc67
TL
1880void OSDService::queue_scrub_is_finished(PG *pg)
1881{
1882 // Resulting scrub event: 'ScrubFinished'
1883 queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1884}
1885
1886void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
1887{
1888 // Resulting scrub event: 'NextChunk'
1889 queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1890}
1891
11fdf7f2
TL
1892void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1893{
1894 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1895 enqueue_back(
9f95a23c
TL
1896 OpSchedulerItem(
1897 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1898 new PGDelete(pgid, e)),
1899 cct->_conf->osd_pg_delete_cost,
1900 cct->_conf->osd_pg_delete_priority,
1901 ceph_clock_now(),
1902 0,
1903 e));
1904}
1905
1906bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1907{
1908 return osd->try_finish_pg_delete(pg, old_pg_num);
1909}
1910
1911// ---
1912
1913void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1914{
1915 std::lock_guard l(merge_lock);
1916 dout(10) << __func__ << " " << pg->pg_id << dendl;
1917 ready_to_merge_source[pg->pg_id.pgid] = version;
1918 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1919 _send_ready_to_merge();
1920}
1921
1922void OSDService::set_ready_to_merge_target(PG *pg,
1923 eversion_t version,
1924 epoch_t last_epoch_started,
1925 epoch_t last_epoch_clean)
1926{
1927 std::lock_guard l(merge_lock);
1928 dout(10) << __func__ << " " << pg->pg_id << dendl;
1929 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1930 make_tuple(version,
1931 last_epoch_started,
1932 last_epoch_clean)));
1933 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1934 _send_ready_to_merge();
1935}
1936
1937void OSDService::set_not_ready_to_merge_source(pg_t source)
1938{
1939 std::lock_guard l(merge_lock);
1940 dout(10) << __func__ << " " << source << dendl;
1941 not_ready_to_merge_source.insert(source);
1942 assert(ready_to_merge_source.count(source) == 0);
1943 _send_ready_to_merge();
1944}
1945
1946void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1947{
1948 std::lock_guard l(merge_lock);
1949 dout(10) << __func__ << " " << target << " source " << source << dendl;
1950 not_ready_to_merge_target[target] = source;
1951 assert(ready_to_merge_target.count(target) == 0);
1952 _send_ready_to_merge();
1953}
1954
1955void OSDService::send_ready_to_merge()
1956{
1957 std::lock_guard l(merge_lock);
1958 _send_ready_to_merge();
1959}
1960
1961void OSDService::_send_ready_to_merge()
1962{
1963 dout(20) << __func__
1964 << " ready_to_merge_source " << ready_to_merge_source
1965 << " not_ready_to_merge_source " << not_ready_to_merge_source
1966 << " ready_to_merge_target " << ready_to_merge_target
1967 << " not_ready_to_merge_target " << not_ready_to_merge_target
1968 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1969 << dendl;
1970 for (auto src : not_ready_to_merge_source) {
1971 if (sent_ready_to_merge_source.count(src) == 0) {
1972 monc->send_mon_message(new MOSDPGReadyToMerge(
1973 src,
1974 {}, {}, 0, 0,
1975 false,
1976 osdmap->get_epoch()));
1977 sent_ready_to_merge_source.insert(src);
1978 }
1979 }
1980 for (auto p : not_ready_to_merge_target) {
1981 if (sent_ready_to_merge_source.count(p.second) == 0) {
1982 monc->send_mon_message(new MOSDPGReadyToMerge(
1983 p.second,
1984 {}, {}, 0, 0,
1985 false,
1986 osdmap->get_epoch()));
1987 sent_ready_to_merge_source.insert(p.second);
1988 }
1989 }
1990 for (auto src : ready_to_merge_source) {
1991 if (not_ready_to_merge_source.count(src.first) ||
1992 not_ready_to_merge_target.count(src.first.get_parent())) {
1993 continue;
1994 }
1995 auto p = ready_to_merge_target.find(src.first.get_parent());
1996 if (p != ready_to_merge_target.end() &&
1997 sent_ready_to_merge_source.count(src.first) == 0) {
1998 monc->send_mon_message(new MOSDPGReadyToMerge(
1999 src.first, // source pgid
2000 src.second, // src version
2001 std::get<0>(p->second), // target version
2002 std::get<1>(p->second), // PG's last_epoch_started
2003 std::get<2>(p->second), // PG's last_epoch_clean
2004 true,
2005 osdmap->get_epoch()));
2006 sent_ready_to_merge_source.insert(src.first);
2007 }
2008 }
2009}
2010
2011void OSDService::clear_ready_to_merge(PG *pg)
2012{
2013 std::lock_guard l(merge_lock);
2014 dout(10) << __func__ << " " << pg->pg_id << dendl;
2015 ready_to_merge_source.erase(pg->pg_id.pgid);
2016 ready_to_merge_target.erase(pg->pg_id.pgid);
2017 not_ready_to_merge_source.erase(pg->pg_id.pgid);
2018 not_ready_to_merge_target.erase(pg->pg_id.pgid);
2019 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
2020}
2021
2022void OSDService::clear_sent_ready_to_merge()
2023{
2024 std::lock_guard l(merge_lock);
2025 sent_ready_to_merge_source.clear();
2026}
2027
9f95a23c 2028void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
2029{
2030 std::lock_guard l(merge_lock);
2031 auto i = sent_ready_to_merge_source.begin();
2032 while (i != sent_ready_to_merge_source.end()) {
2033 if (!osdmap->pg_exists(*i)) {
2034 dout(10) << __func__ << " " << *i << dendl;
2035 i = sent_ready_to_merge_source.erase(i);
2036 } else {
1e59de90 2037 dout(20) << __func__ << " exist " << *i << dendl;
11fdf7f2
TL
2038 ++i;
2039 }
2040 }
7c673cae
FG
2041}
2042
11fdf7f2
TL
2043// ---
2044
2045void OSDService::_queue_for_recovery(
1e59de90 2046 pg_awaiting_throttle_t p,
11fdf7f2
TL
2047 uint64_t reserved_pushes)
2048{
9f95a23c 2049 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
1e59de90
TL
2050
2051 uint64_t cost_for_queue = [this, &reserved_pushes, &p] {
2052 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
2053 return p.cost_per_object * reserved_pushes;
2054 } else {
2055 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
2056 * require very large costs for several messages in order to do any
2057 * meaningful amount of throttling. This branch should be removed after
2058 * Reef.
2059 */
2060 return cct->_conf->osd_recovery_cost;
2061 }
2062 }();
2063
11fdf7f2 2064 enqueue_back(
9f95a23c
TL
2065 OpSchedulerItem(
2066 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2 2067 new PGRecovery(
1e59de90
TL
2068 p.pg->get_pgid(),
2069 p.epoch_queued,
2070 reserved_pushes,
2071 p.priority)),
2072 cost_for_queue,
11fdf7f2
TL
2073 cct->_conf->osd_recovery_priority,
2074 ceph_clock_now(),
2075 0,
1e59de90 2076 p.epoch_queued));
11fdf7f2 2077}
7c673cae
FG
2078
2079// ====================================================================
2080// OSD
2081
2082#undef dout_prefix
2083#define dout_prefix *_dout
2084
2085// Commands shared between OSD's console and admin console:
f67539c2 2086namespace ceph::osd_cmds {
7c673cae 2087
2a845540
TL
2088int heap(CephContext& cct,
2089 const cmdmap_t& cmdmap,
2090 std::ostream& outos,
2091 std::ostream& erros);
f67539c2
TL
2092
2093} // namespace ceph::osd_cmds
7c673cae 2094
20effc67
TL
2095int OSD::mkfs(CephContext *cct,
2096 std::unique_ptr<ObjectStore> store,
2097 uuid_d fsid,
2098 int whoami,
2099 string osdspec_affinity)
7c673cae
FG
2100{
2101 int ret;
2102
7c673cae
FG
2103 OSDSuperblock sb;
2104 bufferlist sbbl;
7c673cae
FG
2105 // if we are fed a uuid for this osd, use it.
2106 store->set_fsid(cct->_conf->osd_uuid);
2107
2108 ret = store->mkfs();
2109 if (ret) {
224ce89b
WB
2110 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2111 << cpp_strerror(ret) << dendl;
20effc67 2112 return ret;
7c673cae
FG
2113 }
2114
31f18b77 2115 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
2116
2117 ret = store->mount();
2118 if (ret) {
224ce89b
WB
2119 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2120 << cpp_strerror(ret) << dendl;
20effc67 2121 return ret;
7c673cae
FG
2122 }
2123
20effc67
TL
2124 auto umount_store = make_scope_guard([&] {
2125 store->umount();
2126 });
2127
2128 ObjectStore::CollectionHandle ch =
2129 store->open_collection(coll_t::meta());
11fdf7f2
TL
2130 if (ch) {
2131 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2132 if (ret < 0) {
2133 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
20effc67 2134 return ret;
11fdf7f2 2135 }
7c673cae
FG
2136 /* if we already have superblock, check content of superblock */
2137 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
2138 auto p = sbbl.cbegin();
2139 decode(sb, p);
7c673cae
FG
2140 if (whoami != sb.whoami) {
2141 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2142 << dendl;
20effc67 2143 return -EINVAL;
7c673cae
FG
2144 }
2145 if (fsid != sb.cluster_fsid) {
2146 derr << "provided cluster fsid " << fsid
2147 << " != superblock's " << sb.cluster_fsid << dendl;
20effc67 2148 return -EINVAL;
7c673cae
FG
2149 }
2150 } else {
2151 // create superblock
2152 sb.cluster_fsid = fsid;
2153 sb.osd_fsid = store->get_fsid();
2154 sb.whoami = whoami;
2155 sb.compat_features = get_osd_initial_compat_set();
2156
2157 bufferlist bl;
11fdf7f2 2158 encode(sb, bl);
7c673cae 2159
11fdf7f2
TL
2160 ObjectStore::CollectionHandle ch = store->create_new_collection(
2161 coll_t::meta());
7c673cae
FG
2162 ObjectStore::Transaction t;
2163 t.create_collection(coll_t::meta(), 0);
2164 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 2165 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
2166 if (ret) {
2167 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 2168 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
20effc67 2169 return ret;
7c673cae 2170 }
a4b75251 2171 ch->flush();
7c673cae
FG
2172 }
2173
20effc67 2174 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
7c673cae 2175 if (ret) {
224ce89b
WB
2176 derr << "OSD::mkfs: failed to write fsid file: error "
2177 << cpp_strerror(ret) << dendl;
11fdf7f2 2178 }
7c673cae
FG
2179 return ret;
2180}
2181
e306af50 2182int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
7c673cae
FG
2183{
2184 char val[80];
2185 int r;
2186
2187 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2188 r = store->write_meta("magic", val);
2189 if (r < 0)
2190 return r;
2191
2192 snprintf(val, sizeof(val), "%d", whoami);
2193 r = store->write_meta("whoami", val);
2194 if (r < 0)
2195 return r;
2196
2197 cluster_fsid.print(val);
2198 r = store->write_meta("ceph_fsid", val);
2199 if (r < 0)
2200 return r;
2201
11fdf7f2 2202 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2203 if (key.size()) {
2204 r = store->write_meta("osd_key", key);
2205 if (r < 0)
2206 return r;
b32b8144 2207 } else {
11fdf7f2 2208 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2209 if (!keyfile.empty()) {
2210 bufferlist keybl;
2211 string err;
11fdf7f2 2212 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2213 if (r < 0) {
2214 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2215 << err << ": " << cpp_strerror(r) << dendl;
2216 return r;
2217 }
2218 r = store->write_meta("osd_key", keybl.to_str());
2219 if (r < 0)
2220 return r;
2221 }
3efd9988 2222 }
e306af50
TL
2223 if (!osdspec_affinity.empty()) {
2224 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2225 if (r < 0)
2226 return r;
2227 }
3efd9988 2228
39ae355f
TL
2229 r = store->write_meta("ceph_version_when_created", pretty_version_to_str());
2230 if (r < 0)
2231 return r;
2232
2233 ostringstream created_at;
2234 utime_t now = ceph_clock_now();
2235 now.gmtime(created_at);
2236 r = store->write_meta("created_at", created_at.str());
2237 if (r < 0)
2238 return r;
2239
7c673cae
FG
2240 r = store->write_meta("ready", "ready");
2241 if (r < 0)
2242 return r;
2243
2244 return 0;
2245}
2246
11fdf7f2
TL
2247int OSD::peek_meta(ObjectStore *store,
2248 std::string *magic,
2249 uuid_d *cluster_fsid,
2250 uuid_d *osd_fsid,
2251 int *whoami,
9f95a23c 2252 ceph_release_t *require_osd_release)
7c673cae
FG
2253{
2254 string val;
2255
2256 int r = store->read_meta("magic", &val);
2257 if (r < 0)
2258 return r;
11fdf7f2 2259 *magic = val;
7c673cae
FG
2260
2261 r = store->read_meta("whoami", &val);
2262 if (r < 0)
2263 return r;
11fdf7f2 2264 *whoami = atoi(val.c_str());
7c673cae
FG
2265
2266 r = store->read_meta("ceph_fsid", &val);
2267 if (r < 0)
2268 return r;
11fdf7f2 2269 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2270 if (!r)
2271 return -EINVAL;
2272
2273 r = store->read_meta("fsid", &val);
2274 if (r < 0) {
11fdf7f2 2275 *osd_fsid = uuid_d();
7c673cae 2276 } else {
11fdf7f2 2277 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2278 if (!r)
2279 return -EINVAL;
2280 }
2281
11fdf7f2
TL
2282 r = store->read_meta("require_osd_release", &val);
2283 if (r >= 0) {
9f95a23c 2284 *require_osd_release = ceph_release_from_name(val);
11fdf7f2
TL
2285 }
2286
7c673cae
FG
2287 return 0;
2288}
2289
2290
2291#undef dout_prefix
2292#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2293
2294// cons/des
2295
20effc67
TL
2296OSD::OSD(CephContext *cct_,
2297 std::unique_ptr<ObjectStore> store_,
7c673cae
FG
2298 int id,
2299 Messenger *internal_messenger,
2300 Messenger *external_messenger,
2301 Messenger *hb_client_front,
2302 Messenger *hb_client_back,
2303 Messenger *hb_front_serverm,
2304 Messenger *hb_back_serverm,
2305 Messenger *osdc_messenger,
2306 MonClient *mc,
f67539c2
TL
2307 const std::string &dev, const std::string &jdev,
2308 ceph::async::io_context_pool& poolctx) :
7c673cae 2309 Dispatcher(cct_),
7c673cae 2310 tick_timer(cct, osd_lock),
7c673cae 2311 tick_timer_without_osd_lock(cct, tick_timer_lock),
11fdf7f2 2312 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
7c673cae
FG
2313 cluster_messenger(internal_messenger),
2314 client_messenger(external_messenger),
2315 objecter_messenger(osdc_messenger),
2316 monc(mc),
9f95a23c 2317 mgrc(cct_, client_messenger, &mc->monmap),
f67539c2
TL
2318 logger(create_logger()),
2319 recoverystate_perf(create_recoverystate_perf()),
20effc67 2320 store(std::move(store_)),
7c673cae
FG
2321 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2322 clog(log_client.create_channel()),
2323 whoami(id),
2324 dev_path(dev), journal_path(jdev),
31f18b77 2325 store_is_rotational(store->is_rotational()),
7c673cae
FG
2326 trace_endpoint("0.0.0.0", 0, "osd"),
2327 asok_hook(NULL),
11fdf7f2
TL
2328 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2329 "osd_pg_epoch_max_lag_factor")),
7c673cae 2330 osd_compat(get_osd_compat_set()),
7c673cae 2331 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 2332 get_num_op_threads()),
7c673cae
FG
2333 heartbeat_stop(false),
2334 heartbeat_need_update(true),
2335 hb_front_client_messenger(hb_client_front),
2336 hb_back_client_messenger(hb_client_back),
2337 hb_front_server_messenger(hb_front_serverm),
2338 hb_back_server_messenger(hb_back_serverm),
2339 daily_loadavg(0.0),
2340 heartbeat_thread(this),
2341 heartbeat_dispatcher(this),
2342 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2343 cct->_conf->osd_num_op_tracker_shard),
2344 test_ops_hook(NULL),
7c673cae 2345 op_shardedwq(
7c673cae 2346 this,
f67539c2
TL
2347 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2348 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
7c673cae 2349 &osd_op_tp),
7c673cae 2350 last_pg_create_epoch(0),
11fdf7f2 2351 boot_finisher(cct),
7c673cae
FG
2352 up_thru_wanted(0),
2353 requested_full_first(0),
2354 requested_full_last(0),
f67539c2 2355 service(this, poolctx)
7c673cae 2356{
11fdf7f2
TL
2357
2358 if (!gss_ktfile_client.empty()) {
f67539c2
TL
2359 // Assert we can export environment variable
2360 /*
11fdf7f2
TL
2361 The default client keytab is used, if it is present and readable,
2362 to automatically obtain initial credentials for GSSAPI client
2363 applications. The principal name of the first entry in the client
2364 keytab is used by default when obtaining initial credentials.
2365 1. The KRB5_CLIENT_KTNAME environment variable.
2366 2. The default_client_keytab_name profile variable in [libdefaults].
2367 3. The hardcoded default, DEFCKTNAME.
2368 */
f67539c2 2369 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
11fdf7f2
TL
2370 gss_ktfile_client.c_str(), 1));
2371 ceph_assert(set_result == 0);
2372 }
2373
7c673cae
FG
2374 monc->set_messenger(client_messenger);
2375 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2376 cct->_conf->osd_op_log_threshold);
2377 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2378 cct->_conf->osd_op_history_duration);
2379 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2380 cct->_conf->osd_op_history_slow_op_threshold);
9f95a23c 2381 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
7c673cae
FG
2382#ifdef WITH_BLKIN
2383 std::stringstream ss;
2384 ss << "osd." << whoami;
2385 trace_endpoint.copy_name(ss.str());
2386#endif
11fdf7f2
TL
2387
2388 // initialize shards
2389 num_shards = get_num_op_shards();
2390 for (uint32_t i = 0; i < num_shards; i++) {
2391 OSDShard *one_shard = new OSDShard(
2392 i,
2393 cct,
9f95a23c 2394 this);
11fdf7f2
TL
2395 shards.push_back(one_shard);
2396 }
7c673cae
FG
2397}
2398
2399OSD::~OSD()
2400{
11fdf7f2
TL
2401 while (!shards.empty()) {
2402 delete shards.back();
2403 shards.pop_back();
2404 }
7c673cae
FG
2405 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2406 cct->get_perfcounters_collection()->remove(logger);
2407 delete recoverystate_perf;
2408 delete logger;
7c673cae
FG
2409}
2410
91327a77
AA
2411double OSD::get_tick_interval() const
2412{
2413 // vary +/- 5% to avoid scrub scheduling livelocks
2414 constexpr auto delta = 0.05;
91327a77 2415 return (OSD_TICK_INTERVAL *
11fdf7f2 2416 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2417}
2418
7c673cae
FG
2419void OSD::handle_signal(int signum)
2420{
11fdf7f2 2421 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2422 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2423 shutdown();
2424}
2425
2426int OSD::pre_init()
2427{
11fdf7f2 2428 std::lock_guard lock(osd_lock);
7c673cae
FG
2429 if (is_stopping())
2430 return 0;
2431
2432 if (store->test_mount_in_use()) {
2433 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2434 << "currently in use. (Is ceph-osd already running?)" << dendl;
2435 return -EBUSY;
2436 }
2437
11fdf7f2
TL
2438 cct->_conf.add_observer(this);
2439 return 0;
2440}
2441
2442int OSD::set_numa_affinity()
2443{
2444 // storage numa node
2445 int store_node = -1;
2446 store->get_numa_node(&store_node, nullptr, nullptr);
2447 if (store_node >= 0) {
2448 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2449 }
2450
2451 // check network numa node(s)
2452 int front_node = -1, back_node = -1;
2453 string front_iface = pick_iface(
2454 cct,
2455 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2456 string back_iface = pick_iface(
2457 cct,
2458 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2459 int r = get_iface_numa_node(front_iface, &front_node);
92f5a8d4 2460 if (r >= 0 && front_node >= 0) {
11fdf7f2 2461 dout(1) << __func__ << " public network " << front_iface << " numa node "
92f5a8d4 2462 << front_node << dendl;
11fdf7f2 2463 r = get_iface_numa_node(back_iface, &back_node);
92f5a8d4 2464 if (r >= 0 && back_node >= 0) {
11fdf7f2
TL
2465 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2466 << back_node << dendl;
2467 if (front_node == back_node &&
2468 front_node == store_node) {
2469 dout(1) << " objectstore and network numa nodes all match" << dendl;
2470 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2471 numa_node = front_node;
2472 }
92f5a8d4
TL
2473 } else if (front_node != back_node) {
2474 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2475 << dendl;
11fdf7f2
TL
2476 } else {
2477 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2478 << dendl;
2479 }
92f5a8d4
TL
2480 } else if (back_node == -2) {
2481 dout(1) << __func__ << " cluster network " << back_iface
2482 << " ports numa nodes do not match" << dendl;
2483 } else {
2484 derr << __func__ << " unable to identify cluster interface '" << back_iface
2485 << "' numa node: " << cpp_strerror(r) << dendl;
11fdf7f2 2486 }
92f5a8d4
TL
2487 } else if (front_node == -2) {
2488 dout(1) << __func__ << " public network " << front_iface
2489 << " ports numa nodes do not match" << dendl;
11fdf7f2
TL
2490 } else {
2491 derr << __func__ << " unable to identify public interface '" << front_iface
2492 << "' numa node: " << cpp_strerror(r) << dendl;
2493 }
2494 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2495 // this takes precedence over the automagic logic above
2496 numa_node = node;
2497 }
2498 if (numa_node >= 0) {
2499 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2500 if (r < 0) {
2501 dout(1) << __func__ << " unable to determine numa node " << numa_node
2502 << " CPUs" << dendl;
2503 numa_node = -1;
2504 } else {
2505 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2506 << " cpus "
2507 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2508 << dendl;
92f5a8d4 2509 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
11fdf7f2
TL
2510 if (r < 0) {
2511 r = -errno;
2512 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2513 << dendl;
2514 numa_node = -1;
2515 }
2516 }
2517 } else {
2518 dout(1) << __func__ << " not setting numa affinity" << dendl;
2519 }
7c673cae
FG
2520 return 0;
2521}
2522
2523// asok
2524
2525class OSDSocketHook : public AdminSocketHook {
2526 OSD *osd;
2527public:
2528 explicit OSDSocketHook(OSD *o) : osd(o) {}
9f95a23c 2529 int call(std::string_view prefix, const cmdmap_t& cmdmap,
39ae355f 2530 const bufferlist& inbl,
9f95a23c
TL
2531 Formatter *f,
2532 std::ostream& ss,
2533 bufferlist& out) override {
2534 ceph_abort("should use async hook");
2535 }
2536 void call_async(
2537 std::string_view prefix,
2538 const cmdmap_t& cmdmap,
2539 Formatter *f,
2540 const bufferlist& inbl,
2541 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
11fdf7f2 2542 try {
9f95a23c
TL
2543 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2544 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2545 bufferlist empty;
2546 on_finish(-EINVAL, e.what(), empty);
11fdf7f2 2547 }
7c673cae
FG
2548 }
2549};
2550
11fdf7f2
TL
2551std::set<int64_t> OSD::get_mapped_pools()
2552{
2553 std::set<int64_t> pools;
2554 std::vector<spg_t> pgids;
2555 _get_pgids(&pgids);
2556 for (const auto &pgid : pgids) {
2557 pools.insert(pgid.pool());
2558 }
2559 return pools;
2560}
2561
20effc67
TL
2562OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2563 stringstream& ss,
2564 bool only_primary)
2565{
2566 string pgidstr;
2567 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2568 ss << "no pgid specified";
2569 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2570 }
2571
2572 pg_t pgid;
2573 if (!pgid.parse(pgidstr.c_str())) {
2574 ss << "couldn't parse pgid '" << pgidstr << "'";
2575 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2576 }
2577
2578 spg_t pcand;
2579 PGRef pg;
2580 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2581 if (pg->is_primary() || !only_primary) {
2582 return OSD::PGRefOrError{pg, 0};
2583 }
2584
2585 ss << "not primary for pgid " << pgid;
2586 pg->unlock();
2587 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2588 } else {
2589 ss << "i don't have pgid " << pgid;
2590 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2591 }
2592}
2593
2594// note that the cmdmap is explicitly copied into asok_route_to_pg()
2595int OSD::asok_route_to_pg(
2596 bool only_primary,
2597 std::string_view prefix,
2598 cmdmap_t cmdmap,
2599 Formatter* f,
2600 stringstream& ss,
2601 const bufferlist& inbl,
2602 bufferlist& outbl,
2603 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2604{
2605 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2606
2607 if (!target_pg.has_value()) {
2608 // 'ss' and 'ret' already contain the error information
2609 on_finish(ret, ss.str(), outbl);
2610 return ret;
2611 }
2612
2613 // the PG was locked by locate_asok_target()
2614 try {
2615 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2616 (*target_pg)->unlock();
2617 return 0; // the pg handler calls on_finish directly
2618 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2619 (*target_pg)->unlock();
2620 ss << e.what();
2621 on_finish(ret, ss.str(), outbl);
2622 return -EINVAL;
2623 }
2624}
2625
9f95a23c
TL
2626void OSD::asok_command(
2627 std::string_view prefix, const cmdmap_t& cmdmap,
2628 Formatter *f,
2629 const bufferlist& inbl,
2630 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2631{
9f95a23c
TL
2632 int ret = 0;
2633 stringstream ss; // stderr error message stream
2634 bufferlist outbl; // if empty at end, we'll dump formatter as output
2635
2636 // --- PG commands are routed here to PG::do_command ---
2637 if (prefix == "pg" ||
2638 prefix == "query" ||
1e59de90 2639 prefix == "log" ||
9f95a23c
TL
2640 prefix == "mark_unfound_lost" ||
2641 prefix == "list_unfound" ||
2642 prefix == "scrub" ||
2643 prefix == "deep_scrub"
2644 ) {
2645 string pgidstr;
2646 pg_t pgid;
2647 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2648 ss << "no pgid specified";
2649 ret = -EINVAL;
2650 goto out;
2651 }
2652 if (!pgid.parse(pgidstr.c_str())) {
2653 ss << "couldn't parse pgid '" << pgidstr << "'";
2654 ret = -EINVAL;
2655 goto out;
2656 }
2657 spg_t pcand;
2658 PGRef pg;
2659 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2660 (pg = _lookup_lock_pg(pcand))) {
2661 if (pg->is_primary()) {
2662 cmdmap_t new_cmdmap = cmdmap;
2663 try {
2664 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2665 pg->unlock();
2666 return; // the pg handler calls on_finish directly
2667 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2668 pg->unlock();
2669 ss << e.what();
2670 ret = -EINVAL;
2671 goto out;
2672 }
2673 } else {
2674 ss << "not primary for pgid " << pgid;
2675 // do not reply; they will get newer maps and realize they
2676 // need to resend.
2677 pg->unlock();
2678 ret = -EAGAIN;
2679 goto out;
2680 }
2681 } else {
2682 ss << "i don't have pgid " << pgid;
2683 ret = -ENOENT;
2684 }
2685 }
2686
20effc67
TL
2687 // --- PG commands that will be answered even if !primary ---
2688
2689 else if (prefix == "scrubdebug") {
2690 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2691 return;
2692 }
2693
9f95a23c
TL
2694 // --- OSD commands follow ---
2695
2696 else if (prefix == "status") {
2697 lock_guard l(osd_lock);
7c673cae
FG
2698 f->open_object_section("status");
2699 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2700 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2701 f->dump_unsigned("whoami", superblock.whoami);
2702 f->dump_string("state", get_state_name(get_state()));
2703 f->dump_unsigned("oldest_map", superblock.oldest_map);
1e59de90
TL
2704 f->dump_unsigned("cluster_osdmap_trim_lower_bound",
2705 superblock.cluster_osdmap_trim_lower_bound);
7c673cae 2706 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2707 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2708 f->close_section();
9f95a23c 2709 } else if (prefix == "flush_journal") {
7c673cae 2710 store->flush_journal();
9f95a23c
TL
2711 } else if (prefix == "dump_ops_in_flight" ||
2712 prefix == "ops" ||
2713 prefix == "dump_blocked_ops" ||
1e59de90 2714 prefix == "dump_blocked_ops_count" ||
9f95a23c
TL
2715 prefix == "dump_historic_ops" ||
2716 prefix == "dump_historic_ops_by_duration" ||
2717 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2718
2719 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2720even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2721will start to track new ops received afterwards.";
2722
2723 set<string> filters;
2724 vector<string> filter_str;
9f95a23c 2725 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2726 copy(filter_str.begin(), filter_str.end(),
2727 inserter(filters, filters.end()));
2728 }
2729
9f95a23c
TL
2730 if (prefix == "dump_ops_in_flight" ||
2731 prefix == "ops") {
c07f9fc5
FG
2732 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2733 ss << error_str;
9f95a23c
TL
2734 ret = -EINVAL;
2735 goto out;
c07f9fc5
FG
2736 }
2737 }
9f95a23c 2738 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2739 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2740 ss << error_str;
9f95a23c
TL
2741 ret = -EINVAL;
2742 goto out;
c07f9fc5
FG
2743 }
2744 }
1e59de90
TL
2745 if (prefix == "dump_blocked_ops_count") {
2746 if (!op_tracker.dump_ops_in_flight(f, true, filters, true)) {
2747 ss << error_str;
2748 ret = -EINVAL;
2749 goto out;
2750 }
2751 }
9f95a23c 2752 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2753 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2754 ss << error_str;
9f95a23c
TL
2755 ret = -EINVAL;
2756 goto out;
c07f9fc5
FG
2757 }
2758 }
9f95a23c 2759 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2760 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2761 ss << error_str;
9f95a23c
TL
2762 ret = -EINVAL;
2763 goto out;
c07f9fc5
FG
2764 }
2765 }
9f95a23c 2766 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2767 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2768 ss << error_str;
9f95a23c
TL
2769 ret = -EINVAL;
2770 goto out;
c07f9fc5 2771 }
7c673cae 2772 }
9f95a23c 2773 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2774 f->open_object_section("pq");
2775 op_shardedwq.dump(f);
2776 f->close_section();
f67539c2 2777 } else if (prefix == "dump_blocklist") {
7c673cae 2778 list<pair<entity_addr_t,utime_t> > bl;
33c7a0ef 2779 list<pair<entity_addr_t,utime_t> > rbl;
7c673cae 2780 OSDMapRef curmap = service.get_osdmap();
33c7a0ef 2781 curmap->get_blocklist(&bl, &rbl);
7c673cae 2782
f67539c2 2783 f->open_array_section("blocklist");
7c673cae
FG
2784 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2785 it != bl.end(); ++it) {
224ce89b 2786 f->open_object_section("entry");
7c673cae
FG
2787 f->open_object_section("entity_addr_t");
2788 it->first.dump(f);
2789 f->close_section(); //entity_addr_t
2790 it->second.localtime(f->dump_stream("expire_time"));
2791 f->close_section(); //entry
2792 }
f67539c2 2793 f->close_section(); //blocklist
33c7a0ef
TL
2794 f->open_array_section("range_blocklist");
2795 for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
2796 it != rbl.end(); ++it) {
2797 f->open_object_section("entry");
2798 f->open_object_section("entity_addr_t");
2799 it->first.dump(f);
2800 f->close_section(); //entity_addr_t
2801 it->second.localtime(f->dump_stream("expire_time"));
2802 f->close_section(); //entry
2803 }
2804 f->close_section(); //blocklist
9f95a23c 2805 } else if (prefix == "dump_watchers") {
7c673cae
FG
2806 list<obj_watch_item_t> watchers;
2807 // scan pg's
11fdf7f2
TL
2808 vector<PGRef> pgs;
2809 _get_pgs(&pgs);
2810 for (auto& pg : pgs) {
2811 list<obj_watch_item_t> pg_watchers;
2812 pg->get_watchers(&pg_watchers);
2813 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2814 }
2815
2816 f->open_array_section("watchers");
2817 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2818 it != watchers.end(); ++it) {
2819
224ce89b 2820 f->open_object_section("watch");
7c673cae
FG
2821
2822 f->dump_string("namespace", it->obj.nspace);
2823 f->dump_string("object", it->obj.oid.name);
2824
2825 f->open_object_section("entity_name");
2826 it->wi.name.dump(f);
2827 f->close_section(); //entity_name_t
2828
224ce89b
WB
2829 f->dump_unsigned("cookie", it->wi.cookie);
2830 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2831
2832 f->open_object_section("entity_addr_t");
2833 it->wi.addr.dump(f);
2834 f->close_section(); //entity_addr_t
2835
2836 f->close_section(); //watch
2837 }
2838
2839 f->close_section(); //watchers
9f95a23c 2840 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2841 f->open_object_section("reservations");
2842 f->open_object_section("local_reservations");
2843 service.local_reserver.dump(f);
2844 f->close_section();
2845 f->open_object_section("remote_reservations");
2846 service.remote_reserver.dump(f);
2847 f->close_section();
2848 f->close_section();
9f95a23c 2849 } else if (prefix == "dump_scrub_reservations") {
eafe8130 2850 f->open_object_section("scrub_reservations");
20effc67 2851 service.get_scrub_services().dump_scrub_reservations(f);
eafe8130 2852 f->close_section();
9f95a23c 2853 } else if (prefix == "get_latest_osdmap") {
7c673cae 2854 get_latest_osdmap();
9f95a23c 2855 } else if (prefix == "set_heap_property") {
7c673cae
FG
2856 string property;
2857 int64_t value = 0;
2858 string error;
2859 bool success = false;
9f95a23c 2860 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2861 error = "unable to get property";
2862 success = false;
9f95a23c 2863 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2864 error = "unable to get value";
2865 success = false;
2866 } else if (value < 0) {
2867 error = "negative value not allowed";
2868 success = false;
2869 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2870 error = "invalid property";
2871 success = false;
2872 } else {
2873 success = true;
2874 }
2875 f->open_object_section("result");
2876 f->dump_string("error", error);
2877 f->dump_bool("success", success);
2878 f->close_section();
9f95a23c 2879 } else if (prefix == "get_heap_property") {
7c673cae
FG
2880 string property;
2881 size_t value = 0;
2882 string error;
2883 bool success = false;
9f95a23c 2884 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2885 error = "unable to get property";
2886 success = false;
2887 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2888 error = "invalid property";
2889 success = false;
2890 } else {
2891 success = true;
2892 }
2893 f->open_object_section("result");
2894 f->dump_string("error", error);
2895 f->dump_bool("success", success);
2896 f->dump_int("value", value);
2897 f->close_section();
9f95a23c 2898 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2899 store->get_db_statistics(f);
9f95a23c 2900 } else if (prefix == "dump_scrubs") {
20effc67 2901 service.get_scrub_services().dump_scrubs(f);
9f95a23c 2902 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2903 store->generate_db_histogram(f);
9f95a23c 2904 } else if (prefix == "flush_store_cache") {
11fdf7f2 2905 store->flush_cache(&ss);
39ae355f
TL
2906 } else if (prefix == "rotate-stored-key") {
2907 store->write_meta("osd_key", inbl.to_str());
9f95a23c 2908 } else if (prefix == "dump_pgstate_history") {
7c673cae 2909 f->open_object_section("pgstate_history");
9f95a23c 2910 f->open_array_section("pgs");
11fdf7f2
TL
2911 vector<PGRef> pgs;
2912 _get_pgs(&pgs);
2913 for (auto& pg : pgs) {
9f95a23c 2914 f->open_object_section("pg");
11fdf7f2 2915 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2916 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2917 pg->dump_pgstate_history(f);
9f95a23c 2918 f->close_section();
7c673cae
FG
2919 }
2920 f->close_section();
9f95a23c
TL
2921 f->close_section();
2922 } else if (prefix == "compact") {
224ce89b
WB
2923 dout(1) << "triggering manual compaction" << dendl;
2924 auto start = ceph::coarse_mono_clock::now();
2925 store->compact();
2926 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2927 double duration = std::chrono::duration<double>(end-start).count();
f67539c2 2928 dout(1) << "finished manual compaction in "
11fdf7f2 2929 << duration
224ce89b
WB
2930 << " seconds" << dendl;
2931 f->open_object_section("compact_result");
11fdf7f2
TL
2932 f->dump_float("elapsed_time", duration);
2933 f->close_section();
9f95a23c 2934 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2935 f->open_array_section("mapped_pools");
2936 set<int64_t> poollist = get_mapped_pools();
2937 for (auto pool : poollist) {
2938 f->dump_int("pool_id", pool);
2939 }
2940 f->close_section();
9f95a23c 2941 } else if (prefix == "smart") {
11fdf7f2 2942 string devid;
9f95a23c
TL
2943 cmd_getval(cmdmap, "devid", devid);
2944 ostringstream out;
2945 probe_smart(devid, out);
2946 outbl.append(out.str());
2947 } else if (prefix == "list_devices") {
11fdf7f2
TL
2948 set<string> devnames;
2949 store->get_devices(&devnames);
9f95a23c 2950 f->open_array_section("list_devices");
11fdf7f2
TL
2951 for (auto dev : devnames) {
2952 if (dev.find("dm-") == 0) {
2953 continue;
2954 }
9f95a23c
TL
2955 string err;
2956 f->open_object_section("device");
11fdf7f2 2957 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2958 f->dump_string("device_id", get_device_id(dev, &err));
2959 f->close_section();
11fdf7f2 2960 }
224ce89b 2961 f->close_section();
9f95a23c
TL
2962 } else if (prefix == "send_beacon") {
2963 lock_guard l(osd_lock);
11fdf7f2
TL
2964 if (is_active()) {
2965 send_beacon(ceph::coarse_mono_clock::now());
2966 }
9f95a23c
TL
2967 }
2968
2969 else if (prefix == "cluster_log") {
2970 vector<string> msg;
2971 cmd_getval(cmdmap, "message", msg);
2972 if (msg.empty()) {
2973 ret = -EINVAL;
2974 ss << "ignoring empty log message";
2975 goto out;
2976 }
2977 string message = msg.front();
2978 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2979 message += " " + *a;
2980 string lvl;
2981 cmd_getval(cmdmap, "level", lvl);
2982 clog_type level = string_to_clog_type(lvl);
2983 if (level < 0) {
2984 ret = -EINVAL;
2985 ss << "unknown level '" << lvl << "'";
2986 goto out;
2987 }
2988 clog->do_log(level, message);
2989 }
2990
2991 else if (prefix == "bench") {
9f95a23c 2992 // default count 1G, size 4MB
20effc67
TL
2993 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2994 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2995 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2996 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
a4b75251 2997 double elapsed = 0.0;
9f95a23c 2998
a4b75251
TL
2999 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
3000 if (ret != 0) {
9f95a23c 3001 goto out;
9f95a23c
TL
3002 }
3003
9f95a23c
TL
3004 double rate = count / elapsed;
3005 double iops = rate / bsize;
3006 f->open_object_section("osd_bench_results");
3007 f->dump_int("bytes_written", count);
3008 f->dump_int("blocksize", bsize);
3009 f->dump_float("elapsed_sec", elapsed);
3010 f->dump_float("bytes_per_sec", rate);
3011 f->dump_float("iops", iops);
3012 f->close_section();
3013 }
3014
3015 else if (prefix == "flush_pg_stats") {
3016 mgrc.send_pgstats();
3017 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
3018 }
3019
3020 else if (prefix == "heap") {
2a845540
TL
3021 std::stringstream outss;
3022 ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
3023 outbl.append(outss);
9f95a23c
TL
3024 }
3025
3026 else if (prefix == "debug dump_missing") {
3027 f->open_array_section("pgs");
3028 vector<PGRef> pgs;
3029 _get_pgs(&pgs);
3030 for (auto& pg : pgs) {
3031 string s = stringify(pg->pg_id);
3032 f->open_array_section(s.c_str());
3033 pg->lock();
3034 pg->dump_missing(f);
3035 pg->unlock();
3036 f->close_section();
3037 }
3038 f->close_section();
3039 }
3040
3041 else if (prefix == "debug kick_recovery_wq") {
3042 int64_t delay;
3043 cmd_getval(cmdmap, "delay", delay);
3044 ostringstream oss;
3045 oss << delay;
3046 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
3047 if (ret != 0) {
3048 ss << "kick_recovery_wq: error setting "
3049 << "osd_recovery_delay_start to '" << delay << "': error "
3050 << ret;
3051 goto out;
3052 }
3053 cct->_conf.apply_changes(nullptr);
3054 ss << "kicking recovery queue. set osd_recovery_delay_start "
3055 << "to " << cct->_conf->osd_recovery_delay_start;
3056 }
3057
3058 else if (prefix == "cpu_profiler") {
3059 ostringstream ds;
3060 string arg;
3061 cmd_getval(cmdmap, "arg", arg);
3062 vector<string> argvec;
3063 get_str_vec(arg, argvec);
3064 cpu_profiler_handle_command(argvec, ds);
3065 outbl.append(ds.str());
3066 }
3067
3068 else if (prefix == "dump_pg_recovery_stats") {
3069 lock_guard l(osd_lock);
3070 pg_recovery_stats.dump_formatted(f);
3071 }
3072
3073 else if (prefix == "reset_pg_recovery_stats") {
3074 lock_guard l(osd_lock);
3075 pg_recovery_stats.reset();
3076 }
3077
3078 else if (prefix == "perf histogram dump") {
3079 std::string logger;
3080 std::string counter;
3081 cmd_getval(cmdmap, "logger", logger);
3082 cmd_getval(cmdmap, "counter", counter);
3083 cct->get_perfcounters_collection()->dump_formatted_histograms(
3084 f, false, logger, counter);
3085 }
3086
3087 else if (prefix == "cache drop") {
3088 lock_guard l(osd_lock);
3089 dout(20) << "clearing all caches" << dendl;
3090 // Clear the objectstore's cache - onode and buffer for Bluestore,
3091 // system's pagecache for Filestore
3092 ret = store->flush_cache(&ss);
3093 if (ret < 0) {
3094 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
3095 goto out;
3096 }
3097 // Clear the objectcontext cache (per PG)
3098 vector<PGRef> pgs;
3099 _get_pgs(&pgs);
3100 for (auto& pg: pgs) {
3101 pg->clear_cache();
3102 }
3103 }
3104
3105 else if (prefix == "cache status") {
3106 lock_guard l(osd_lock);
3107 int obj_ctx_count = 0;
3108 vector<PGRef> pgs;
3109 _get_pgs(&pgs);
3110 for (auto& pg: pgs) {
3111 obj_ctx_count += pg->get_cache_obj_count();
3112 }
3113 f->open_object_section("cache_status");
3114 f->dump_int("object_ctx", obj_ctx_count);
3115 store->dump_cache_stats(f);
3116 f->close_section();
3117 }
3118
3119 else if (prefix == "scrub_purged_snaps") {
3120 lock_guard l(osd_lock);
3121 scrub_purged_snaps();
3122 }
3123
3124 else if (prefix == "dump_osd_network") {
3125 lock_guard l(osd_lock);
3126 int64_t value = 0;
3127 if (!(cmd_getval(cmdmap, "value", value))) {
3128 // Convert milliseconds to microseconds
3129 value = static_cast<double>(g_conf().get_val<double>(
3130 "mon_warn_on_slow_ping_time")) * 1000;
3131 if (value == 0) {
3132 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3133 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3134 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3135 }
3136 } else {
3137 // Convert user input to microseconds
3138 value *= 1000;
3139 }
3140 if (value < 0) value = 0;
3141
3142 struct osd_ping_time_t {
3143 uint32_t pingtime;
3144 int to;
3145 bool back;
3146 std::array<uint32_t,3> times;
3147 std::array<uint32_t,3> min;
3148 std::array<uint32_t,3> max;
3149 uint32_t last;
3150 uint32_t last_update;
3151
3152 bool operator<(const osd_ping_time_t& rhs) const {
3153 if (pingtime < rhs.pingtime)
3154 return true;
3155 if (pingtime > rhs.pingtime)
3156 return false;
3157 if (to < rhs.to)
3158 return true;
3159 if (to > rhs.to)
3160 return false;
3161 return back;
3162 }
3163 };
3164
3165 set<osd_ping_time_t> sorted;
3166 // Get pingtimes under lock and not on the stack
eafe8130
TL
3167 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3168 service.get_hb_pingtime(pingtimes);
3169 for (auto j : *pingtimes) {
3170 if (j.second.last_update == 0)
3171 continue;
3172 osd_ping_time_t item;
3173 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3174 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3175 if (item.pingtime >= value) {
3176 item.to = j.first;
3177 item.times[0] = j.second.back_pingtime[0];
3178 item.times[1] = j.second.back_pingtime[1];
3179 item.times[2] = j.second.back_pingtime[2];
3180 item.min[0] = j.second.back_min[0];
3181 item.min[1] = j.second.back_min[1];
3182 item.min[2] = j.second.back_min[2];
3183 item.max[0] = j.second.back_max[0];
3184 item.max[1] = j.second.back_max[1];
3185 item.max[2] = j.second.back_max[2];
3186 item.last = j.second.back_last;
3187 item.back = true;
3188 item.last_update = j.second.last_update;
3189 sorted.emplace(item);
3190 }
3191 if (j.second.front_last == 0)
3192 continue;
3193 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3194 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3195 if (item.pingtime >= value) {
3196 item.to = j.first;
3197 item.times[0] = j.second.front_pingtime[0];
3198 item.times[1] = j.second.front_pingtime[1];
3199 item.times[2] = j.second.front_pingtime[2];
3200 item.min[0] = j.second.front_min[0];
3201 item.min[1] = j.second.front_min[1];
3202 item.min[2] = j.second.front_min[2];
3203 item.max[0] = j.second.front_max[0];
3204 item.max[1] = j.second.front_max[1];
3205 item.max[2] = j.second.front_max[2];
3206 item.last = j.second.front_last;
3207 item.last_update = j.second.last_update;
3208 item.back = false;
3209 sorted.emplace(item);
3210 }
3211 }
3212 delete pingtimes;
3213 //
3214 // Network ping times (1min 5min 15min)
3215 f->open_object_section("network_ping_times");
3216 f->dump_int("threshold", value / 1000);
3217 f->open_array_section("entries");
3218 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3219 ceph_assert(sitem.pingtime >= value);
3220 f->open_object_section("entry");
3221
3222 const time_t lu(sitem.last_update);
3223 char buffer[26];
3224 string lustr(ctime_r(&lu, buffer));
3225 lustr.pop_back(); // Remove trailing \n
3226 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3227 f->dump_string("last update", lustr);
3228 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3229 f->dump_int("from osd", whoami);
3230 f->dump_int("to osd", sitem.to);
3231 f->dump_string("interface", (sitem.back ? "back" : "front"));
3232 f->open_object_section("average");
3233 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3234 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3235 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3236 f->close_section(); // average
3237 f->open_object_section("min");
3238 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3239 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3240 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3241 f->close_section(); // min
3242 f->open_object_section("max");
3243 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3244 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3245 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3246 f->close_section(); // max
3247 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3248 f->close_section(); // entry
3249 }
3250 f->close_section(); // entries
3251 f->close_section(); // network_ping_times
20effc67
TL
3252 } else if (prefix == "dump_pool_statfs") {
3253 lock_guard l(osd_lock);
3254
3255 int64_t p = 0;
3256 if (!(cmd_getval(cmdmap, "poolid", p))) {
3257 ss << "Error dumping pool statfs: no poolid provided";
3258 ret = -EINVAL;
3259 goto out;
3260 }
3261
3262 store_statfs_t st;
3263 bool per_pool_omap_stats = false;
3264
3265 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3266 if (ret < 0) {
3267 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3268 goto out;
3269 } else {
3270 ss << "dumping pool statfs...";
3271 f->open_object_section("pool_statfs");
3272 f->dump_int("poolid", p);
3273 st.dump(f);
3274 f->close_section();
3275 }
7c673cae 3276 } else {
11fdf7f2 3277 ceph_abort_msg("broken asok registration");
7c673cae 3278 }
9f95a23c
TL
3279
3280 out:
3281 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3282}
3283
a4b75251
TL
// Run a synthetic write workload against this OSD's local ObjectStore to
// measure raw backend write bandwidth (backs the "ceph tell osd.N bench"
// admin command; the caller formats rate/iops from *elapsed).
//
// @param count    total number of bytes to write
// @param bsize    per-write block size in bytes
// @param osize    object size in bytes; with onum, writes go to a fixed
//                 pool of pre-created objects at random offsets
// @param onum     number of objects to pre-create; 0 (or osize == 0) means
//                 one fresh object per write, removed again afterwards
// @param elapsed  [out] wall-clock seconds taken by the timed write phase
// @param ss       sink for a human-readable error message on rejection
// @return 0 on success, -EINVAL if the parameters fail the sanity caps
int OSD::run_osd_bench_test(
  int64_t count,
  int64_t bsize,
  int64_t osize,
  int64_t onum,
  double *elapsed,
  ostream &ss)
{
  int ret = 0;
  // Seed the (non-cryptographic) PRNG used below for object selection,
  // offsets, and fill bytes.
  srand(time(NULL) % (unsigned long) -1);
  uint32_t duration = cct->_conf->osd_bench_duration;

  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
    // let us limit the block size because the next checks rely on it
    // having a sane value. If we allow any block size to be set things
    // can still go sideways.
    ss << "block 'size' values are capped at "
       << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
       << " a higher value, please adjust 'osd_bench_max_block_size'";
    ret = -EINVAL;
    return ret;
  } else if (bsize < (int64_t) (1 << 20)) {
    // entering the realm of small block sizes.
    // limit the count to a sane value, assuming a configurable amount of
    // IOPS and duration, so that the OSD doesn't get hung up on this,
    // preventing timeouts from going off
    int64_t max_count =
      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
    if (count > max_count) {
      ss << "'count' values greater than " << max_count
         << " for a block size of " << byte_u_t(bsize) << ", assuming "
         << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
         << " for " << duration << " seconds,"
         << " can cause ill effects on osd. "
         << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
         << " value if you wish to use a higher 'count'.";
      ret = -EINVAL;
      return ret;
    }
  } else {
    // 1MB block sizes are big enough so that we get more stuff done.
    // However, to avoid the osd from getting hung on this and having
    // timers being triggered, we are going to limit the count assuming
    // a configurable throughput and duration.
    // NOTE: max_count is the total amount of bytes that we believe we
    //       will be able to write during 'duration' for the given
    //       throughput.  The block size hardly impacts this unless it's
    //       way too big.  Given we already check how big the block size
    //       is, it's safe to assume everything will check out.
    int64_t max_count =
      cct->_conf->osd_bench_large_size_max_throughput * duration;
    if (count > max_count) {
      ss << "'count' values greater than " << max_count
         << " for a block size of " << byte_u_t(bsize) << ", assuming "
         << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
         << " for " << duration << " seconds,"
         << " can cause ill effects on osd. "
         << " Please adjust 'osd_bench_large_size_max_throughput'"
         << " with a higher value if you wish to use a higher 'count'.";
      ret = -EINVAL;
      return ret;
    }
  }

  // A write cannot be larger than the object it lands in; this also keeps
  // the offset arithmetic below (osize / bsize) well defined.
  if (osize && bsize > osize) {
    bsize = osize;
  }

  dout(1) << " bench count " << count
          << " bsize " << byte_u_t(bsize) << dendl;

  // Accumulates removals for every object we create, queued once at the end.
  ObjectStore::Transaction cleanupt;

  if (osize && onum) {
    // Pre-create the fixed object pool ('a'-filled, page-aligned) so the
    // timed phase measures overwrites rather than allocations.
    bufferlist bl;
    bufferptr bp(osize);
    memset(bp.c_str(), 'a', bp.length());
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();
    for (int i=0; i<onum; ++i) {
      char nm[30];
      snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      // NOTE(review): prepopulation writes to coll_t() while the timed loop
      // uses coll_t::meta() — presumably the default coll_t is the same meta
      // collection; confirm against coll_t's definition.
      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
      store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      cleanupt.remove(coll_t(), ghobject_t(soid));
    }
  }

  // Wait for the prepopulation writes to commit so they are excluded from
  // the timed measurement.
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }

  bufferlist bl;
  utime_t start = ceph_clock_now();
  for (int64_t pos = 0; pos < count; pos += bsize) {
    char nm[34];
    unsigned offset = 0;
    // Fresh randomized, page-aligned payload each iteration.
    bufferptr bp(bsize);
    memset(bp.c_str(), rand() & 0xff, bp.length());
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();
    if (onum && osize) {
      // Overwrite a random bsize-aligned extent of a random pooled object.
      snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
      offset = rand() % (osize / bsize) * bsize;
    } else {
      // One brand-new object per write.
      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
    }
    object_t oid(nm);
    hobject_t soid(sobject_t(oid, 0));
    ObjectStore::Transaction t;
    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
    store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    if (!onum || !osize) {
      // Per-write objects must be cleaned up too.
      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }
    bl.clear();
  }

  // Wait for all timed writes to commit before stopping the clock, so
  // *elapsed reflects durable throughput rather than queueing speed.
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }
  utime_t end = ceph_clock_now();
  *elapsed = end - start;

  // clean up
  store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
  {
    C_SaferCond waiter;
    if (!service.meta_ch->flush_commit(&waiter)) {
      waiter.wait();
    }
  }

  return ret;
}
3428
7c673cae
FG
3429class TestOpsSocketHook : public AdminSocketHook {
3430 OSDService *service;
3431 ObjectStore *store;
3432public:
3433 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c 3434 int call(std::string_view command, const cmdmap_t& cmdmap,
39ae355f 3435 const bufferlist&,
9f95a23c
TL
3436 Formatter *f,
3437 std::ostream& errss,
3438 bufferlist& out) override {
3439 int r = 0;
3440 stringstream outss;
11fdf7f2 3441 try {
9f95a23c
TL
3442 test_ops(service, store, command, cmdmap, outss);
3443 out.append(outss);
3444 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3445 errss << e.what();
3446 r = -EINVAL;
11fdf7f2 3447 }
9f95a23c 3448 return r;
7c673cae
FG
3449 }
3450 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3451 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3452
3453};
3454
3455class OSD::C_Tick : public Context {
3456 OSD *osd;
3457 public:
3458 explicit C_Tick(OSD *o) : osd(o) {}
3459 void finish(int r) override {
3460 osd->tick();
3461 }
3462};
3463
3464class OSD::C_Tick_WithoutOSDLock : public Context {
3465 OSD *osd;
3466 public:
3467 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3468 void finish(int r) override {
3469 osd->tick_without_osd_lock();
3470 }
3471};
3472
// Reconcile the optional FUSE view of the object store (mounted under
// <osd_data>/fuse) with the current configuration.
//
// @param stop  force-disable the mount regardless of the config option
//              (used on shutdown)
// @return 0 on success (including when there is nothing to do), or a
//         negative errno from rmdir/mkdir/FuseStore::start on failure
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  // Currently mounted but should not be: tear it down.
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    dout(1) << __func__ << " disabling" << dendl;
    // Stop the FUSE service before removing its (now empty) mountpoint.
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  // Not mounted but should be: create the mountpoint and start FUSE.
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    // A leftover directory from a previous run is fine; any other mkdir
    // failure is fatal.
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store.get(), mntpath);
    r = fuse_store->start();
    if (r < 0) {
      // Roll back so a later call can retry cleanly.
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif  // HAVE_LIBFUSE
  return 0;
}
3514
9f95a23c
TL
3515size_t OSD::get_num_cache_shards()
3516{
3517 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3518}
3519
31f18b77
FG
3520int OSD::get_num_op_shards()
3521{
3522 if (cct->_conf->osd_op_num_shards)
3523 return cct->_conf->osd_op_num_shards;
3524 if (store_is_rotational)
3525 return cct->_conf->osd_op_num_shards_hdd;
3526 else
3527 return cct->_conf->osd_op_num_shards_ssd;
3528}
3529
3530int OSD::get_num_op_threads()
3531{
3532 if (cct->_conf->osd_op_num_threads_per_shard)
3533 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3534 if (store_is_rotational)
3535 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3536 else
3537 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3538}
3539
c07f9fc5
FG
3540float OSD::get_osd_recovery_sleep()
3541{
3542 if (cct->_conf->osd_recovery_sleep)
3543 return cct->_conf->osd_recovery_sleep;
d2e6a577 3544 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3545 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3546 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3547 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3548 else
3549 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3550}
3551
11fdf7f2
TL
3552float OSD::get_osd_delete_sleep()
3553{
3554 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3555 if (osd_delete_sleep > 0)
3556 return osd_delete_sleep;
3557 if (!store_is_rotational && !journal_is_rotational)
3558 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3559 if (store_is_rotational && !journal_is_rotational)
3560 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3561 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3562}
3563
9f95a23c
TL
3564int OSD::get_recovery_max_active()
3565{
3566 if (cct->_conf->osd_recovery_max_active)
3567 return cct->_conf->osd_recovery_max_active;
3568 if (store_is_rotational)
3569 return cct->_conf->osd_recovery_max_active_hdd;
3570 else
3571 return cct->_conf->osd_recovery_max_active_ssd;
3572}
3573
494da23a
TL
3574float OSD::get_osd_snap_trim_sleep()
3575{
3576 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3577 if (osd_snap_trim_sleep > 0)
3578 return osd_snap_trim_sleep;
3579 if (!store_is_rotational && !journal_is_rotational)
3580 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3581 if (store_is_rotational && !journal_is_rotational)
3582 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3583 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3584}
3585
7c673cae
FG
3586int OSD::init()
3587{
9f95a23c 3588 OSDMapRef osdmap;
7c673cae 3589 CompatSet initial, diff;
11fdf7f2 3590 std::lock_guard lock(osd_lock);
7c673cae
FG
3591 if (is_stopping())
3592 return 0;
20effc67 3593 tracing::osd::tracer.init("osd");
7c673cae
FG
3594 tick_timer.init();
3595 tick_timer_without_osd_lock.init();
3596 service.recovery_request_timer.init();
11fdf7f2
TL
3597 service.sleep_timer.init();
3598
3599 boot_finisher.start();
3600
3601 {
3602 string val;
3603 store->read_meta("require_osd_release", &val);
9f95a23c 3604 last_require_osd_release = ceph_release_from_name(val);
11fdf7f2 3605 }
7c673cae
FG
3606
3607 // mount.
31f18b77
FG
3608 dout(2) << "init " << dev_path
3609 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3610 << dendl;
d2e6a577 3611 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 3612 ceph_assert(store); // call pre_init() first!
7c673cae 3613
9f95a23c 3614 store->set_cache_shards(get_num_cache_shards());
7c673cae 3615
20effc67
TL
3616 int rotating_auth_attempts = 0;
3617 auto rotating_auth_timeout =
3618 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3619
7c673cae
FG
3620 int r = store->mount();
3621 if (r < 0) {
3622 derr << "OSD:init: unable to mount object store" << dendl;
3623 return r;
3624 }
d2e6a577
FG
3625 journal_is_rotational = store->is_journal_rotational();
3626 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3627 << dendl;
7c673cae
FG
3628
3629 enable_disable_fuse(false);
3630
3631 dout(2) << "boot" << dendl;
3632
11fdf7f2 3633 service.meta_ch = store->open_collection(coll_t::meta());
20effc67
TL
3634 if (!service.meta_ch) {
3635 derr << "OSD:init: unable to open meta collection"
3636 << dendl;
3637 r = -ENOENT;
3638 goto out;
3639 }
7c673cae
FG
3640 // initialize the daily loadavg with current 15min loadavg
3641 double loadavgs[3];
3642 if (getloadavg(loadavgs, 3) == 3) {
3643 daily_loadavg = loadavgs[2];
3644 } else {
3645 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3646 daily_loadavg = 1.0;
3647 }
3648
7c673cae
FG
3649 // sanity check long object name handling
3650 {
3651 hobject_t l;
3652 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3653 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3654 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3655 r = store->validate_hobject_key(l);
3656 if (r < 0) {
3657 derr << "backend (" << store->get_type() << ") is unable to support max "
3658 << "object name[space] len" << dendl;
3659 derr << " osd max object name len = "
3660 << cct->_conf->osd_max_object_name_len << dendl;
3661 derr << " osd max object namespace len = "
3662 << cct->_conf->osd_max_object_namespace_len << dendl;
3663 derr << cpp_strerror(r) << dendl;
3664 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3665 goto out;
3666 }
3667 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3668 << dendl;
3669 } else {
3670 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3671 }
3672 }
3673
3674 // read superblock
3675 r = read_superblock();
3676 if (r < 0) {
3677 derr << "OSD::init() : unable to read osd superblock" << dendl;
3678 r = -EINVAL;
3679 goto out;
3680 }
3681
3682 if (osd_compat.compare(superblock.compat_features) < 0) {
3683 derr << "The disk uses features unsupported by the executable." << dendl;
3684 derr << " ondisk features " << superblock.compat_features << dendl;
3685 derr << " daemon features " << osd_compat << dendl;
3686
3687 if (osd_compat.writeable(superblock.compat_features)) {
3688 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3689 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3690 r = -EOPNOTSUPP;
3691 goto out;
3692 }
3693 else {
3694 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3695 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3696 r = -EOPNOTSUPP;
3697 goto out;
3698 }
3699 }
3700
3701 assert_warn(whoami == superblock.whoami);
3702 if (whoami != superblock.whoami) {
3703 derr << "OSD::init: superblock says osd"
3704 << superblock.whoami << " but I am osd." << whoami << dendl;
3705 r = -EINVAL;
3706 goto out;
3707 }
3708
9f95a23c
TL
3709 startup_time = ceph::mono_clock::now();
3710
11fdf7f2 3711 // load up "current" osdmap
9f95a23c
TL
3712 assert_warn(!get_osdmap());
3713 if (get_osdmap()) {
11fdf7f2
TL
3714 derr << "OSD::init: unable to read current osdmap" << dendl;
3715 r = -EINVAL;
3716 goto out;
3717 }
3718 osdmap = get_map(superblock.current_epoch);
9f95a23c 3719 set_osdmap(osdmap);
11fdf7f2
TL
3720
3721 // make sure we don't have legacy pgs deleting
3722 {
3723 vector<coll_t> ls;
3724 int r = store->list_collections(ls);
3725 ceph_assert(r >= 0);
3726 for (auto c : ls) {
3727 spg_t pgid;
3728 if (c.is_pg(&pgid) &&
3729 !osdmap->have_pg_pool(pgid.pool())) {
3730 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3731 if (!store->exists(service.meta_ch, oid)) {
3732 derr << __func__ << " missing pg_pool_t for deleted pool "
3733 << pgid.pool() << " for pg " << pgid
3734 << "; please downgrade to luminous and allow "
3735 << "pg deletion to complete before upgrading" << dendl;
3736 ceph_abort();
3737 }
3738 }
3739 }
3740 }
3741
7c673cae
FG
3742 initial = get_osd_initial_compat_set();
3743 diff = superblock.compat_features.unsupported(initial);
3744 if (superblock.compat_features.merge(initial)) {
9f95a23c
TL
3745 // Are we adding SNAPMAPPER2?
3746 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3747 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3748 << dendl;
3749 auto ch = service.meta_ch;
3750 auto hoid = make_snapmapper_oid();
3751 unsigned max = cct->_conf->osd_target_transaction_size;
20effc67 3752 r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
9f95a23c
TL
3753 if (r < 0)
3754 goto out;
3755 }
7c673cae
FG
3756 // We need to persist the new compat_set before we
3757 // do anything else
3758 dout(5) << "Upgrading superblock adding: " << diff << dendl;
1e59de90
TL
3759
3760 if (!superblock.cluster_osdmap_trim_lower_bound) {
3761 superblock.cluster_osdmap_trim_lower_bound = superblock.oldest_map;
3762 }
3763
7c673cae
FG
3764 ObjectStore::Transaction t;
3765 write_superblock(t);
11fdf7f2 3766 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3767 if (r < 0)
3768 goto out;
3769 }
3770
3771 // make sure snap mapper object exists
11fdf7f2 3772 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3773 dout(10) << "init creating/touching snapmapper object" << dendl;
3774 ObjectStore::Transaction t;
3775 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3776 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3777 if (r < 0)
3778 goto out;
3779 }
9f95a23c
TL
3780 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3781 dout(10) << "init creating/touching purged_snaps object" << dendl;
3782 ObjectStore::Transaction t;
3783 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3784 r = store->queue_transaction(service.meta_ch, std::move(t));
3785 if (r < 0)
3786 goto out;
3787 }
7c673cae
FG
3788
3789 if (cct->_conf->osd_open_classes_on_start) {
9f95a23c 3790 int r = ClassHandler::get_instance().open_all_classes();
7c673cae
FG
3791 if (r)
3792 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3793 }
3794
11fdf7f2 3795 check_osdmap_features();
7c673cae 3796
7c673cae
FG
3797 {
3798 epoch_t bind_epoch = osdmap->get_epoch();
3799 service.set_epochs(NULL, NULL, &bind_epoch);
3800 }
3801
3802 clear_temp_objects();
3803
d2e6a577 3804 // initialize osdmap references in sharded wq
11fdf7f2
TL
3805 for (auto& shard : shards) {
3806 std::lock_guard l(shard->osdmap_lock);
3807 shard->shard_osdmap = osdmap;
3808 }
d2e6a577 3809
7c673cae
FG
3810 // load up pgs (as they previously existed)
3811 load_pgs();
3812
3813 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
7c673cae 3814
f67539c2
TL
3815 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3816 dout(2) << "compacting object store's omap" << dendl;
3817 store->compact();
3818 }
7c673cae 3819
11fdf7f2
TL
3820 // prime osd stats
3821 {
3822 struct store_statfs_t stbuf;
3823 osd_alert_list_t alerts;
3824 int r = store->statfs(&stbuf, &alerts);
3825 ceph_assert(r == 0);
3826 service.set_statfs(stbuf, alerts);
3827 }
3828
f67539c2 3829 // client_messenger's auth_client will be set up by monc->init() later.
11fdf7f2
TL
3830 for (auto m : { cluster_messenger,
3831 objecter_messenger,
3832 hb_front_client_messenger,
3833 hb_back_client_messenger,
3834 hb_front_server_messenger,
3835 hb_back_server_messenger } ) {
3836 m->set_auth_client(monc);
3837 }
3838 for (auto m : { client_messenger,
3839 cluster_messenger,
3840 hb_front_server_messenger,
3841 hb_back_server_messenger }) {
3842 m->set_auth_server(monc);
3843 }
3844 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3845
3846 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3847 | CEPH_ENTITY_TYPE_MGR);
3848 r = monc->init();
3849 if (r < 0)
3850 goto out;
3851
f67539c2 3852 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
11fdf7f2 3853 mgrc.set_perf_metric_query_cb(
9f95a23c
TL
3854 [this](const ConfigPayload &config_payload) {
3855 set_perf_queries(config_payload);
11fdf7f2 3856 },
9f95a23c
TL
3857 [this] {
3858 return get_perf_reports();
11fdf7f2 3859 });
7c673cae 3860 mgrc.init();
7c673cae
FG
3861
3862 // tell monc about log_client so it will know about mon session resets
3863 monc->set_log_client(&log_client);
3864 update_log_config();
3865
11fdf7f2
TL
3866 // i'm ready!
3867 client_messenger->add_dispatcher_tail(&mgrc);
3868 client_messenger->add_dispatcher_tail(this);
3869 cluster_messenger->add_dispatcher_head(this);
3870
3871 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3872 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3873 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3874 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3875
9f95a23c 3876 objecter_messenger->add_dispatcher_head(service.objecter.get());
11fdf7f2 3877
28e407b8
AA
3878 service.init();
3879 service.publish_map(osdmap);
3880 service.publish_superblock(superblock);
28e407b8 3881
11fdf7f2
TL
3882 for (auto& shard : shards) {
3883 // put PGs in a temporary set because we may modify pg_slots
3884 // unordered_map below.
3885 set<PGRef> pgs;
3886 for (auto& i : shard->pg_slots) {
3887 PGRef pg = i.second->pg;
3888 if (!pg) {
3889 continue;
3890 }
3891 pgs.insert(pg);
3892 }
3893 for (auto pg : pgs) {
9f95a23c 3894 std::scoped_lock l{*pg};
11fdf7f2
TL
3895 set<pair<spg_t,epoch_t>> new_children;
3896 set<pair<spg_t,epoch_t>> merge_pgs;
3897 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3898 &new_children, &merge_pgs);
3899 if (!new_children.empty()) {
3900 for (auto shard : shards) {
3901 shard->prime_splits(osdmap, &new_children);
3902 }
3903 assert(new_children.empty());
3904 }
3905 if (!merge_pgs.empty()) {
3906 for (auto shard : shards) {
3907 shard->prime_merges(osdmap, &merge_pgs);
3908 }
3909 assert(merge_pgs.empty());
3910 }
11fdf7f2
TL
3911 }
3912 }
3913
7c673cae 3914 osd_op_tp.start();
7c673cae 3915
7c673cae
FG
3916 // start the heartbeat
3917 heartbeat_thread.create("osd_srv_heartbt");
3918
3919 // tick
91327a77
AA
3920 tick_timer.add_event_after(get_tick_interval(),
3921 new C_Tick(this));
7c673cae 3922 {
11fdf7f2 3923 std::lock_guard l(tick_timer_lock);
91327a77
AA
3924 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3925 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3926 }
3927
9f95a23c 3928 osd_lock.unlock();
7c673cae
FG
3929
3930 r = monc->authenticate();
3931 if (r < 0) {
c07f9fc5
FG
3932 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3933 << dendl;
11fdf7f2 3934 exit(1);
7c673cae
FG
3935 }
3936
11fdf7f2 3937 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3938 derr << "unable to obtain rotating service keys; retrying" << dendl;
3939 ++rotating_auth_attempts;
11fdf7f2 3940 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
39ae355f
TL
3941 derr << __func__ << " wait_auth_rotating timed out"
3942 <<" -- maybe I have a clock skew against the monitors?" << dendl;
11fdf7f2 3943 exit(1);
7c673cae
FG
3944 }
3945 }
3946
3947 r = update_crush_device_class();
3948 if (r < 0) {
d2e6a577
FG
3949 derr << __func__ << " unable to update_crush_device_class: "
3950 << cpp_strerror(r) << dendl;
11fdf7f2 3951 exit(1);
7c673cae
FG
3952 }
3953
3954 r = update_crush_location();
3955 if (r < 0) {
d2e6a577 3956 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3957 << cpp_strerror(r) << dendl;
11fdf7f2 3958 exit(1);
7c673cae
FG
3959 }
3960
9f95a23c 3961 osd_lock.lock();
7c673cae
FG
3962 if (is_stopping())
3963 return 0;
3964
3965 // start objecter *after* we have authenticated, so that we don't ignore
3966 // the OSDMaps it requests.
3967 service.final_init();
3968
3969 check_config();
3970
3971 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3972 consume_map();
7c673cae
FG
3973
3974 dout(0) << "done with init, starting boot process" << dendl;
3975
3976 // subscribe to any pg creations
3977 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3978
3979 // MgrClient needs this (it doesn't have MonClient reference itself)
3980 monc->sub_want("mgrmap", 0, 0);
3981
3982 // we don't need to ask for an osdmap here; objecter will
3983 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3984
3985 monc->renew_subs();
3986
3987 start_boot();
3988
a4b75251 3989 // Override a few options if mclock scheduler is enabled.
39ae355f 3990 maybe_override_sleep_options_for_qos();
1e59de90 3991 maybe_override_cost_for_qos();
a4b75251 3992 maybe_override_options_for_qos();
39ae355f 3993 maybe_override_max_osd_capacity_for_qos();
a4b75251 3994
7c673cae 3995 return 0;
7c673cae
FG
3996
3997out:
3998 enable_disable_fuse(true);
3999 store->umount();
20effc67 4000 store.reset();
7c673cae
FG
4001 return r;
4002}
4003
4004void OSD::final_init()
4005{
4006 AdminSocket *admin_socket = cct->get_admin_socket();
4007 asok_hook = new OSDSocketHook(this);
9f95a23c 4008 int r = admin_socket->register_command("status", asok_hook,
7c673cae 4009 "high-level status of OSD");
11fdf7f2 4010 ceph_assert(r == 0);
9f95a23c 4011 r = admin_socket->register_command("flush_journal",
7c673cae
FG
4012 asok_hook,
4013 "flush the journal to permanent store");
11fdf7f2 4014 ceph_assert(r == 0);
9f95a23c 4015 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
4016 "name=filterstr,type=CephString,n=N,req=false",
4017 asok_hook,
7c673cae 4018 "show the ops currently in flight");
11fdf7f2 4019 ceph_assert(r == 0);
9f95a23c 4020 r = admin_socket->register_command("ops " \
c07f9fc5
FG
4021 "name=filterstr,type=CephString,n=N,req=false",
4022 asok_hook,
7c673cae 4023 "show the ops currently in flight");
11fdf7f2 4024 ceph_assert(r == 0);
9f95a23c 4025 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
4026 "name=filterstr,type=CephString,n=N,req=false",
4027 asok_hook,
7c673cae 4028 "show the blocked ops currently in flight");
11fdf7f2 4029 ceph_assert(r == 0);
1e59de90
TL
4030 r = admin_socket->register_command("dump_blocked_ops_count " \
4031 "name=filterstr,type=CephString,n=N,req=false",
4032 asok_hook,
4033 "show the count of blocked ops currently in flight");
4034 ceph_assert(r == 0);
9f95a23c 4035 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 4036 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
4037 asok_hook,
4038 "show recent ops");
11fdf7f2 4039 ceph_assert(r == 0);
9f95a23c 4040 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 4041 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
4042 asok_hook,
4043 "show slowest recent ops");
11fdf7f2 4044 ceph_assert(r == 0);
9f95a23c 4045 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 4046 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
4047 asok_hook,
4048 "show slowest recent ops, sorted by duration");
11fdf7f2 4049 ceph_assert(r == 0);
9f95a23c 4050 r = admin_socket->register_command("dump_op_pq_state",
7c673cae 4051 asok_hook,
20effc67 4052 "dump op queue state");
11fdf7f2 4053 ceph_assert(r == 0);
f67539c2 4054 r = admin_socket->register_command("dump_blocklist",
7c673cae 4055 asok_hook,
f67539c2 4056 "dump blocklisted clients and times");
11fdf7f2 4057 ceph_assert(r == 0);
9f95a23c 4058 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
4059 asok_hook,
4060 "show clients which have active watches,"
4061 " and on which objects");
11fdf7f2 4062 ceph_assert(r == 0);
9f95a23c 4063 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
4064 asok_hook,
4065 "show recovery reservations");
11fdf7f2 4066 ceph_assert(r == 0);
9f95a23c 4067 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 4068 asok_hook,
f6b5b4d7 4069 "show scrub reservations");
eafe8130 4070 ceph_assert(r == 0);
9f95a23c 4071 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
4072 asok_hook,
4073 "force osd to update the latest map from "
4074 "the mon");
11fdf7f2 4075 ceph_assert(r == 0);
7c673cae 4076
9f95a23c 4077 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
4078 "name=property,type=CephString " \
4079 "name=value,type=CephInt",
4080 asok_hook,
4081 "update malloc extension heap property");
11fdf7f2 4082 ceph_assert(r == 0);
7c673cae 4083
9f95a23c 4084 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
4085 "name=property,type=CephString",
4086 asok_hook,
4087 "get malloc extension heap property");
11fdf7f2 4088 ceph_assert(r == 0);
7c673cae
FG
4089
4090 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
4091 asok_hook,
4092 "print statistics of kvdb which used by bluestore");
11fdf7f2 4093 ceph_assert(r == 0);
7c673cae
FG
4094
4095 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
4096 asok_hook,
4097 "print scheduled scrubs");
11fdf7f2 4098 ceph_assert(r == 0);
7c673cae
FG
4099
4100 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
4101 asok_hook,
4102 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 4103 ceph_assert(r == 0);
7c673cae
FG
4104
4105 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
4106 asok_hook,
4107 "Flush bluestore internal cache");
11fdf7f2 4108 ceph_assert(r == 0);
39ae355f
TL
4109 r = admin_socket->register_command("rotate-stored-key",
4110 asok_hook,
4111 "Update the stored osd_key");
4112 ceph_assert(r == 0);
9f95a23c 4113 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
4114 asok_hook,
4115 "show recent state history");
11fdf7f2 4116 ceph_assert(r == 0);
7c673cae 4117
9f95a23c 4118 r = admin_socket->register_command("compact",
224ce89b
WB
4119 asok_hook,
4120 "Commpact object store's omap."
4121 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
4122 ceph_assert(r == 0);
4123
9f95a23c 4124 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
4125 asok_hook,
4126 "dump pools whose PG(s) are mapped to this OSD.");
4127
4128 ceph_assert(r == 0);
4129
9f95a23c 4130 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
4131 asok_hook,
4132 "probe OSD devices for SMART data.");
4133
4134 ceph_assert(r == 0);
4135
9f95a23c 4136 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
4137 asok_hook,
4138 "list OSD devices.");
9f95a23c 4139 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
4140 asok_hook,
4141 "send OSD beacon to mon immediately");
224ce89b 4142
9f95a23c
TL
4143 r = admin_socket->register_command(
4144 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
4145 "Dump osd heartbeat network ping times");
eafe8130
TL
4146 ceph_assert(r == 0);
4147
20effc67
TL
4148 r = admin_socket->register_command(
4149 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
4150 "Dump store's statistics for the given pool");
4151 ceph_assert(r == 0);
4152
4153 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
7c673cae
FG
4154 // Note: pools are CephString instead of CephPoolname because
4155 // these commands traditionally support both pool names and numbers
4156 r = admin_socket->register_command(
7c673cae
FG
4157 "setomapval " \
4158 "name=pool,type=CephString " \
4159 "name=objname,type=CephObjectname " \
4160 "name=key,type=CephString "\
4161 "name=val,type=CephString",
4162 test_ops_hook,
4163 "set omap key");
11fdf7f2 4164 ceph_assert(r == 0);
7c673cae 4165 r = admin_socket->register_command(
7c673cae
FG
4166 "rmomapkey " \
4167 "name=pool,type=CephString " \
4168 "name=objname,type=CephObjectname " \
4169 "name=key,type=CephString",
4170 test_ops_hook,
4171 "remove omap key");
11fdf7f2 4172 ceph_assert(r == 0);
7c673cae 4173 r = admin_socket->register_command(
7c673cae
FG
4174 "setomapheader " \
4175 "name=pool,type=CephString " \
4176 "name=objname,type=CephObjectname " \
4177 "name=header,type=CephString",
4178 test_ops_hook,
4179 "set omap header");
11fdf7f2 4180 ceph_assert(r == 0);
7c673cae
FG
4181
4182 r = admin_socket->register_command(
7c673cae
FG
4183 "getomap " \
4184 "name=pool,type=CephString " \
4185 "name=objname,type=CephObjectname",
4186 test_ops_hook,
4187 "output entire object map");
11fdf7f2 4188 ceph_assert(r == 0);
7c673cae
FG
4189
4190 r = admin_socket->register_command(
7c673cae
FG
4191 "truncobj " \
4192 "name=pool,type=CephString " \
4193 "name=objname,type=CephObjectname " \
4194 "name=len,type=CephInt",
4195 test_ops_hook,
4196 "truncate object to length");
11fdf7f2 4197 ceph_assert(r == 0);
7c673cae
FG
4198
4199 r = admin_socket->register_command(
7c673cae
FG
4200 "injectdataerr " \
4201 "name=pool,type=CephString " \
4202 "name=objname,type=CephObjectname " \
4203 "name=shardid,type=CephInt,req=false,range=0|255",
4204 test_ops_hook,
4205 "inject data error to an object");
11fdf7f2 4206 ceph_assert(r == 0);
7c673cae
FG
4207
4208 r = admin_socket->register_command(
7c673cae
FG
4209 "injectmdataerr " \
4210 "name=pool,type=CephString " \
4211 "name=objname,type=CephObjectname " \
4212 "name=shardid,type=CephInt,req=false,range=0|255",
4213 test_ops_hook,
4214 "inject metadata error to an object");
11fdf7f2 4215 ceph_assert(r == 0);
7c673cae 4216 r = admin_socket->register_command(
7c673cae
FG
4217 "set_recovery_delay " \
4218 "name=utime,type=CephInt,req=false",
4219 test_ops_hook,
4220 "Delay osd recovery by specified seconds");
11fdf7f2 4221 ceph_assert(r == 0);
7c673cae 4222 r = admin_socket->register_command(
7c673cae
FG
4223 "injectfull " \
4224 "name=type,type=CephString,req=false " \
4225 "name=count,type=CephInt,req=false ",
4226 test_ops_hook,
4227 "Inject a full disk (optional count times)");
11fdf7f2 4228 ceph_assert(r == 0);
9f95a23c
TL
4229 r = admin_socket->register_command(
4230 "bench " \
4231 "name=count,type=CephInt,req=false " \
4232 "name=size,type=CephInt,req=false " \
4233 "name=object_size,type=CephInt,req=false " \
4234 "name=object_num,type=CephInt,req=false ",
4235 asok_hook,
4236 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4237 "(default count=1G default size=4MB). Results in log.");
4238 ceph_assert(r == 0);
4239 r = admin_socket->register_command(
4240 "cluster_log " \
4241 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4242 "name=message,type=CephString,n=N",
4243 asok_hook,
4244 "log a message to the cluster log");
4245 ceph_assert(r == 0);
4246 r = admin_socket->register_command(
4247 "flush_pg_stats",
4248 asok_hook,
4249 "flush pg stats");
4250 ceph_assert(r == 0);
4251 r = admin_socket->register_command(
4252 "heap " \
4253 "name=heapcmd,type=CephChoices,strings=" \
4254 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4255 "name=value,type=CephString,req=false",
4256 asok_hook,
4257 "show heap usage info (available only if compiled with tcmalloc)");
4258 ceph_assert(r == 0);
4259 r = admin_socket->register_command(
4260 "debug dump_missing " \
4261 "name=filename,type=CephFilepath",
4262 asok_hook,
4263 "dump missing objects to a named file");
4264 ceph_assert(r == 0);
4265 r = admin_socket->register_command(
4266 "debug kick_recovery_wq " \
4267 "name=delay,type=CephInt,range=0",
4268 asok_hook,
4269 "set osd_recovery_delay_start to <val>");
4270 ceph_assert(r == 0);
4271 r = admin_socket->register_command(
4272 "cpu_profiler " \
4273 "name=arg,type=CephChoices,strings=status|flush",
4274 asok_hook,
4275 "run cpu profiling on daemon");
4276 ceph_assert(r == 0);
4277 r = admin_socket->register_command(
4278 "dump_pg_recovery_stats",
4279 asok_hook,
4280 "dump pg recovery statistics");
4281 ceph_assert(r == 0);
4282 r = admin_socket->register_command(
4283 "reset_pg_recovery_stats",
4284 asok_hook,
4285 "reset pg recovery statistics");
4286 ceph_assert(r == 0);
4287 r = admin_socket->register_command(
4288 "cache drop",
4289 asok_hook,
4290 "Drop all OSD caches");
4291 ceph_assert(r == 0);
4292 r = admin_socket->register_command(
4293 "cache status",
4294 asok_hook,
4295 "Get OSD caches statistics");
4296 ceph_assert(r == 0);
4297 r = admin_socket->register_command(
4298 "scrub_purged_snaps",
4299 asok_hook,
4300 "Scrub purged_snaps vs snapmapper index");
4301 ceph_assert(r == 0);
20effc67
TL
4302 r = admin_socket->register_command(
4303 "scrubdebug " \
4304 "name=pgid,type=CephPgid " \
4305 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4306 "name=value,type=CephString,req=false",
4307 asok_hook,
4308 "debug the scrubber");
4309 ceph_assert(r == 0);
7c673cae 4310
9f95a23c
TL
4311 // -- pg commands --
4312 // old form: ceph pg <pgid> command ...
4313 r = admin_socket->register_command(
4314 "pg " \
4315 "name=pgid,type=CephPgid " \
4316 "name=cmd,type=CephChoices,strings=query",
4317 asok_hook,
4318 "");
4319 ceph_assert(r == 0);
1e59de90
TL
4320 r = admin_socket->register_command(
4321 "pg " \
4322 "name=pgid,type=CephPgid " \
4323 "name=cmd,type=CephChoices,strings=log",
4324 asok_hook,
4325 "");
4326 ceph_assert(r == 0);
9f95a23c
TL
4327 r = admin_socket->register_command(
4328 "pg " \
4329 "name=pgid,type=CephPgid " \
4330 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4331 "name=mulcmd,type=CephChoices,strings=revert|delete",
4332 asok_hook,
4333 "");
4334 ceph_assert(r == 0);
4335 r = admin_socket->register_command(
4336 "pg " \
4337 "name=pgid,type=CephPgid " \
4338 "name=cmd,type=CephChoices,strings=list_unfound " \
4339 "name=offset,type=CephString,req=false",
4340 asok_hook,
4341 "");
4342 ceph_assert(r == 0);
4343 r = admin_socket->register_command(
4344 "pg " \
4345 "name=pgid,type=CephPgid " \
4346 "name=cmd,type=CephChoices,strings=scrub " \
4347 "name=time,type=CephInt,req=false",
4348 asok_hook,
4349 "");
4350 ceph_assert(r == 0);
4351 r = admin_socket->register_command(
4352 "pg " \
4353 "name=pgid,type=CephPgid " \
4354 "name=cmd,type=CephChoices,strings=deep_scrub " \
4355 "name=time,type=CephInt,req=false",
4356 asok_hook,
4357 "");
4358 ceph_assert(r == 0);
4359 // new form: tell <pgid> <cmd> for both cli and rest
4360 r = admin_socket->register_command(
4361 "query",
4362 asok_hook,
4363 "show details of a specific pg");
4364 ceph_assert(r == 0);
1e59de90
TL
4365 r = admin_socket->register_command(
4366 "log",
4367 asok_hook,
4368 "dump pg_log of a specific pg");
4369 ceph_assert(r == 0);
9f95a23c
TL
4370 r = admin_socket->register_command(
4371 "mark_unfound_lost " \
4372 "name=pgid,type=CephPgid,req=false " \
4373 "name=mulcmd,type=CephChoices,strings=revert|delete",
4374 asok_hook,
4375 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4376 ceph_assert(r == 0);
4377 r = admin_socket->register_command(
4378 "list_unfound " \
4379 "name=pgid,type=CephPgid,req=false " \
4380 "name=offset,type=CephString,req=false",
4381 asok_hook,
4382 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4383 ceph_assert(r == 0);
4384 r = admin_socket->register_command(
4385 "scrub " \
4386 "name=pgid,type=CephPgid,req=false " \
4387 "name=time,type=CephInt,req=false",
4388 asok_hook,
4389 "Trigger a scheduled scrub ");
4390 ceph_assert(r == 0);
4391 r = admin_socket->register_command(
4392 "deep_scrub " \
4393 "name=pgid,type=CephPgid,req=false " \
4394 "name=time,type=CephInt,req=false",
4395 asok_hook,
4396 "Trigger a scheduled deep scrub ");
4397 ceph_assert(r == 0);
4398}
7c673cae 4399
f67539c2 4400PerfCounters* OSD::create_logger()
9f95a23c 4401{
f67539c2 4402 PerfCounters* logger = build_osd_logger(cct);
7c673cae 4403 cct->get_perfcounters_collection()->add(logger);
f67539c2 4404 return logger;
7c673cae
FG
4405}
4406
f67539c2 4407PerfCounters* OSD::create_recoverystate_perf()
7c673cae 4408{
f67539c2 4409 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
7c673cae 4410 cct->get_perfcounters_collection()->add(recoverystate_perf);
f67539c2 4411 return recoverystate_perf;
7c673cae
FG
4412}
4413
// Shut the OSD down.
//
// Two paths exist:
//  - fast shutdown (osd_fast_shutdown=true): stop the op queue and timers,
//    drain the op threadpool, unmount the store, then _exit(0) without the
//    full orderly teardown;
//  - slow/orderly shutdown: drain work queues, shut down PGs, heartbeats,
//    timers, persist the superblock (clean unmount epoch), release all PG
//    references, unmount the store, and shut down every messenger.
//
// The statement ordering below is deliberate (lock hand-offs, double
// draining of op_shardedwq, mgrc before store unmount); do not reorder.
// Returns 0, or the error from the final superblock write on the slow path.
int OSD::shutdown()
{
  // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
  //cct->_conf->osd_fast_shutdown = true;

  dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
	  << cct->_conf->osd_fast_shutdown
	  << ", null-fm = " << store->has_null_manager() << dendl;

  // Wall-clock start, used for duration reporting and the fast-shutdown
  // timeout assertion below.
  utime_t start_time_func = ceph_clock_now();

  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();

    // There is no state we need to keep when running in NULL-FM mode,
    // so exit immediately without unmounting the store.
    if (!store->has_null_manager()) {
      cct->_log->flush();
      _exit(0);
    }
  } else if (!service.prepare_to_stop()) {
    return 0; // already shutting down
  }

  osd_lock.lock();
  if (is_stopping()) {
    // Another thread won the race to shut down.
    osd_lock.unlock();
    return 0;
  }

  if (!cct->_conf->osd_fast_shutdown) {
    dout(0) << "shutdown" << dendl;
  }

  // don't accept new task for this OSD
  set_state(STATE_STOPPING);

  // Disabled debugging during fast-shutdown
  if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  //
  // should occur before unmounting the database in fast-shutdown to avoid
  // a race condition (see https://tracker.ceph.com/issues/56101)
  mgrc.shutdown();

  if (cct->_conf->osd_fast_shutdown) {
    // first, stop new task from being taken from op_shardedwq
    // and clear all pending tasks
    op_shardedwq.stop_for_fast_shutdown();

    utime_t start_time_timer = ceph_clock_now();
    tick_timer.shutdown();
    {
      std::lock_guard l(tick_timer_lock);
      tick_timer_without_osd_lock.shutdown();
    }

    osd_lock.unlock();
    utime_t start_time_osd_drain = ceph_clock_now();

    // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
    osd_op_tp.drain();
    osd_op_tp.stop();

    utime_t start_time_umount = ceph_clock_now();
    store->prepare_for_fast_shutdown();
    std::lock_guard lock(osd_lock);
    // TBD: assert in allocator that nothing is being add
    store->umount();

    utime_t end_time = ceph_clock_now();
    // Guard the whole fast-shutdown against taking too long (0 disables).
    if (cct->_conf->osd_fast_shutdown_timeout) {
      ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
    }
    dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
    dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
    cct->_log->flush();

    // now it is safe to exit
    _exit(0);
  }

  // ---- slow (orderly) shutdown path from here on ----
  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // Stop the heartbeat thread: signal it under heartbeat_lock, then join.
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  // Tear down all heartbeat connections (both directions, front and back).
  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  // Best effort: log, but do not abort, if the superblock write fails.
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // Loop until no PGs remain registered; each pass drops our collection
  // handle (pg->ch) so the last reference can go away.
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// Someone else still holds a reference; optionally abort so the
	// leak can be debugged (osd_shutdown_pgref_assert).
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may call back into us; drop osd_lock around it.
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  // Drop each shard's cached osdmap reference.
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  store.reset();
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  utime_t duration = ceph_clock_now() - start_time_func;
  dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;


  return r;
}
4667
4668int OSD::mon_cmd_maybe_osd_create(string &cmd)
4669{
4670 bool created = false;
4671 while (true) {
4672 dout(10) << __func__ << " cmd: " << cmd << dendl;
4673 vector<string> vcmd{cmd};
4674 bufferlist inbl;
4675 C_SaferCond w;
4676 string outs;
4677 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4678 int r = w.wait();
4679 if (r < 0) {
4680 if (r == -ENOENT && !created) {
4681 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4682 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4683 vector<string> vnewcmd{newcmd};
4684 bufferlist inbl;
4685 C_SaferCond w;
4686 string outs;
4687 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4688 int r = w.wait();
4689 if (r < 0) {
4690 derr << __func__ << " fail: osd does not exist and created failed: "
4691 << cpp_strerror(r) << dendl;
4692 return r;
4693 }
4694 created = true;
4695 continue;
4696 }
4697 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4698 return r;
4699 }
4700 break;
4701 }
4702
4703 return 0;
4704}
4705
// Report this OSD's CRUSH weight and location to the monitors via
// "osd crush create-or-move", unless disabled by configuration.
// Returns 0 on success (or when disabled), negative errno on failure.
int OSD::update_crush_location()
{
  if (!cct->_conf->osd_crush_update_on_start) {
    dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
    return 0;
  }

  char weight[32];
  if (cct->_conf->osd_crush_initial_weight >= 0) {
    // the admin pinned an explicit initial weight
    snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
  } else {
    // derive the weight from the store's total capacity, in TiB
    struct store_statfs_t st;
    osd_alert_list_t alerts;
    int r = store->statfs(&st, &alerts);
    if (r < 0) {
      derr << "statfs: " << cpp_strerror(r) << dendl;
      return r;
    }
    // clamp to a small positive value so a tiny device never reports 0
    snprintf(weight, sizeof(weight), "%.4lf",
             std::max(.00001,
                      double(st.total) /
                      double(1ull << 40 /* TB */)));
  }

  dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;

  // hand the JSON command to the retry/auto-create helper
  string cmd =
    string("{\"prefix\": \"osd crush create-or-move\", ") +
    string("\"id\": ") + stringify(whoami) + ", " +
    string("\"weight\":") + weight + ", " +
    string("\"args\": [") + stringify(cct->crush_location) + "]}";
  return mon_cmd_maybe_osd_create(cmd);
}
4739
4740int OSD::update_crush_device_class()
4741{
224ce89b
WB
4742 if (!cct->_conf->osd_class_update_on_start) {
4743 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4744 return 0;
4745 }
4746
7c673cae
FG
4747 string device_class;
4748 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4749 if (r < 0 || device_class.empty()) {
4750 device_class = store->get_default_device_class();
4751 }
4752
4753 if (device_class.empty()) {
d2e6a577 4754 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4755 return 0;
224ce89b 4756 }
7c673cae
FG
4757
4758 string cmd =
4759 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4760 string("\"class\": \"") + device_class + string("\", ") +
4761 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4762
224ce89b 4763 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4764 if (r == -EBUSY) {
4765 // good, already bound to a device-class
4766 return 0;
4767 } else {
4768 return r;
4769 }
7c673cae
FG
4770}
4771
// Serialize the in-memory superblock and stage the write into the
// supplied transaction; the caller is responsible for queueing it.
void OSD::write_superblock(ObjectStore::Transaction& t)
{
  dout(10) << "write_superblock " << superblock << dendl;

  //hack: at minimum it's using the baseline feature set
  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);

  bufferlist bl;
  encode(superblock, bl);
  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
}
4784
4785int OSD::read_superblock()
4786{
4787 bufferlist bl;
11fdf7f2 4788 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4789 if (r < 0)
4790 return r;
4791
11fdf7f2
TL
4792 auto p = bl.cbegin();
4793 decode(superblock, p);
7c673cae
FG
4794
4795 dout(10) << "read_superblock " << superblock << dendl;
4796
4797 return 0;
4798}
4799
// Scan every PG collection and delete leftover temp objects (plus
// legacy pool==-1 objects left behind by Hammer).  Deletions are
// batched into transactions of at most osd_target_transaction_size ops.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      auto ch = store->open_collection(*p);
      ceph_assert(ch);
      store->collection_list(ch, next, ghobject_t::get_max(),
                             store->get_ideal_list_max(),
                             &objects, &next);
      if (objects.empty())
        break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
        // Hammer set pool for temps to -1, so check for clean-up
        if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
          temps.push_back(*q);
        } else {
          break;
        }
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
        break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
        dout(20) << " removing " << *p << " object " << *q << dendl;
        t.remove(*p, *q);
        // flush a batch once it exceeds the configured transaction size
        if (++removed > cct->_conf->osd_target_transaction_size) {
          store->queue_transaction(service.meta_ch, std::move(t));
          t = ObjectStore::Transaction();
          removed = 0;
        }
      }
      if (removed) {
        // queue the final partial batch
        store->queue_transaction(service.meta_ch, std::move(t));
      }
    }
  }
}
4856
// Destroy a PG collection and all its objects, also purging each
// object's snap mapping.  Work is batched into transactions of
// osd_target_transaction_size objects; blocks until the final removal
// commits.
void OSD::recursive_remove_collection(CephContext* cct,
                                      ObjectStore *store, spg_t pgid,
                                      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    store->collection_list(ch, next, ghobject_t::get_max(),
                           max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the object's snap-mapper entry alongside the object itself
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
        ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    // t was moved from; start a fresh transaction for the next batch
    t = ObjectStore::Transaction();
  }
  // finally remove the (now empty) collection itself
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the removal to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4901
4902
4903// ======================================================
4904// PG's
4905
7c673cae
FG
// Construct (but do not register) a PG object for pgid, using pool
// metadata from createmap; if the pool has been deleted, fall back to
// the on-disk pool tombstone.  Returns nullptr when the necessary
// metadata cannot be found.
PG* OSD::_make_pg(
  OSDMapRef createmap,
  spg_t pgid)
{
  dout(10) << __func__ << " " << pgid << dendl;
  pg_pool_t pi;
  map<string,string> ec_profile;
  string name;
  if (createmap->have_pg_pool(pgid.pool())) {
    pi = *createmap->get_pg_pool(pgid.pool());
    name = createmap->get_pool_name(pgid.pool());
    if (pi.is_erasure()) {
      ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
    }
  } else {
    // pool was deleted; grab final pg_pool_t off disk.
    ghobject_t oid = make_final_pool_info_oid(pgid.pool());
    bufferlist bl;
    int r = store->read(service.meta_ch, oid, 0, 0, bl);
    if (r < 0) {
      derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
           << dendl;
      return nullptr;
    }
    ceph_assert(r >= 0);
    // decode order is fixed: pool info, then name, then ec profile
    auto p = bl.cbegin();
    decode(pi, p);
    decode(name, p);
    if (p.end()) { // dev release v13.0.2 did not include ec_profile
      derr << __func__ << " missing ec_profile from pool " << pgid.pool()
           << " tombstone" << dendl;
      return nullptr;
    }
    decode(ec_profile, p);
  }
  PGPool pool(createmap, pgid.pool(), pi, name);
  PG *pg;
  if (pi.type == pg_pool_t::TYPE_REPLICATED ||
      pi.type == pg_pool_t::TYPE_ERASURE)
    pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
  else
    ceph_abort();
  return pg;
}
4950
11fdf7f2 4951void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4952{
11fdf7f2
TL
4953 v->clear();
4954 v->reserve(get_num_pgs());
4955 for (auto& s : shards) {
4956 std::lock_guard l(s->shard_lock);
4957 for (auto& j : s->pg_slots) {
4958 if (j.second->pg &&
4959 !j.second->pg->is_deleted()) {
4960 v->push_back(j.second->pg);
4961 if (clear_too) {
4962 s->_detach_pg(j.second.get());
4963 }
4964 }
7c673cae 4965 }
7c673cae 4966 }
7c673cae
FG
4967}
4968
11fdf7f2 4969void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4970{
11fdf7f2
TL
4971 v->clear();
4972 v->reserve(get_num_pgs());
4973 for (auto& s : shards) {
4974 std::lock_guard l(s->shard_lock);
4975 for (auto& j : s->pg_slots) {
4976 if (j.second->pg &&
4977 !j.second->pg->is_deleted()) {
4978 v->push_back(j.first);
4979 }
7c673cae
FG
4980 }
4981 }
7c673cae
FG
4982}
4983
11fdf7f2 4984void OSD::register_pg(PGRef pg)
7c673cae 4985{
11fdf7f2
TL
4986 spg_t pgid = pg->get_pgid();
4987 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4988 auto sdata = shards[shard_index];
4989 std::lock_guard l(sdata->shard_lock);
4990 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4991 ceph_assert(r.second);
4992 auto *slot = r.first->second.get();
4993 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4994 sdata->_attach_pg(slot, pg.get());
4995}
7c673cae 4996
// Final stage of PG deletion: detach the pg from its shard slot,
// unprime any split children, and adjust the pg perf counters.
// Returns false if the pg is already gone or a pending merge still
// needs it (the caller should retry later).
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
        !p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any split children that were primed from this (now gone) pg
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer: counts all non-primary pgs
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
5031
11fdf7f2 5032PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 5033{
11fdf7f2
TL
5034 uint32_t shard_index = pgid.hash_to_shard(num_shards);
5035 auto sdata = shards[shard_index];
5036 std::lock_guard l(sdata->shard_lock);
5037 auto p = sdata->pg_slots.find(pgid);
5038 if (p == sdata->pg_slots.end()) {
7c673cae 5039 return nullptr;
11fdf7f2
TL
5040 }
5041 return p->second->pg;
7c673cae
FG
5042}
5043
11fdf7f2 5044PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 5045{
11fdf7f2
TL
5046 PGRef pg = _lookup_pg(pgid);
5047 if (!pg) {
5048 return nullptr;
5049 }
5050 pg->lock();
5051 if (!pg->is_deleted()) {
5052 return pg;
5053 }
5054 pg->unlock();
5055 return nullptr;
31f18b77
FG
5056}
5057
// Public wrapper around _lookup_lock_pg(): returns the PG with its
// lock held, or nullptr if it does not exist (or is deleted).
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
5062
// Startup path: walk the object store's collections, reconstruct every
// surviving PG, and register it with its shard.  Collections that are
// temp, flagged for removal, or whose pool tombstone is missing are
// removed instead of loaded.
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  {
    // load the persisted pg_num change history (used for split/merge)
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    if (it->is_temp(&pgid) ||
        (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
      // leftover temp collection, or a pg whose deletion was interrupted
      dout(10) << "load_pgs " << *it
               << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
           << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      // instantiate the pg against the map it was last written at
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
        if (!get_osdmap()->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617.  "
               << "Skipping the pg for now, you can use ceph-objectstore-tool "
               << "to clean it up later." << dendl;
          continue;
        } else {
          derr << __func__ << ": have pgid " << pgid << " at epoch "
               << map_epoch << ", but missing map.  Crashing."
               << dendl;
          ceph_abort_msg("Missing map in load_pgs");
        }
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      // pool metadata unavailable; drop the stale collection
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store.get());

    if (pg->dne())  {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store.get(), pgid, *it);
      continue;
    }
    {
      // route this collection's commit callbacks to the owning shard
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
5168
5169
// Instantiate a brand-new PG from a create request (from the mon or a
// peer OSD): allocate its collection, initialize on-disk state, and run
// the initial peering events.  Returns nullptr when the create should
// be dropped (max-pg limit, pool gone, or stale creating message).
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
                                 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  // map at which the pg creation was decided
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
        !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
               << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
                 << " is at risk of silent data corruption: "
                 << "the pool allows ec overwrites but is not stored in "
                 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  PeeringCtx rctx;
  // stage collection creation + on-disk pg metadata in the peering txn
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  {
    // route this collection's commit callbacks to the owning shard
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    // propagate any registered dynamic perf queries to the new primary
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  // drive the initial peering state transitions
  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
5259
11fdf7f2
TL
5260bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
5261 spg_t pgid,
5262 bool is_mon_create)
3efd9988
FG
5263{
5264 const auto max_pgs_per_osd =
11fdf7f2
TL
5265 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5266 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
3efd9988 5267
11fdf7f2 5268 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
5269 return false;
5270 }
11fdf7f2
TL
5271
5272 std::lock_guard l(pending_creates_lock);
3efd9988
FG
5273 if (is_mon_create) {
5274 pending_creates_from_mon++;
5275 } else {
9f95a23c
TL
5276 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
5277 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 5278 }
1adf2230 5279 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 5280 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
5281 return true;
5282}
5283
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the set down to just the primary
    twiddled.push_back(acting[0]);
  } else {
    // pad the empty/single-osd set with a -1 placeholder
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
5296
// Replay pg creations that were withheld by maybe_wait_for_max_pg(),
// now that we may have spare capacity below the pg cap, and adjust the
// mon subscriptions (osd_pg_creates / osdmap) accordingly.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      // mon-driven creates just need the subscription refreshed
      dout(20) << __func__ << " pending_creates_from_mon "
               << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
        spare_pgs = pending_creates_from_mon = 0;
      } else {
        spare_pgs -= pending_creates_from_mon;
        pending_creates_from_mon = 0;
      }
    }
    // osd-driven creates are re-triggered by twiddling pg_temp
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
                            !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
              << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
              << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
              << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
7c673cae 5368
7c673cae
FG
// Add osd.p to the heartbeat peer set (front + back connections with
// attached Sessions), or refresh its epoch if already tracked.  No-op
// for ourselves or when no back connection can be established.
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
    if (!cons.first)
      return;
    assert(cons.second);

    hi = &heartbeat_peers[p];
    hi->peer = p;

    // shared ping-time stamps for both directions to this peer
    auto stamps = service.get_hb_stamps(p);

    // back (cluster network) heartbeat session
    auto sb = ceph::make_ref<Session>(cct, cons.first.get());
    sb->peer = p;
    sb->stamps = stamps;
    hi->hb_interval_start = ceph_clock_now();
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(sb);

    // front (public network) heartbeat session
    auto sf = ceph::make_ref<Session>(cct, cons.second.get());
    sf->peer = p;
    sf->stamps = stamps;
    hi->con_front = cons.second.get();
    hi->con_front->set_priv(sf);

    dout(10) << "_add_heartbeat_peer: new peer osd." << p
             << " " << hi->con_back->get_peer_addr()
             << " " << hi->con_front->get_peer_addr()
             << dendl;
  } else {
    hi = &i->second;
  }
  // record the map epoch at which this peer was (re)confirmed
  hi->epoch = get_osdmap_epoch();
}
5409
5410void OSD::_remove_heartbeat_peer(int n)
5411{
5412 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5413 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5414 dout(20) << " removing heartbeat peer osd." << n
5415 << " " << q->second.con_back->get_peer_addr()
5416 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5417 << dendl;
9f95a23c 5418 q->second.clear_mark_down();
7c673cae
FG
5419 heartbeat_peers.erase(q);
5420}
5421
5422void OSD::need_heartbeat_peer_update()
5423{
5424 if (is_stopping())
5425 return;
5426 dout(20) << "need_heartbeat_peer_update" << dendl;
5427 heartbeat_set_peers_need_update();
5428}
5429
// Rebuild the heartbeat peer set when flagged (or periodically forced):
// gather peers from our PGs, add CRUSH neighbors/subtree peers, drop
// down peers, top up/trim to osd_heartbeat_min_peers, and resolve
// failure reports for peers we no longer track.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: force an initial resample
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        // automatically clean up any stale heartbeat peers
        // if we are unhealthy, then clean all
        reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
          if (get_osdmap()->is_up(peer)) {
            _add_heartbeat_peer(peer);
          }
        });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      // advance the iterator before erasing to keep it valid
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      // retract the failure report for peers we no longer monitor
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5547
// Drop heartbeat peers: all of them when `all` is set, otherwise only
// those whose last contact is older than osd_heartbeat_stale.  Also
// clears any queued/pending failure reports for the dropped peers.
void OSD::reset_heartbeat_peers(bool all)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "reset_heartbeat_peers" << dendl;
  // cutoff: anything last heard from before `stale` is considered stale
  utime_t stale = ceph_clock_now();
  stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  std::lock_guard l(heartbeat_lock);
  for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
    auto& [peer, hi] = *it;
    if (all || hi.is_stale(stale)) {
      hi.clear_mark_down();
      // stop sending failure_report to mon too
      failure_queue.erase(peer);
      failure_pending.erase(peer);
      it = heartbeat_peers.erase(it);
    } else {
      ++it;
    }
  }
}
5568
// Handle an incoming heartbeat message (PING, PING_REPLY or YOU_DIED)
// from a peer OSD.  Validates the cluster fsid, updates the shared
// clock-delta stamps for the sender, and for replies maintains the
// per-peer ping history and ping-time statistics.  Consumes (puts) the
// message on every path; heartbeat_lock is held for the whole body.
void OSD::handle_osd_ping(MOSDPing *m)
{
  // Reject pings from a different cluster outright.
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
	     << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    // no map yet (still starting up) -- nothing useful we can do
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  // Locate the Session attached to this connection; lazily bind the
  // peer's shared heartbeat stamps on first contact.
  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // Debug hook: probabilistically drop incoming pings for a while
      // to exercise failure detection.
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
		   ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
		     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // Fold the sender's mono timestamps into our clock-delta estimate;
      // sender_delta_ub is echoed back in the reply below.
      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
	m->up_from,
	mnow,
	m->mono_send_stamp,
	m->delta_ub,
	&sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      // If our own internal threads are wedged, do not reply -- let the
      // peer's failure detection mark us down.
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request"
		 << dendl;
	break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY,
				m->ping_stamp,
				m->mono_ping_stamp,
				mnow,
				service.get_up_epoch(),
				cct->_conf->osd_heartbeat_min_size,
				sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
	// share a newer osdmap with the peer over the cluster connection
	// if theirs looks old
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->ping_stamp,
				  m->mono_ping_stamp,
				  mnow,
				  service.get_up_epoch(),
				  cct->_conf->osd_heartbeat_min_size);
	con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	// Each outstanding ping is keyed by its send stamp and counts
	// how many connections (front/back) have yet to ack it.
	auto acked = i->second.ping_history.find(m->ping_stamp);
	if (acked != i->second.ping_history.end()) {
	  int &unacknowledged = acked->second.second;
	  if (con == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (con == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front
		     << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->ping_stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

	    // Accumulate round-trip times (in usec) for this interval.
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    // NOTE(review): the assert makes the following `if` dead code;
	    // hb_interval_start is set when the ping is sent in heartbeat().
	    ceph_assert(i->second.hb_interval_start != utime_t());
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // Interval complete: fold avg/min/max into the rolling
	      // per-peer vectors and publish 1/5/15-interval aggregates.
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interface ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly shorter than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first completed interval: seed the whole ring with it
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// ring buffer; hb_vector_size is assumed a power of two
		// here (mask with size-1)
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		// Walk the ring newest-to-oldest, publishing aggregates
		// over the last 1, 5 and 15 intervals.
		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval not yet complete: only refresh the "last" samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this ping and all older ones are now fully acknowledged
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      }

      // update clock-delta stamps from the reply's mono timestamps
      s->stamps->got_ping_reply(
	mnow,
	m->mono_send_stamp,
	m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer believes we are down; fetch a newer map to find out
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5916
5917void OSD::heartbeat_entry()
5918{
9f95a23c 5919 std::unique_lock l(heartbeat_lock);
7c673cae
FG
5920 if (is_stopping())
5921 return;
5922 while (!heartbeat_stop) {
5923 heartbeat();
5924
eafe8130
TL
5925 double wait;
5926 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5927 wait = (float)cct->_conf->osd_heartbeat_interval;
5928 } else {
5929 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5930 }
9f95a23c 5931 auto w = ceph::make_timespan(wait);
7c673cae 5932 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
9f95a23c 5933 heartbeat_cond.wait_for(l, w);
7c673cae
FG
5934 if (is_stopping())
5935 return;
5936 dout(30) << "heartbeat_entry woke up" << dendl;
5937 }
5938}
5939
// Scan heartbeat peers for ones whose outstanding pings have passed
// their deadline, and queue failure reports for the monitor.  Caller
// must hold heartbeat_lock.
void OSD::heartbeat_check()
{
  ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
  utime_t now = ceph_clock_now();

  // check for incoming heartbeats (move me elsewhere?)
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p) {

    // nothing outstanding yet for this peer
    if (p->second.first_tx == utime_t()) {
      dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
	       << " yet, skipping" << dendl;
      continue;
    }

    dout(25) << "heartbeat_check osd." << p->first
	     << " first_tx " << p->second.first_tx
	     << " last_tx " << p->second.last_tx
	     << " last_rx_back " << p->second.last_rx_back
	     << " last_rx_front " << p->second.last_rx_front
	     << dendl;
    if (p->second.is_unhealthy(now)) {
      // is_unhealthy() implies ping_history is non-empty here --
      // TODO confirm against HeartbeatInfo::is_unhealthy
      utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
      if (p->second.last_rx_back == utime_t() ||
	  p->second.last_rx_front == utime_t()) {
	// NOTE(review): con_front is dereferenced unconditionally here,
	// while other paths guard for con_front == NULL -- verify a
	// front con always exists for peers reaching this point.
	derr << "heartbeat_check: no reply from "
	     << p->second.con_front->get_peer_addr().get_sockaddr()
	     << " osd." << p->first
	     << " ever on either front or back, first ping sent "
	     << p->second.first_tx
	     << " (oldest deadline " << oldest_deadline << ")"
	     << dendl;
	// fail
	failure_queue[p->first] = p->second.first_tx;
      } else {
	derr << "heartbeat_check: no reply from "
	     << p->second.con_front->get_peer_addr().get_sockaddr()
	     << " osd." << p->first << " since back " << p->second.last_rx_back
	     << " front " << p->second.last_rx_front
	     << " (oldest deadline " << oldest_deadline << ")"
	     << dendl;
	// fail
	failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
      }
    }
  }
}
5988
// One heartbeat round: refresh osd stats / fullness state, then send a
// PING to every heartbeat peer on its back (and, when present, front)
// connection, recording a deadline for each.  Caller must hold
// heartbeat_lock (normally invoked from heartbeat_entry()).
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  // publish the host load average via the scrub scheduler's tracker
  auto load_for_logger = service.get_scrub_services().update_load_average();
  if (load_for_logger) {
    logger->set(l_osd_loadavg, load_for_logger.value());
  }
  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  // every ping sent this round shares the same reply deadline
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect HEARTBEAT_MAX_CONN acks (front + back) for this stamp
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    // include our current clock-delta upper bound in the ping
    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
		   service.get_osdmap_epoch(),
		   MOSDPing::PING,
		   now,
		   mnow,
		   mnow,
		   service.get_up_epoch(),
		   cct->_conf->osd_heartbeat_min_size,
		   delta_ub));

    if (i->second.con_front)
      i->second.con_front->send_message(
	new MOSDPing(monc->get_fsid(),
		     service.get_osdmap_epoch(),
		     MOSDPing::PING,
		     now,
		     mnow,
		     mnow,
		     service.get_up_epoch(),
		     cct->_conf->osd_heartbeat_min_size,
		     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    // no peers to learn map changes from; poll the mon periodically
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
6082
// Messenger callback: a heartbeat connection failed.  If it belongs to a
// known peer, try to reopen fresh front/back connections (transferring
// the Session to them); if the peer vanished from the map, drop it.
// Always returns true (the reset is considered handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  // take the Session ref off the dead connection
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	// attach the existing Session to the replacement connections
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings were sent on the old connection; forget them
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
6121
6122
6123
6124// =========================================
6125
// Periodic timer callback run with osd_lock held: prunes the markdown
// log, refreshes heartbeat peers, nudges boot/mon-map progress, and
// periodically kicks off a purged_snaps scrub.  Re-arms itself.
void OSD::tick()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(10) << "tick" << dendl;

  utime_t now = ceph_clock_now();
  // throw out any obsolete markdown log
  utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
  while (!osd_markdown_log.empty() &&
	 osd_markdown_log.front() + grace < now)
    osd_markdown_log.pop_front();

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    start_boot();
  }

  if (is_waiting_for_healthy() || is_booting()) {
    std::lock_guard l(heartbeat_lock);
    // poll the mon for a newer map while not yet active
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
      last_mon_heartbeat = now;
      dout(1) << __func__ << " checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  // scrub purged_snaps every deep scrub interval
  {
    const utime_t last = superblock.last_purged_snaps_scrub;
    utime_t next = last;
    next += cct->_conf->osd_scrub_min_interval;
    std::mt19937 rng;
    // use a seed that is stable for each scrub interval, but varies
    // by OSD to avoid any herds.
    rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
    double r = (rng() % 1024) / 1024.0;
    next +=
      cct->_conf->osd_scrub_min_interval *
      cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (next < ceph_clock_now()) {
      dout(20) << __func__ << " last_purged_snaps_scrub " << last
	       << " next " << next << " ... now" << dendl;
      scrub_purged_snaps();
    } else {
      dout(20) << __func__ << " last_purged_snaps_scrub " << last
	       << " next " << next << dendl;
    }
  }

  // re-arm for the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
6180
// Periodic timer callback that deliberately runs WITHOUT osd_lock:
// updates perf counters and statfs, drives heartbeat checking, mon
// reports, scrub scheduling and beacons.  Re-arms itself.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard queues work for a future epoch, ask for a newer map
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();

    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // send outside the lock scope above
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  // re-arm for the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
6256
7c673cae
FG
6257// Usage:
6258// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6259// rmomapkey <pool-id> [namespace/]<obj-name> <key>
6260// setomapheader <pool-id> [namespace/]<obj-name> <header>
6261// getomap <pool> [namespace/]<obj-name>
6262// truncobj <pool-id> [namespace/]<obj-name> <newlen>
6263// injectmdataerr [namespace/]<obj-name> [shardid]
6264// injectdataerr [namespace/]<obj-name> [shardid]
6265//
6266// set_recovery_delay [utime]
// Admin-socket test hook: applies debug mutations directly to the
// object store (omap edits, truncation, injected errors) or tweaks
// recovery/fullness test knobs.  Output/error text is written to `ss`.
// Commands: setomapval, rmomapkey, setomapheader, getomap, truncobj,
// injectmdataerr, injectdataerr, set_recovery_delay, injectfull.
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
     std::string_view command,
     const cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      ss << "Invalid pool '" << poolstr << "''";
      return;
    }

    // objname may be "namespace/objname"; split on the first '/'
    string objname, nspace;
    cmd_getval(cmdmap, "objname", objname);
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    if (curmap->pg_is_ec(rawpg)) {
      // omap edits make no sense on EC pools; only error injection allowed
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
	ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
	return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(cmdmap, "key", key);
      cmd_getval(cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "rmomapkey") {
      string key;
      cmd_getval(cmdmap, "key", key);

      t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "setomapheader") {
      bufferlist newheader;
      string headerstr;

      cmd_getval(cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      auto ch = store->open_collection(coll_t(pgid));
      if (!ch) {
	ss << "unable to open collection for " << pgid;
	r = -ENOENT;
      } else {
	r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
	if (r >= 0) {
	  ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
	  for (map<string, bufferlist>::iterator it = keyvals.begin();
	       it != keyvals.end(); ++it)
	    ss << " key=" << (*it).first << " val="
	       << string((*it).second.c_str(), (*it).second.length());
	} else {
	  ss << "error=" << r;
	}
      }
    } else if (command == "truncobj") {
      int64_t trunclen;
      cmd_getval(cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "injectdataerr") {
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
    ostringstream oss;
    oss << delay;
    int r = service->cct->_conf.set_val("osd_recovery_delay_start",
					oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      return;
    }
    service->cct->_conf.apply_changes(nullptr);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command == "injectfull") {
    int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
    string type = cmd_getval_or<string>(cmdmap, "type", "full");
    OSDService::s_names state;

    // "none" (or count 0) clears any previously injected fullness state
    if (type == "none" || count == 0) {
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
6435
7c673cae
FG
6436// =========================================
6437
// Messenger callback for a (re)established connection.  Only the mon
// connection matters here: on a fresh mon session we either continue the
// boot sequence or resend all state the mon may have lost (fullness,
// alive, pg_temp, merges, created pgs, failures, beacon).
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6481
// Fast-path hook for a newly established outgoing connection.  For
// non-mon/non-mgr peers, attach a Session to the connection if it does
// not already have one.  We only initiate such connections to other
// OSDs (asserted), so the session's entity is tagged as an OSD.
void OSD::ms_handle_fast_connect(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
      s = ceph::make_ref<Session>(cct, con);
      con->set_priv(s);
      dout(10) << " new session (outgoing) " << s << " con=" << s->con
	       << " addr=" << s->con->get_peer_addr() << dendl;
      // we don't connect to clients
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6497
// Fast-path hook for a newly accepted incoming connection.  Mirror of
// ms_handle_fast_connect: attach a Session for non-mon/non-mgr peers
// if one is not present (it may already exist if we raced with our own
// outgoing connect to the same peer).
void OSD::ms_handle_fast_accept(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
      s = ceph::make_ref<Session>(cct, con);
      con->set_priv(s);
      dout(10) << "new session (incoming)" << s << " con=" << con
	       << " addr=" << con->get_peer_addr()
	       << " must have raced with connect" << dendl;
      // only OSD peers reach this path without a pre-existing session
      ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
  }
}
6513
// Connection reset notification.  Detach the connection's Session:
// reset its watch state, break the con <-> session reference cycle,
// then clean up remaining session state.  Returns true iff a session
// was attached to the connection.
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset(); // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6529
// Peer actively refused our connection (ECONNREFUSED).  If
// osd_fast_fail_on_connection_refused is enabled and the peer is an
// OSD the current map still considers up, report it failed to the mon
// immediately (FLAG_IMMEDIATE | FLAG_FAILED) instead of waiting for
// the heartbeat grace period to expire.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(
	  new MOSDFailure(
	    monc->get_fsid(),
	    id,
	    osdmap->get_addrs(id),
	    cct->_conf->osd_heartbeat_grace + 1,
	    osdmap->get_epoch(),
	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
	    ));
      }
    }
  }
  return true;
}
6562
f67539c2 6563struct CB_OSD_GetVersion {
7c673cae 6564 OSD *osd;
f67539c2
TL
6565 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6566 void operator ()(boost::system::error_code ec, version_t newest,
6567 version_t oldest) {
6568 if (!ec)
7c673cae
FG
6569 osd->_got_mon_epochs(oldest, newest);
6570 }
6571};
6572
// Begin the boot sequence.  If we are not healthy, (re)enter the
// waiting-for-healthy state and kick heartbeats; otherwise enter
// PREBOOT and ask the mon for its osdmap version span, which continues
// asynchronously via CB_OSD_GetVersion -> _got_mon_epochs -> _preboot.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  monc->get_version("osdmap", CB_OSD_GetVersion(this));
}
6590
6591void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6592{
11fdf7f2 6593 std::lock_guard l(osd_lock);
7c673cae
FG
6594 if (is_preboot()) {
6595 _preboot(oldest, newest);
6596 }
6597}
6598
// Decide, while in PREBOOT, whether we can mark ourselves up.  Given
// the mon's available osdmap span [oldest, newest], either (a) stay
// blocked on a precondition (no initial map, destroyed, NOUP flag,
// SORTBITWISE missing, fullness update pending, purged-snaps catch-up
// needed), (b) queue _send_boot() on the boot_finisher once our map is
// recent enough, or (c) subscribe for more maps and wait to be called
// again.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
	     superblock.purged_snaps_last < superblock.current_epoch) {
    // octopus+ mons track purged snaps; catch up before booting
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
	     << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
	[this](int r) {
	  std::unique_lock l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while waiting so pg work can make progress
	    l.unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(get_osdmap_epoch());
	    }
	    l.lock();
	  }
	  // re-check: state may have changed while osd_lock was dropped
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6668
9f95a23c
TL
6669void OSD::_get_purged_snaps()
6670{
6671 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6672 // overlapping requests to the mon, which will be somewhat inefficient, but
6673 // it should be reliable.
6674 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6675 << ", newest_map " << superblock.current_epoch << dendl;
6676 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6677 superblock.purged_snaps_last + 1,
6678 superblock.current_epoch + 1);
6679 monc->send_mon_message(m);
6680}
6681
// Handle the mon's reply to MMonGetPurgedSnaps: persist the returned
// purged-snap ranges in the meta collection, advance
// superblock.purged_snaps_last, and either keep fetching (if epochs
// remain) or resume booting via start_boot().  Stale replies (we left
// preboot, or the reply is behind what we already recorded) are
// dropped.  Always consumes m.
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    // stale or unexpected reply; nothing to record
    goto out;
  } else {
    OSDriver osdriver{store.get(), service.meta_ch, make_purged_snaps_oid()};
    SnapMapper::record_purged_snaps(
      cct,
      osdriver,
      osdriver.get_transaction(&t),
      m->purged_snaps);
  }
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    // still behind the newest map; request the next chunk
    _get_purged_snaps();
  } else {
    start_boot();
  }
out:
  m->put();
}
6711
7c673cae
FG
6712void OSD::send_full_update()
6713{
6714 if (!service.need_fullness_update())
6715 return;
6716 unsigned state = 0;
6717 if (service.is_full()) {
6718 state = CEPH_OSD_FULL;
6719 } else if (service.is_backfillfull()) {
6720 state = CEPH_OSD_BACKFILLFULL;
6721 } else if (service.is_nearfull()) {
6722 state = CEPH_OSD_NEARFULL;
6723 }
6724 set<string> s;
6725 OSDMap::calc_state_set(state, s);
6726 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6727 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6728}
6729
// Enter the WAITING_FOR_HEALTHY state: clear the heartbeat resample
// timestamp and subscribe to osdmap updates so we learn if the peers
// we are waiting on get marked down.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6739
// Return whether this OSD looks healthy enough to boot.  The internal
// heartbeat map must be healthy; additionally, while we are in the
// waiting-for-healthy state (and have recent markdowns on record), a
// minimum fraction (osd_heartbeat_min_healthy_ratio) of heartbeat
// peers must be responding.
bool OSD::_is_healthy()
{
  if (!cct->get_heartbeat_map()->is_healthy()) {
    dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
    return false;
  }

  if (is_waiting_for_healthy()) {
    utime_t now = ceph_clock_now();
    if (osd_markdown_log.empty()) {
      // no recent markdowns recorded; don't gate boot on peer pings
      dout(5) << __func__ << " force returning true since last markdown"
              << " was " << cct->_conf->osd_max_markdown_period
              << "s ago" << dendl;
      return true;
    }
    std::lock_guard l(heartbeat_lock);
    // count healthy heartbeat peers
    int num = 0, up = 0;
    for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
	 p != heartbeat_peers.end();
	 ++p) {
      if (p->second.is_healthy(now))
	++up;
      ++num;
    }
    if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
      dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
	      << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
      return false;
    }
  }

  return true;
}
6773
// Announce ourselves to the mon with an MOSDBoot message.  Before
// sending, finalize our four address vectors (client/public, cluster,
// hb back, hb front) — filling in unknown addresses from the ones we
// do know — ensure each local loopback connection has a Session, set
// NUMA affinity, and collect our metadata into the boot message.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  // make sure the cluster loopback connection has a Session attached
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6836
// Fill *pm with this OSD's metadata for the mon: config paths,
// messenger addresses, object store type/details, osdspec and creation
// info, system info, network interface names, NUMA topology, and
// per-device metadata.  Errors from device metadata collection are
// logged but not fatal.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  // osdspec affinity / creation metadata recorded by the deployment tool
  // (empty string when the meta key is missing)
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  string ceph_version_when_created;
  r = store->read_meta("ceph_version_when_created", &ceph_version_when_created);
  if (r <0 || ceph_version_when_created.empty()) {
    ceph_version_when_created = "";
  }
  (*pm)["ceph_version_when_created"] = ceph_version_when_created;
  string created_at;
  r = store->read_meta("created_at", &created_at);
  if (r < 0 || created_at.empty()) {
    created_at = "";
  }
  (*pm)["created_at"] = created_at;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // report a single node only when both ifaces resolved to the same one
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6931
// Record that we want the mon to advance our up_thru to at least
// `want`; if that raises the wanted value, immediately attempt to
// notify the mon via send_alive().  map_lock (shared) protects the
// osdmap read; mon_report_lock serializes with other mon reports.
void OSD::queue_want_up_thru(epoch_t want)
{
  std::shared_lock map_locker{map_lock};
  epoch_t cur = get_osdmap()->get_up_thru(whoami);
  std::lock_guard report_locker(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
	     << ", currently " << cur
	     << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asking for an equal or newer epoch; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
	     << ", currently " << cur
	     << dendl;
  }
}
6949
// If we want a higher up_thru than the current osdmap records for us,
// send an MOSDAlive to the mon.  No-op if we don't exist in the map.
// Caller must hold mon_report_lock (asserted).
void OSD::send_alive()
{
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  const auto osdmap = get_osdmap();
  if (!osdmap->exists(whoami))
    return;
  epoch_t up_thru = osdmap->get_up_thru(whoami);
  dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
  if (up_thru_wanted > up_thru) {
    dout(10) << "send_alive want " << up_thru_wanted << dendl;
    monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
  }
}
6963
// Request full osdmaps [first, last] from the mon, merging with any
// request already outstanding — the window
// [requested_full_first, requested_full_last] only ever extends
// forward.  Duplicate (fully covered) requests are dropped.  Caller
// must hold osd_lock (asserted).
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6989
// Note receipt of full map epoch e and advance the outstanding request
// window [requested_full_first, requested_full_last]; the window is
// reset to 0 once the last requested epoch has arrived.  Caller must
// hold osd_lock (asserted).
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no request outstanding
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale: already advanced past this epoch
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // window satisfied; clear it
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
7017
7018void OSD::requeue_failures()
7019{
11fdf7f2 7020 std::lock_guard l(heartbeat_lock);
7c673cae
FG
7021 unsigned old_queue = failure_queue.size();
7022 unsigned old_pending = failure_pending.size();
11fdf7f2 7023 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
7024 failure_queue[p->first] = p->second.first;
7025 failure_pending.erase(p++);
7026 }
7027 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
7028 << failure_queue.size() << dendl;
7029}
7030
// Drain failure_queue: report each queued peer failure to the mon (one
// MOSDFailure each) and track it in failure_pending so it can later be
// cancelled (send_still_alive) or requeued.  Caller must hold map_lock
// and mon_report_lock (asserted); heartbeat_lock is taken here.
void OSD::send_failures()
{
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    // skip the send if a report for this osd is already in flight
    if (!failure_pending.count(osd)) {
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
7055
11fdf7f2 7056void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 7057{
11fdf7f2
TL
7058 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
7059 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
7060 monc->send_mon_message(m);
7061}
7062
11fdf7f2 7063void OSD::cancel_pending_failures()
7c673cae 7064{
11fdf7f2
TL
7065 std::lock_guard l(heartbeat_lock);
7066 auto it = failure_pending.begin();
7067 while (it != failure_pending.end()) {
7068 dout(10) << __func__ << " canceling in-flight failure report for osd."
7069 << it->first << dendl;
9f95a23c 7070 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 7071 failure_pending.erase(it++);
7c673cae 7072 }
7c673cae
FG
7073}
7074
// Send an MOSDBeacon to the mon, carrying min_last_epoch_clean (and
// the pgs it was computed over) plus the last purged-snaps scrub
// stamp; records last_sent_beacon.  Requires a monmap that demands the
// LUMINOUS feature; otherwise the beacon is skipped (debug-logged).
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // min_last_epoch_clean and its pg list are read under their lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
			      min_last_epoch_clean,
			      superblock.last_purged_snaps_scrub,
			      cct->_conf->osd_beacon_report_interval);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
7099
7c673cae
FG
7100void OSD::handle_command(MCommand *m)
7101{
7102 ConnectionRef con = m->get_connection();
9f95a23c 7103 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 7104 if (!session) {
9f95a23c 7105 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
7106 m->put();
7107 return;
7108 }
9f95a23c
TL
7109 if (!session->caps.allow_all()) {
7110 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
7111 m->put();
7112 return;
7113 }
9f95a23c 7114 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
7115 m->put();
7116}
7117
f64942e4
AA
namespace {
  // RAII inverse of std::lock_guard: unlocks the given mutex on
  // construction and re-locks it on destruction.  Used to safely drop
  // a lock across a call that must not be made while holding it.
  class unlock_guard {
    ceph::mutex& m;
  public:
    explicit unlock_guard(ceph::mutex& mutex)
      : m(mutex)
    {
      m.unlock();
    }
    unlock_guard(unlock_guard&) = delete;
    ~unlock_guard() {
      m.lock();
    }
  };
}
7133
9f95a23c 7134void OSD::scrub_purged_snaps()
7c673cae 7135{
9f95a23c
TL
7136 dout(10) << __func__ << dendl;
7137 ceph_assert(ceph_mutex_is_locked(osd_lock));
20effc67 7138 SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
9f95a23c
TL
7139 make_snapmapper_oid(),
7140 make_purged_snaps_oid());
7141 clog->debug() << "purged_snaps scrub starts";
7142 osd_lock.unlock();
7143 s.run();
7144 if (s.stray.size()) {
7145 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
7146 } else {
7147 clog->debug() << "purged_snaps scrub ok";
224ce89b 7148 }
9f95a23c
TL
7149 set<pair<spg_t,snapid_t>> queued;
7150 for (auto& [pool, snap, hash, shard] : s.stray) {
7151 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
7152 if (!pi) {
7153 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
7154 continue;
11fdf7f2 7155 }
9f95a23c
TL
7156 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
7157 spg_t spgid(pgid, shard);
7158 pair<spg_t,snapid_t> p(spgid, snap);
7159 if (queued.count(p)) {
7160 dout(20) << __func__ << " pg " << spgid << " snap " << snap
7161 << " already queued" << dendl;
7162 continue;
11fdf7f2 7163 }
9f95a23c
TL
7164 PGRef pg = lookup_lock_pg(spgid);
7165 if (!pg) {
7166 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
7167 continue;
11fdf7f2 7168 }
9f95a23c
TL
7169 queued.insert(p);
7170 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
7171 << snap << dendl;
7172 pg->queue_snap_retrim(snap);
7173 pg->unlock();
7c673cae 7174 }
9f95a23c
TL
7175 osd_lock.lock();
7176 if (is_stopping()) {
7177 return;
7178 }
7179 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
7180 ObjectStore::Transaction t;
7181 superblock.last_purged_snaps_scrub = ceph_clock_now();
7182 write_superblock(t);
7183 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7184 ceph_assert(tr == 0);
7185 if (is_active()) {
7186 send_beacon(ceph::coarse_mono_clock::now());
7187 }
7188 dout(10) << __func__ << " done" << dendl;
11fdf7f2
TL
7189}
7190
// Collect SMART health metrics for the physical devices backing this
// OSD and write them to `ss` as pretty-printed JSON keyed by device
// id.  Logical devices (dm-*) and devices without a unique id are
// skipped; if `only_devid` is non-empty, output is restricted to that
// device.
void OSD::probe_smart(const string& only_devid, ostream& ss)
{
  set<string> devnames;
  store->get_devices(&devnames);
  uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
    "osd_smart_report_timeout");

  // == typedef std::map<std::string, mValue> mObject;
  json_spirit::mObject json_map;

  for (auto dev : devnames) {
    // smartctl works only on physical devices; filter out any logical device
    if (dev.find("dm-") == 0) {
      continue;
    }

    string err;
    string devid = get_device_id(dev, &err);
    if (devid.size() == 0) {
      dout(10) << __func__ << " no unique id for dev " << dev << " ("
	       << err << "), skipping" << dendl;
      continue;
    }
    if (only_devid.size() && devid != only_devid) {
      continue;
    }

    json_spirit::mValue smart_json;
    if (block_device_get_metrics(dev, smart_timeout,
				 &smart_json)) {
      dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
      continue;
    }
    json_map[devid] = smart_json;
  }
  json_spirit::write(json_map, ss, json_spirit::pretty_print);
}
7228
7229bool OSD::heartbeat_dispatch(Message *m)
7230{
7231 dout(30) << "heartbeat_dispatch " << m << dendl;
7232 switch (m->get_type()) {
7233
7234 case CEPH_MSG_PING:
7235 dout(10) << "ping from " << m->get_source_inst() << dendl;
7236 m->put();
7237 break;
7238
7239 case MSG_OSD_PING:
7240 handle_osd_ping(static_cast<MOSDPing*>(m));
7241 break;
7242
7243 default:
7244 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7245 m->put();
7246 }
7247
7248 return true;
7249}
7250
// Generic (slow-path) message dispatch.  MSG_OSD_MARK_ME_DOWN acks are
// handled without taking osd_lock; everything else goes through
// _dispatch() under osd_lock, and messages are dropped once we are
// stopping.  Always returns true (message handled/consumed).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    m->put();
    return true;
  }

  _dispatch(m);

  osd_lock.unlock();

  return true;
}
7275
9f95a23c
TL
// Share our osdmap with the peer on `con` if we believe it is behind.
// `peer_epoch_lb` is a lower bound on the epoch the peer already has
// (e.g. the epoch it sent an op with); it is folded into the session's
// last_sent_epoch before deciding whether to send an incremental map.
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    // peer already has everything we do
    return;
  }

  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  // advance last_sent_epoch, guarding against a concurrent sharer
  // having already pushed it further
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
7318
9f95a23c 7319void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7c673cae 7320{
9f95a23c 7321 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7c673cae
FG
7322
7323 auto i = session->waiting_on_map.begin();
7324 while (i != session->waiting_on_map.end()) {
7325 OpRequestRef op = &(*i);
11fdf7f2 7326 ceph_assert(ms_can_fast_dispatch(op->get_req()));
9f95a23c 7327 auto m = op->get_req<MOSDFastDispatchOp>();
7c673cae
FG
7328 if (m->get_min_epoch() > osdmap->get_epoch()) {
7329 break;
7330 }
7331 session->waiting_on_map.erase(i++);
7332 op->put();
7333
7334 spg_t pgid;
7335 if (m->get_type() == CEPH_MSG_OSD_OP) {
7336 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7337 static_cast<const MOSDOp*>(m)->get_pg());
7338 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7339 continue;
7340 }
7341 } else {
7342 pgid = m->get_spg();
7343 }
11fdf7f2 7344 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
7345 }
7346
7347 if (session->waiting_on_map.empty()) {
7348 clear_session_waiting_on_map(session);
7349 } else {
7350 register_session_waiting_on_map(session);
7351 }
7352}
7353
// Fast-dispatch entry point for messages that bypass the main dispatch
// queue.  Peering/control messages are handled (or enqueued as peering
// events) directly; everything else becomes a tracked OpRequest and is
// queued to the appropriate PG shard.  Ownership of 'm' is consumed on
// every path (put() or handed to a handler/op tracker).
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?  These message types never become OpRequests.
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;
  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      // only accept peering traffic from an authenticated OSD peer
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // wrap the message in a tracked OpRequest (op_tracker now shares
  // responsibility for the message's lifetime)
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }
  op->osd_parent_span = tracing::osd::tracer.start_trace("op-request-created");

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the intrusive waiting_on_map list; dropped when the
      // op is dequeued in dispatch_session_waiting()
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7446
aee94f69 7447int OSD::ms_handle_fast_authentication(Connection *con)
7c673cae 7448{
11fdf7f2 7449 int ret = 0;
9f95a23c 7450 auto s = ceph::ref_cast<Session>(con->get_priv());
11fdf7f2 7451 if (!s) {
9f95a23c
TL
7452 s = ceph::make_ref<Session>(cct, con);
7453 con->set_priv(s);
11fdf7f2
TL
7454 s->entity_name = con->get_peer_entity_name();
7455 dout(10) << __func__ << " new session " << s << " con " << s->con
7456 << " entity " << s->entity_name
7457 << " addr " << con->get_peer_addrs() << dendl;
7458 } else {
7459 dout(10) << __func__ << " existing session " << s << " con " << s->con
7460 << " entity " << s->entity_name
7461 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7462 }
7463
11fdf7f2 7464 AuthCapsInfo &caps_info = con->get_peer_caps_info();
9f95a23c 7465 if (caps_info.allow_all) {
11fdf7f2 7466 s->caps.set_allow_all();
9f95a23c 7467 } else if (caps_info.caps.length() > 0) {
11fdf7f2
TL
7468 bufferlist::const_iterator p = caps_info.caps.cbegin();
7469 string str;
7470 try {
7471 decode(str, p);
7472 }
f67539c2 7473 catch (ceph::buffer::error& e) {
11fdf7f2
TL
7474 dout(10) << __func__ << " session " << s << " " << s->entity_name
7475 << " failed to decode caps string" << dendl;
9f95a23c 7476 ret = -EACCES;
11fdf7f2
TL
7477 }
7478 if (!ret) {
7c673cae 7479 bool success = s->caps.parse(str);
11fdf7f2
TL
7480 if (success) {
7481 dout(10) << __func__ << " session " << s
7482 << " " << s->entity_name
7483 << " has caps " << s->caps << " '" << str << "'" << dendl;
7484 ret = 1;
7485 } else {
7486 dout(10) << __func__ << " session " << s << " " << s->entity_name
7487 << " failed to parse caps '" << str << "'" << dendl;
9f95a23c 7488 ret = -EACCES;
11fdf7f2 7489 }
7c673cae 7490 }
7c673cae 7491 }
11fdf7f2 7492 return ret;
7c673cae
FG
7493}
7494
7c673cae
FG
7495void OSD::_dispatch(Message *m)
7496{
9f95a23c 7497 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7498 dout(20) << "_dispatch " << m << " " << *m << dendl;
7499
7500 switch (m->get_type()) {
7c673cae
FG
7501 // -- don't need OSDMap --
7502
7503 // map and replication
7504 case CEPH_MSG_OSD_MAP:
7505 handle_osd_map(static_cast<MOSDMap*>(m));
7506 break;
9f95a23c
TL
7507 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7508 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7509 break;
7c673cae
FG
7510
7511 // osd
11fdf7f2
TL
7512 case MSG_COMMAND:
7513 handle_command(static_cast<MCommand*>(m));
7514 return;
11fdf7f2 7515 }
7c673cae
FG
7516}
7517
11fdf7f2
TL
7518void OSD::handle_fast_scrub(MOSDScrub2 *m)
7519{
7520 dout(10) << __func__ << " " << *m << dendl;
7521 if (!require_mon_or_mgr_peer(m)) {
7522 m->put();
7523 return;
7524 }
7525 if (m->fsid != monc->get_fsid()) {
7526 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7527 << dendl;
7528 m->put();
7529 return;
7530 }
7531 for (auto pgid : m->scrub_pgs) {
7532 enqueue_peering_evt(
7533 pgid,
7534 PGPeeringEventRef(
7535 std::make_shared<PGPeeringEvent>(
7536 m->epoch,
7537 m->epoch,
9f95a23c 7538 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7539 }
7540 m->put();
7541}
7542
7c673cae
FG
7543bool OSD::scrub_random_backoff()
7544{
7545 bool coin_flip = (rand() / (double)RAND_MAX >=
7546 cct->_conf->osd_scrub_backoff_ratio);
7547 if (!coin_flip) {
1e59de90
TL
7548 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off (ratio: "
7549 << cct->_conf->osd_scrub_backoff_ratio << ")" << dendl;
7c673cae
FG
7550 return true;
7551 }
7552 return false;
7553}
7554
7c673cae 7555
20effc67 7556void OSD::sched_scrub()
f67539c2 7557{
20effc67 7558 auto& scrub_scheduler = service.get_scrub_services();
f67539c2 7559
1e59de90
TL
7560 if (auto blocked_pgs = scrub_scheduler.get_blocked_pgs_count();
7561 blocked_pgs > 0) {
7562 // some PGs managed by this OSD were blocked by a locked object during
7563 // scrub. This means we might not have the resources needed to scrub now.
7564 dout(10)
7565 << fmt::format(
7566 "{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
7567 __func__,
7568 blocked_pgs)
7569 << dendl;
7570 }
7571
20effc67
TL
7572 // fail fast if no resources are available
7573 if (!scrub_scheduler.can_inc_scrubs()) {
7574 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7575 return;
f67539c2 7576 }
f67539c2 7577
20effc67
TL
7578 // if there is a PG that is just now trying to reserve scrub replica resources -
7579 // we should wait and not initiate a new scrub
7580 if (scrub_scheduler.is_reserving_now()) {
7581 dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
7582 return;
9f95a23c 7583 }
9f95a23c 7584
20effc67 7585 Scrub::ScrubPreconds env_conditions;
28e407b8 7586
20effc67
TL
7587 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7588 if (!cct->_conf->osd_repair_during_recovery) {
7589 dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
7590 << dendl;
7591 return;
28e407b8 7592 }
20effc67
TL
7593 dout(10) << __func__
7594 << " will only schedule explicitly requested repair due to active recovery"
7595 << dendl;
7596 env_conditions.allow_requested_repair_only = true;
28e407b8
AA
7597 }
7598
20effc67
TL
7599 if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
7600 dout(20) << __func__ << " sched_scrub starts" << dendl;
7601 auto all_jobs = scrub_scheduler.list_registered_jobs();
7602 for (const auto& sj : all_jobs) {
7603 dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
7c673cae
FG
7604 }
7605 }
20effc67
TL
7606
7607 auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
7608 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
7609 << ")" << dendl;
7c673cae
FG
7610}
7611
20effc67
TL
// Try to start a scrub on one specific PG selected by the scrub queue.
// Locks the PG for the duration of the checks and unlocks it on every
// return path.
// @param pgid                        candidate PG
// @param allow_requested_repair_only when true (recovery active), only
//                                    explicitly requested repairs may run
// @return a schedule_result_t describing whether the scrub started, or why
//         it was skipped
Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
						      bool allow_requested_repair_only)
{
  dout(20) << __func__ << " trying " << pgid << dendl;

  // we have a candidate to scrub. We need some PG information to know if scrubbing is
  // allowed

  PGRef pg = osd->lookup_lock_pg(pgid);
  if (!pg) {
    // the PG was dequeued in the short timespan between creating the candidates list
    // (collect_ripe_jobs()) and here
    dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
    return Scrub::schedule_result_t::no_such_pg;
  }

  // This has already started, so go on to the next scrub job
  if (pg->is_scrub_queued_or_active()) {
    pg->unlock();
    dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
    return Scrub::schedule_result_t::already_started;
  }
  // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
  if (allow_requested_repair_only && !pg->get_planned_scrub().must_repair) {
    pg->unlock();
    dout(10) << __func__ << " skip " << pgid
	     << " because repairing is not explicitly requested on it" << dendl;
    return Scrub::schedule_result_t::preconditions;
  }

  // hand the actual scheduling decision to the PG itself
  auto scrub_attempt = pg->sched_scrub();
  pg->unlock();
  return scrub_attempt;
}
7646
494da23a
TL
7647void OSD::resched_all_scrubs()
7648{
7649 dout(10) << __func__ << ": start" << dendl;
20effc67
TL
7650 auto all_jobs = service.get_scrub_services().list_registered_jobs();
7651 for (auto& e : all_jobs) {
7652
7653 auto& job = *e;
7654 dout(20) << __func__ << ": examine " << job.pgid << dendl;
7655
7656 PGRef pg = _lookup_lock_pg(job.pgid);
7657 if (!pg)
7658 continue;
7659
1e59de90 7660 if (!pg->get_planned_scrub().must_scrub && !pg->get_planned_scrub().need_auto) {
20effc67
TL
7661 dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
7662 pg->reschedule_scrub();
7663 }
7664 pg->unlock();
494da23a
TL
7665 }
7666 dout(10) << __func__ << ": done" << dendl;
7667}
7668
11fdf7f2
TL
// Build an MPGStats message carrying this OSD's osd_stat plus the pg_stat of
// every PG for which we are currently primary.  As a side effect (under
// min_last_epoch_clean_lock) it recomputes min_last_epoch_clean and the list
// of PGs that contributed to it.  The caller owns the returned message.
MPGStats* OSD::collect_pg_stats()
{
  dout(15) << __func__ << dendl;
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called. This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  std::shared_lock l{map_lock};

  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();

  auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
  m->osd_stat = cur_stat;

  std::lock_guard lec{min_last_epoch_clean_lock};
  // start from the current epoch; each primary PG below may lower it
  min_last_epoch_clean = get_osdmap_epoch();
  min_last_epoch_clean_pgs.clear();

  auto now_is = ceph::coarse_real_clock::now();

  std::set<int64_t> pool_set;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    auto pool = pg->pg_id.pgid.pool();
    // record the pool even for non-primary PGs (used for pool_statfs below)
    pool_set.emplace((int64_t)pool);
    if (!pg->is_primary()) {
      continue;
    }
    pg->with_pg_stats(now_is, [&](const pg_stat_t& s, epoch_t lec) {
	m->pg_stat[pg->pg_id.pgid] = s;
	min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
	min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
      });
  }
  store_statfs_t st;
  bool per_pool_stats = true;
  bool per_pool_omap_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
    if (r == -ENOTSUP) {
      // objectstore can't report per-pool usage; fall back to OSD totals only
      per_pool_stats = false;
      break;
    } else {
      assert(r >= 0);
      m->pool_stat[p] = st;
    }
  }

  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
  m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;

  return m;
}
7c673cae 7726
11fdf7f2 7727vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7728{
11fdf7f2
TL
7729 vector<DaemonHealthMetric> metrics;
7730 {
7731 utime_t oldest_secs;
7732 const utime_t now = ceph_clock_now();
7733 auto too_old = now;
7734 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7735 int slow = 0;
7736 TrackedOpRef oldest_op;
20effc67
TL
7737 OSDMapRef osdmap = get_osdmap();
7738 // map of slow op counts by slow op event type for an aggregated logging to
7739 // the cluster log.
7740 map<uint8_t, int> slow_op_types;
7741 // map of slow op counts by pool for reporting a pool name with highest
7742 // slow ops.
7743 map<uint64_t, int> slow_op_pools;
7744 bool log_aggregated_slow_op =
7745 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
11fdf7f2
TL
7746 auto count_slow_ops = [&](TrackedOp& op) {
7747 if (op.get_initiated() < too_old) {
9f95a23c
TL
7748 stringstream ss;
7749 ss << "slow request " << op.get_desc()
7750 << " initiated "
7751 << op.get_initiated()
7752 << " currently "
7753 << op.state_string();
7754 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
20effc67
TL
7755 if (log_aggregated_slow_op) {
7756 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7757 uint8_t op_type = req->state_flag();
7758 auto m = req->get_req<MOSDFastDispatchOp>();
7759 uint64_t poolid = m->get_spg().pgid.m_pool;
7760 slow_op_types[op_type]++;
7761 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7762 slow_op_pools[poolid]++;
7763 }
7764 }
7765 } else {
7766 clog->warn() << ss.str();
7767 }
11fdf7f2
TL
7768 slow++;
7769 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7770 oldest_op = &op;
7771 }
7772 return true;
7773 } else {
7774 return false;
7775 }
7776 };
7777 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7778 if (slow) {
7779 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7780 << oldest_op->get_desc() << dendl;
20effc67
TL
7781 if (log_aggregated_slow_op &&
7782 slow_op_types.size() > 0) {
7783 stringstream ss;
7784 ss << slow << " slow requests (by type [ ";
7785 for (const auto& [op_type, count] : slow_op_types) {
7786 ss << "'" << OpRequest::get_state_string(op_type)
7787 << "' : " << count
7788 << " ";
7789 }
7790 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7791 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7792 return p1.second < p2.second;
7793 });
7794 if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7795 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7796 ss << "] most affected pool [ '"
7797 << pool_name
7798 << "' : "
7799 << slow_pool_it->second
7800 << " ])";
7801 } else {
7802 ss << "])";
7803 }
7804 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7805 clog->warn() << ss.str();
7806 }
11fdf7f2
TL
7807 }
7808 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7809 } else {
7810 // no news is not good news.
7811 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7812 }
7813 }
7814 {
7815 std::lock_guard l(pending_creates_lock);
7816 auto n_primaries = pending_creates_from_mon;
7817 for (const auto& create : pending_creates_from_osd) {
7818 if (create.second) {
7819 n_primaries++;
7820 }
b32b8144 7821 }
11fdf7f2 7822 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7823 }
b32b8144
FG
7824 return metrics;
7825}
7826
7c673cae
FG
7827// =====================================================
7828// MAP
7c673cae
FG
7829/** update_map
7830 * assimilate new OSDMap(s). scan pgs, etc.
7831 */
7832
7833void OSD::note_down_osd(int peer)
7834{
9f95a23c
TL
7835 ceph_assert(ceph_mutex_is_locked(osd_lock));
7836 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7837
9f95a23c 7838 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7839 failure_queue.erase(peer);
7840 failure_pending.erase(peer);
7841 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7842 if (p != heartbeat_peers.end()) {
9f95a23c 7843 p->second.clear_mark_down();
7c673cae
FG
7844 heartbeat_peers.erase(p);
7845 }
7c673cae
FG
7846}
7847
// React to a peer OSD coming up: flag the heartbeat peer set for a refresh.
// The 'peer' argument is unused here; the refresh recomputes all peers.
void OSD::note_up_osd(int peer)
{
  heartbeat_set_peers_need_update();
}
7852
// Completion context registered on the objectstore transaction that persists
// a batch of osdmaps.  Once epochs [first, last] are committed it invokes
// OSD::_committed_osd_maps() and then drops the MOSDMap reference it holds.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range covered by this commit
  MOSDMap *msg;         // owned reference; put() after the callback runs
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7864
7c673cae
FG
7865void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7866{
11fdf7f2 7867 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7868 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7869 return;
7870
11fdf7f2 7871 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7872
7c673cae
FG
7873 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7874 force_request) {
7875 monc->renew_subs();
7876 }
7877}
7878
// Delete stored osdmaps (full and incremental) older than both 'oldest' and
// the lowest epoch still pinned by the map cache, advancing
// superblock.oldest_map as we go.  Deletions are committed in batches of
// osd_target_transaction_size to bound transaction size.
// @param oldest     lower bound on what may be trimmed (from the map source)
// @param nreceived  number of maps in the triggering MOSDMap message; we trim
//                   at least this many per batch to keep pace with ingest
// @param skip_maps  true when the incoming message skipped epochs; in that
//                   case we stop after one batch (see comment below)
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim maps the cache still pins
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch before accumulating more deletes
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // commit the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7918
// Ingest a batch of osdmaps from an MOSDMap message: validate the sender,
// decode/reconstruct each epoch (full maps directly, incrementals applied to
// the previous full map), persist them plus superblock/pg_num_history/
// purged_snaps bookkeeping in one objectstore transaction, and register a
// C_OnMapCommit that activates the new maps once the commit lands.
// Consumes the message reference on every early-return path; on the success
// path ownership passes to C_OnMapCommit.
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // osd_min = lowest epoch any shard's PGs have consumed so far
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    epoch_t osdmap_epoch = get_osdmap_epoch();
    if (osd_min > 0 &&
	osdmap_epoch > max_lag &&
	osdmap_epoch - max_lag > osd_min) {
      epoch_t need = osdmap_epoch - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocked so PG work can make progress
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(ceph_mutex_is_locked(osd_lock));
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only mons and osds may feed us maps
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
	     << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->cluster_osdmap_trim_lower_bound
	  << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);

  // track the cluster-wide trim lower bound so we know how far back maps
  // can still be fetched
  if (superblock.cluster_osdmap_trim_lower_bound <
      m->cluster_osdmap_trim_lower_bound) {
    superblock.cluster_osdmap_trim_lower_bound =
      m->cluster_osdmap_trim_lower_bound;
    dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: "
	     << superblock.cluster_osdmap_trim_lower_bound << dendl;
    ceph_assert(
      superblock.cluster_osdmap_trim_lower_bound >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->cluster_osdmap_trim_lower_bound <= superblock.newest_map + 1) {
      // the missing epochs still exist somewhere; re-subscribe for them
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can.  this
    //  1- is good to have
    //  2- is at present the only way to ensure that we get a *full* map as
    //     the first map!
    if (m->cluster_osdmap_trim_lower_bound < first) {
      osdmap_subscribe(m->cluster_osdmap_trim_lower_bound - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // sanity: transaction byte count must be monotonic
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied for this epoch: decode, persist, cache
      dout(10) << "handle_osd_map  got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental supplied: apply it to the previous epoch's full map
      dout(10) << "handle_osd_map  got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	// base map: either already on disk, or one we added in this loop
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid?  i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // debug hook: randomly pretend the CRC check failed
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// our reconstructed full map doesn't match the expected CRC; request
	// the real full map and stop ingesting at the previous epoch
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;

	// don't continue committing if we failed to enc the first inc map
	if (last < start) {
	  dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
	  m->put();
	  return;
	}
	break;
      }
      got_full_map(e);
      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->cluster_osdmap_trim_lower_bound,
	      last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    // persist the updated pg_num history alongside the maps
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // record new purged_snaps
  if (superblock.purged_snaps_last == start - 1) {
    OSDriver osdriver{store.get(), service.meta_ch, make_purged_snaps_oid()};
    SnapMapper::record_purged_snaps(
      cct,
      osdriver,
      osdriver.get_transaction(&t),
      purged_snaps);
    superblock.purged_snaps_last = last;
  } else {
    // a gap in the recorded range; skip rather than record out of order
    dout(10) << __func__ << " superblock purged_snaps_last is "
	     << superblock.purged_snaps_last
	     << ", not recording new purged_snaps" << dendl;
  }

  // superblock and commit; _committed_osd_maps() fires via C_OnMapCommit
  // once the transaction lands, and it releases the message ref
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8250
// Called (via C_OnMapCommit) once the transaction persisting maps
// [first..last] has committed.  Walks each newly committed epoch,
// updates peer up/down bookkeeping, publishes the final map, and then
// decides whether this OSD should keep running, restart its boot
// sequence, or shut down entirely.
//
// Locking: takes osd_lock for the whole call, and map_lock only around
// the map-advance / self-status evaluation section.  m is NOT consumed
// here; the caller owns the message reference.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check after acquiring osd_lock: shutdown may have started while
  // we were waiting for the lock.
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blocklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) &&   // in old map
          newmap->is_down(*p)) { // but not the new one
        // wait for in-flight map reservations once (before the first
        // note_down_osd) so we don't tear down a connection a reserved
        // map is still using.
        if (!waited_for_reservations) {
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared.  it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    // if this map marks us up at our current address, record the
    // up (and possibly boot) epochs.
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // booting -> active transition: the final map shows us up at the
  // address we bound after our last (re)bind.
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
                            // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // we were marked down, or one of our published addresses no
      // longer matches reality: log why, then rebind and restart.
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
        if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
          // note that this is best-effort...
          monc->send_mon_message(
            new MOSDMarkMeDead(
              monc->get_fsid(),
              whoami,
              osdmap->get_epoch()));
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        // too many markdowns within the grace period -> give up and
        // shut down instead of flapping.
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          derr << __func__ << " marked down "
               << osd_markdown_log.size()
               << " > osd_max_markdown_count "
               << cct->_conf->osd_max_markdown_count
               << " in last " << grace << " seconds, shutting down"
               << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_meesneger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          derr << __func__ << " marked down:"
               << " rebind cluster_messenger failed" << dendl;
        }

        hb_back_server_messenger->mark_down_all();
        hb_front_server_messenger->mark_down_all();
        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
    derr << "map says i am stopped by admin. shutting down." << dendl;
    do_shutdown = true;
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->cluster_osdmap_trim_lower_bound, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8509
// Align messenger feature requirements, the on-disk SHARDS compat bit,
// heartbeat authorizer requirements, and the persisted
// "require_osd_release" metadata with the currently published OSDMap.
// Called after each batch of maps is committed (_committed_osd_maps).
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures at their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    // clients: update the default policy's required features.
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // monitors: update the mon-specific policy.
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // peer OSDs: update the cluster messenger's OSD policy.
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // persist the SHARDS incompat feature in the superblock once;
    // committed synchronously via a dedicated transaction.
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // heartbeat authorizers are only required from nautilus onward.
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // mirror require_osd_release into the store's metadata so external
  // tooling can read it without an OSDMap.
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8583
11fdf7f2
TL
// Completion registered on the split transaction's on_applied list:
// once the transaction applies, hand the freshly created child PGs to
// OSD::_finish_splits() for initialization and queueing.
struct C_FinishSplits : public Context {
  OSD *osd;
  set<PGRef> pgs;  // split children; each PGRef keeps the child alive
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    osd->_finish_splits(pgs);
  }
};
8593
8594void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8595{
11fdf7f2
TL
8596 dout(10) << __func__ << " " << pgs << dendl;
8597 if (is_stopping())
8598 return;
11fdf7f2
TL
8599 for (set<PGRef>::iterator i = pgs.begin();
8600 i != pgs.end();
8601 ++i) {
8602 PG *pg = i->get();
7c673cae 8603
20effc67 8604 PeeringCtx rctx;
11fdf7f2
TL
8605 pg->lock();
8606 dout(10) << __func__ << " " << *pg << dendl;
8607 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8608 pg->handle_initialize(rctx);
11fdf7f2 8609 pg->queue_null(e, e);
9f95a23c 8610 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8611 pg->unlock();
7c673cae 8612
11fdf7f2
TL
8613 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8614 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8615 }
11fdf7f2
TL
8616};
8617
8618bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8619 unsigned need)
8620{
8621 std::lock_guard l(merge_lock);
8622 auto& p = merge_waiters[nextmap->get_epoch()][target];
8623 p[src->pg_id] = src;
8624 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8625 << " for " << target << ", have " << p.size() << "/" << need
8626 << dendl;
8627 return p.size() == need;
8628}
8629
// Advance a PG's view of the OSDMap, one epoch at a time, up to
// osd_epoch.  Along the way this detects and initiates PG merges
// (source and target sides) and PG splits driven by pg_num changes.
//
// Returns true if the PG is fully caught up; false when the PG was
// consumed as a merge source, or when a merge target must wait for its
// sources (the PG lock is released in those paths before returning).
//
// Preconditions: pg->lock() held by the caller; rctx collects peering
// messages and the transaction for dispatch.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map already trimmed from cache; skip ahead and let a later
      // epoch fill the gap.
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source: flush our state, detach from the
	  // shard, and park ourselves as a merge waiter on the target.
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  // if we were the last awaited source, wake the merge target
	  // with a null peering event.
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      // all sources are parked; claim them and drop the
	      // waiter bookkeeping for this epoch.
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    // sources not all ready yet: flush, unlock, and poke each
	    // source so it advances to the merge epoch; we'll be
	    // re-queued when the last source arrives.
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // normal advance: recompute mapping and feed the PG the new map.
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // if scrub intervals changed in the pool options, nudge the PG so
    // scrub scheduling is recomputed.
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is changed from set to unset or vice versa
      // the actual config is different.  Keep it simple even if it is
      // possible to call resched_all_scrub() unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // hand any split children created along the way to _finish_splits
  // once the transaction applies.
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8835
// Publish the current OSDMap to the rest of the OSD: prime pending
// splits/merges on every shard, drop stale pending creates, refresh the
// PG state counters, and queue a null peering event to every PG so it
// advances to the new epoch.  Requires osd_lock.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(20) << __func__ << " version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }
  service.pre_publish_map(osdmap);
  // wait for reserved map users before making the new map visible
  service.await_reserved_maps();
  service.publish_map(osdmap);
  dout(20) << "consume_map " << osdmap->get_epoch() << " -- publish done" << dendl;
  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending creates for PGs that no longer map to this OSD
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8952
8953void OSD::activate_map()
8954{
9f95a23c
TL
8955 ceph_assert(ceph_mutex_is_locked(osd_lock));
8956 auto osdmap = get_osdmap();
7c673cae
FG
8957
8958 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8959
7c673cae
FG
8960 // norecover?
8961 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8962 if (!service.recovery_is_paused()) {
8963 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8964 service.pause_recovery();
8965 }
8966 } else {
8967 if (service.recovery_is_paused()) {
8968 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8969 service.unpause_recovery();
8970 }
8971 }
8972
8973 service.activate_map();
7c673cae
FG
8974}
8975
8976bool OSD::require_mon_peer(const Message *m)
8977{
8978 if (!m->get_connection()->peer_is_mon()) {
8979 dout(0) << "require_mon_peer received from non-mon "
8980 << m->get_connection()->get_peer_addr()
8981 << " " << *m << dendl;
8982 return false;
8983 }
8984 return true;
8985}
8986
8987bool OSD::require_mon_or_mgr_peer(const Message *m)
8988{
8989 if (!m->get_connection()->peer_is_mon() &&
8990 !m->get_connection()->peer_is_mgr()) {
8991 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8992 << m->get_connection()->get_peer_addr()
8993 << " " << *m << dendl;
8994 return false;
8995 }
8996 return true;
8997}
8998
8999bool OSD::require_osd_peer(const Message *m)
9000{
9001 if (!m->get_connection()->peer_is_osd()) {
9002 dout(0) << "require_osd_peer received from non-osd "
9003 << m->get_connection()->get_peer_addr()
9004 << " " << *m << dendl;
9005 return false;
9006 }
9007 return true;
9008}
9009
7c673cae
FG
9010// ----------------------------------------
9011// pg creation
9012
// Split `parent` into the given child PGs (driven by a pg_num increase
// between curmap and nextmap).  Creates each child PG and its
// collection, moves the relevant objects/state across, and splits the
// parent's stats among parent and children.  Children are returned via
// out_pgs (locked/unlocked here; final wiring happens in
// _finish_splits once the transaction applies).
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  // updated_stats has one entry per child plus a final entry for the
  // parent; stat_iter walks it in lockstep with childpgids.
  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's collection commits to its owning shard's
      // context queue.
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pgpool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the remaining stats entry belongs to the parent itself.
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
9066
7c673cae
FG
9067// ----------------------------------------
9068// peering and recovery
9069
// Flush a PeeringCtx: send its accumulated peering messages to the
// target OSDs (skipping peers that are down or unreachable, and only
// when we ourselves are up and active), then queue its transaction on
// the PG's collection if there is anything to persist.
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // make sure the peer has (at least) our map before the messages
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      ls.clear();
    }
  }
  // queue the transaction even if it only carries contexts (no ops)
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9105
// Handle MOSDPGCreate2 from a monitor: for each requested PG, validate
// the accompanying history/past_intervals and enqueue a peering event
// that will instantiate the PG.  Consumes the message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      // octopus+ mons always send history/past_intervals; a missing
      // entry indicates a buggy or ancient sender, so skip the PG.
      clog->error() << __func__ << " " << pgid << " e" << created
		    << "@" << created_stamp
		    << " with no history or past_intervals"
		    << ", this should be impossible after octopus.  Ignoring.";
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " history " << q->second.first
	       << " pi " << q->second.second << dendl;
      // reject past_intervals whose upper bound disagrees with the
      // message epoch.
      if (!q->second.second.empty() &&
	  m->epoch < q->second.second.get_bounds().second) {
	clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
		      << " and unmatched past_intervals " << q->second.second
		      << " (history " << q->second.first << ")";
      } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      m->epoch,
	      m->epoch,
	      NullEvt(),
	      true,
	      new PGCreateInfo(
		pgid,
		m->epoch,
		q->second.first,
		q->second.second,
		true)
	      )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only bump last_pg_create_epoch once no mon-initiated creates
    // remain outstanding.
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9161
11fdf7f2 9162void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9163{
11fdf7f2
TL
9164 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9165 if (!require_osd_peer(m)) {
9166 m->put();
7c673cae
FG
9167 return;
9168 }
11fdf7f2
TL
9169 int from = m->get_source().num();
9170 for (auto& p : m->get_pg_list()) {
9f95a23c 9171 spg_t pgid(p.info.pgid.pgid, p.to);
11fdf7f2
TL
9172 enqueue_peering_evt(
9173 pgid,
9174 PGPeeringEventRef(
9175 std::make_shared<PGPeeringEvent>(
9f95a23c
TL
9176 p.epoch_sent,
9177 p.query_epoch,
11fdf7f2 9178 MNotifyRec(
9f95a23c
TL
9179 pgid, pg_shard_t(from, p.from),
9180 p,
9181 m->get_connection()->get_features()),
11fdf7f2
TL
9182 true,
9183 new PGCreateInfo(
9184 pgid,
9f95a23c
TL
9185 p.query_epoch,
9186 p.info.history,
9187 p.past_intervals,
11fdf7f2
TL
9188 false)
9189 )));
7c673cae 9190 }
11fdf7f2 9191 m->put();
7c673cae
FG
9192}
9193
11fdf7f2 9194void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9195{
11fdf7f2
TL
9196 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9197 if (!require_osd_peer(m)) {
9198 m->put();
7c673cae
FG
9199 return;
9200 }
11fdf7f2
TL
9201 int from = m->get_source().num();
9202 for (auto& p : m->pg_list) {
9203 enqueue_peering_evt(
9f95a23c 9204 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2 9205 PGPeeringEventRef(
20effc67
TL
9206 std::make_shared<PGPeeringEvent>(
9207 p.epoch_sent, p.query_epoch,
9208 MInfoRec(
9209 pg_shard_t(from, p.from),
9210 p.info,
9211 p.epoch_sent)))
11fdf7f2 9212 );
7c673cae 9213 }
11fdf7f2 9214 m->put();
7c673cae
FG
9215}
9216
11fdf7f2 9217void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9218{
11fdf7f2
TL
9219 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9220 if (!require_osd_peer(m)) {
9221 m->put();
7c673cae
FG
9222 return;
9223 }
11fdf7f2
TL
9224 for (auto& pgid : m->pg_list) {
9225 enqueue_peering_evt(
9226 pgid,
9227 PGPeeringEventRef(
9228 std::make_shared<PGPeeringEvent>(
9229 m->get_epoch(), m->get_epoch(),
9f95a23c 9230 PeeringState::DeleteStart())));
7c673cae 9231 }
11fdf7f2 9232 m->put();
7c673cae
FG
9233}
9234
11fdf7f2 9235void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9236{
11fdf7f2
TL
9237 dout(10) << __func__ << " " << *m << dendl;
9238 if (!require_mon_or_mgr_peer(m)) {
9239 m->put();
9240 return;
9241 }
9242 epoch_t epoch = get_osdmap_epoch();
9243 for (auto pgid : m->forced_pgs) {
9244 if (m->options & OFR_BACKFILL) {
9245 if (m->options & OFR_CANCEL) {
9246 enqueue_peering_evt(
9247 pgid,
9248 PGPeeringEventRef(
9249 std::make_shared<PGPeeringEvent>(
9250 epoch, epoch,
9f95a23c 9251 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9252 } else {
9253 enqueue_peering_evt(
9254 pgid,
9255 PGPeeringEventRef(
9256 std::make_shared<PGPeeringEvent>(
9257 epoch, epoch,
9f95a23c 9258 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9259 }
9260 } else if (m->options & OFR_RECOVERY) {
9261 if (m->options & OFR_CANCEL) {
9262 enqueue_peering_evt(
9263 pgid,
9264 PGPeeringEventRef(
9265 std::make_shared<PGPeeringEvent>(
9266 epoch, epoch,
9f95a23c 9267 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9268 } else {
9269 enqueue_peering_evt(
9270 pgid,
9271 PGPeeringEventRef(
9272 std::make_shared<PGPeeringEvent>(
9273 epoch, epoch,
9f95a23c 9274 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9275 }
9276 }
9277 }
11fdf7f2 9278 m->put();
c07f9fc5 9279}
7c673cae 9280
11fdf7f2 9281void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9282{
11fdf7f2
TL
9283 spg_t pgid = q.pgid;
9284 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9285
11fdf7f2
TL
9286 OSDMapRef osdmap = get_osdmap();
9287 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9288 return;
9289
11fdf7f2
TL
9290 dout(10) << " pg " << pgid << " dne" << dendl;
9291 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9292 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9293 if (con) {
9294 Message *m;
9295 if (q.query.type == pg_query_t::LOG ||
9296 q.query.type == pg_query_t::FULLLOG) {
9297 m = new MOSDPGLog(
9298 q.query.from, q.query.to,
9299 osdmap->get_epoch(), empty,
9300 q.query.epoch_sent);
7c673cae 9301 } else {
20effc67
TL
9302 pg_notify_t notify{q.query.from, q.query.to,
9303 q.query.epoch_sent,
9304 osdmap->get_epoch(),
9305 empty,
9306 PastIntervals()};
9307 m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
9308 std::move(notify));
7c673cae 9309 }
9f95a23c 9310 service.maybe_share_map(con.get(), osdmap);
11fdf7f2 9311 con->send_message(m);
7c673cae
FG
9312 }
9313}
9314
9f95a23c
TL
9315void OSDService::queue_check_readable(spg_t spgid,
9316 epoch_t lpr,
9317 ceph::signedspan delay)
9318{
9319 if (delay == ceph::signedspan::zero()) {
9320 osd->enqueue_peering_evt(
9321 spgid,
9322 PGPeeringEventRef(
9323 std::make_shared<PGPeeringEvent>(
9324 lpr, lpr,
9325 PeeringState::CheckReadable())));
9326 } else {
9327 mono_timer.add_event(
9328 delay,
9329 [this, spgid, lpr]() {
9330 queue_check_readable(spgid, lpr);
9331 });
9332 }
9333}
9334
7c673cae 9335
7c673cae
FG
9336// =========================================================
9337// RECOVERY
9338
9339void OSDService::_maybe_queue_recovery() {
9f95a23c 9340 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
7c673cae
FG
9341 uint64_t available_pushes;
9342 while (!awaiting_throttle.empty() &&
9343 _recover_now(&available_pushes)) {
11fdf7f2 9344 uint64_t to_start = std::min(
7c673cae
FG
9345 available_pushes,
9346 cct->_conf->osd_recovery_max_single_start);
9347 _queue_for_recovery(awaiting_throttle.front(), to_start);
9348 awaiting_throttle.pop_front();
11fdf7f2
TL
9349 dout(10) << __func__ << " starting " << to_start
9350 << ", recovery_ops_reserved " << recovery_ops_reserved
9351 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9352 recovery_ops_reserved += to_start;
9353 }
9354}
9355
9356bool OSDService::_recover_now(uint64_t *available_pushes)
9357{
9358 if (available_pushes)
9359 *available_pushes = 0;
9360
9361 if (ceph_clock_now() < defer_recovery_until) {
9362 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9363 return false;
9364 }
9365
9366 if (recovery_paused) {
9367 dout(15) << __func__ << " paused" << dendl;
9368 return false;
9369 }
9370
9f95a23c 9371 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9372 if (max <= recovery_ops_active + recovery_ops_reserved) {
9373 dout(15) << __func__ << " active " << recovery_ops_active
9374 << " + reserved " << recovery_ops_reserved
9375 << " >= max " << max << dendl;
9376 return false;
9377 }
9378
9379 if (available_pushes)
9380 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9381
9382 return true;
9383}
9384
9f95a23c
TL
9385unsigned OSDService::get_target_pg_log_entries() const
9386{
9387 auto num_pgs = osd->get_num_pgs();
9388 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9389 if (num_pgs > 0 && target > 0) {
9390 // target an even spread of our budgeted log entries across all
9391 // PGs. note that while we only get to control the entry count
9392 // for primary PGs, we'll normally be responsible for a mix of
9393 // primary and replica PGs (for the same pool(s) even), so this
9394 // will work out.
9395 return std::max<unsigned>(
9396 std::min<unsigned>(target / num_pgs,
9397 cct->_conf->osd_max_pg_log_entries),
9398 cct->_conf->osd_min_pg_log_entries);
9399 } else {
9400 // fall back to a per-pg value.
9401 return cct->_conf->osd_min_pg_log_entries;
9402 }
9403}
9404
7c673cae 9405void OSD::do_recovery(
1e59de90 9406 PG *pg, epoch_t queued, uint64_t reserved_pushes, int priority,
7c673cae
FG
9407 ThreadPool::TPHandle &handle)
9408{
9409 uint64_t started = 0;
31f18b77
FG
9410
9411 /*
9412 * When the value of osd_recovery_sleep is set greater than zero, recovery
9413 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9414 * recovery event's schedule time. This is done by adding a
9415 * recovery_requeue_callback event, which re-queues the recovery op using
9416 * queue_recovery_after_sleep.
9417 */
c07f9fc5 9418 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9419 {
11fdf7f2 9420 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9421 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9422 PGRef pgref(pg);
1e59de90
TL
9423 auto recovery_requeue_callback = new LambdaContext(
9424 [this, pgref, queued, reserved_pushes, priority](int r) {
b32b8144
FG
9425 dout(20) << "do_recovery wake up at "
9426 << ceph_clock_now()
9427 << ", re-queuing recovery" << dendl;
11fdf7f2 9428 std::lock_guard l(service.sleep_lock);
b32b8144 9429 service.recovery_needs_sleep = false;
1e59de90 9430 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes, priority);
b32b8144
FG
9431 });
9432
9433 // This is true for the first recovery op and when the previous recovery op
9434 // has been scheduled in the past. The next recovery op is scheduled after
9435 // completing the sleep from now.
f67539c2 9436
9f95a23c
TL
9437 if (auto now = ceph::real_clock::now();
9438 service.recovery_schedule_time < now) {
9439 service.recovery_schedule_time = now;
b32b8144 9440 }
9f95a23c 9441 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
11fdf7f2 9442 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9f95a23c 9443 recovery_requeue_callback);
b32b8144
FG
9444 dout(20) << "Recovery event scheduled at "
9445 << service.recovery_schedule_time << dendl;
9446 return;
9447 }
7c673cae
FG
9448 }
9449
9450 {
b32b8144 9451 {
11fdf7f2 9452 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9453 service.recovery_needs_sleep = true;
9454 }
9455
7c673cae
FG
9456 if (pg->pg_has_reset_since(queued)) {
9457 goto out;
9458 }
9459
7c673cae
FG
9460 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9461#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9462 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9463#endif
9464
11fdf7f2 9465 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
f67539c2 9466 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
7c673cae
FG
9467 << " on " << *pg << dendl;
9468
11fdf7f2 9469 if (do_unfound) {
20effc67 9470 PeeringCtx rctx;
11fdf7f2 9471 rctx.handle = &handle;
9f95a23c 9472 pg->find_unfound(queued, rctx);
11fdf7f2 9473 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9474 }
7c673cae
FG
9475 }
9476
9477 out:
11fdf7f2 9478 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9479 service.release_reserved_pushes(reserved_pushes);
9480}
9481
9482void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9483{
11fdf7f2 9484 std::lock_guard l(recovery_lock);
7c673cae
FG
9485 dout(10) << "start_recovery_op " << *pg << " " << soid
9486 << " (" << recovery_ops_active << "/"
9f95a23c 9487 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9488 << dendl;
9489 recovery_ops_active++;
9490
9491#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9492 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9493 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9494 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9495#endif
9496}
9497
9498void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9499{
11fdf7f2 9500 std::lock_guard l(recovery_lock);
7c673cae
FG
9501 dout(10) << "finish_recovery_op " << *pg << " " << soid
9502 << " dequeue=" << dequeue
9f95a23c
TL
9503 << " (" << recovery_ops_active << "/"
9504 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9505 << dendl;
9506
9507 // adjust count
11fdf7f2 9508 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9509 recovery_ops_active--;
9510
9511#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9512 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9513 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9514 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9515#endif
9516
9517 _maybe_queue_recovery();
9518}
9519
9520bool OSDService::is_recovery_active()
9521{
eafe8130
TL
9522 if (cct->_conf->osd_debug_pretend_recovery_active) {
9523 return true;
9524 }
b5b8bbf5 9525 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9526}
9527
11fdf7f2
TL
9528void OSDService::release_reserved_pushes(uint64_t pushes)
9529{
9530 std::lock_guard l(recovery_lock);
9531 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9532 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9533 << dendl;
9534 ceph_assert(recovery_ops_reserved >= pushes);
9535 recovery_ops_reserved -= pushes;
9536 _maybe_queue_recovery();
9537}
9538
7c673cae
FG
9539// =========================================================
9540// OPS
9541
9542bool OSD::op_is_discardable(const MOSDOp *op)
9543{
9544 // drop client request if they are not connected and can't get the
9545 // reply anyway.
9546 if (!op->get_connection()->is_connected()) {
9547 return true;
9548 }
9549 return false;
9550}
9551
11fdf7f2 9552void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9553{
11fdf7f2
TL
9554 const utime_t stamp = op->get_req()->get_recv_stamp();
9555 const utime_t latency = ceph_clock_now() - stamp;
9556 const unsigned priority = op->get_req()->get_priority();
9557 const int cost = op->get_req()->get_cost();
9558 const uint64_t owner = op->get_req()->get_source().num();
f67539c2 9559 const int type = op->get_req()->get_type();
11fdf7f2 9560
1e59de90 9561 dout(15) << "enqueue_op " << *op->get_req() << " prio " << priority
f67539c2 9562 << " type " << type
11fdf7f2 9563 << " cost " << cost
7c673cae
FG
9564 << " latency " << latency
9565 << " epoch " << epoch
9566 << " " << *(op->get_req()) << dendl;
9567 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9568 op->osd_trace.keyval("priority", priority);
9569 op->osd_trace.keyval("cost", cost);
20effc67
TL
9570
9571 auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
9572 enqueue_span->AddEvent(__func__, {
9573 {"priority", priority},
9574 {"cost", cost},
9575 {"epoch", epoch},
9576 {"owner", owner},
9577 {"type", type}
9578 });
9579
7c673cae 9580 op->mark_queued_for_pg();
224ce89b 9581 logger->tinc(l_osd_op_before_queue_op_lat, latency);
1e59de90 9582 if (PGRecoveryMsg::is_recovery_msg(op)) {
f67539c2
TL
9583 op_shardedwq.queue(
9584 OpSchedulerItem(
9585 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9586 cost, priority, stamp, owner, epoch));
9587 } else {
9588 op_shardedwq.queue(
9589 OpSchedulerItem(
9590 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9591 cost, priority, stamp, owner, epoch));
9592 }
7c673cae
FG
9593}
9594
11fdf7f2
TL
9595void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9596{
9597 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9598 op_shardedwq.queue(
9f95a23c
TL
9599 OpSchedulerItem(
9600 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
11fdf7f2
TL
9601 10,
9602 cct->_conf->osd_peering_op_priority,
9603 utime_t(),
9604 0,
9605 evt->get_epoch_sent()));
9606}
7c673cae
FG
9607
9608/*
9609 * NOTE: dequeue called in worker thread, with pg lock
9610 */
9611void OSD::dequeue_op(
9612 PGRef pg, OpRequestRef op,
9613 ThreadPool::TPHandle &handle)
9614{
9f95a23c
TL
9615 const Message *m = op->get_req();
9616
11fdf7f2 9617 FUNCTRACE(cct);
9f95a23c 9618 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
7c673cae
FG
9619
9620 utime_t now = ceph_clock_now();
9621 op->set_dequeued_time(now);
9f95a23c
TL
9622
9623 utime_t latency = now - m->get_recv_stamp();
1e59de90
TL
9624 dout(10) << "dequeue_op " << *op->get_req()
9625 << " prio " << m->get_priority()
9f95a23c 9626 << " cost " << m->get_cost()
7c673cae 9627 << " latency " << latency
9f95a23c 9628 << " " << *m
7c673cae
FG
9629 << " pg " << *pg << dendl;
9630
224ce89b
WB
9631 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9632
9f95a23c
TL
9633 service.maybe_share_map(m->get_connection().get(),
9634 pg->get_osdmap(),
9635 op->sent_epoch);
7c673cae 9636
11fdf7f2 9637 if (pg->is_deleting())
7c673cae
FG
9638 return;
9639
9640 op->mark_reached_pg();
9641 op->osd_trace.event("dequeue_op");
9642
9643 pg->do_request(op, handle);
9644
9645 // finish
1e59de90 9646 dout(10) << "dequeue_op " << *op->get_req() << " finish" << dendl;
9f95a23c 9647 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
7c673cae
FG
9648}
9649
9650
11fdf7f2
TL
9651void OSD::dequeue_peering_evt(
9652 OSDShard *sdata,
9653 PG *pg,
9654 PGPeeringEventRef evt,
9655 ThreadPool::TPHandle& handle)
7c673cae 9656{
11fdf7f2 9657 auto curmap = sdata->get_osdmap();
9f95a23c
TL
9658 bool need_up_thru = false;
9659 epoch_t same_interval_since = 0;
11fdf7f2
TL
9660 if (!pg) {
9661 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9662 handle_pg_query_nopg(*q);
7c673cae 9663 } else {
11fdf7f2
TL
9664 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9665 ceph_abort();
9666 }
20effc67
TL
9667 } else if (PeeringCtx rctx;
9668 advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9f95a23c 9669 pg->do_peering_event(evt, rctx);
11fdf7f2 9670 if (pg->is_deleted()) {
11fdf7f2
TL
9671 pg->unlock();
9672 return;
7c673cae 9673 }
9f95a23c 9674 dispatch_context(rctx, pg, curmap, &handle);
11fdf7f2
TL
9675 need_up_thru = pg->get_need_up_thru();
9676 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9677 pg->unlock();
9678 }
11fdf7f2
TL
9679
9680 if (need_up_thru) {
7c673cae 9681 queue_want_up_thru(same_interval_since);
11fdf7f2 9682 }
7c673cae
FG
9683
9684 service.send_pg_temp();
9685}
9686
11fdf7f2
TL
9687void OSD::dequeue_delete(
9688 OSDShard *sdata,
9689 PG *pg,
9690 epoch_t e,
9691 ThreadPool::TPHandle& handle)
9692{
9693 dequeue_peering_evt(
9694 sdata,
9695 pg,
9696 PGPeeringEventRef(
9697 std::make_shared<PGPeeringEvent>(
9698 e, e,
9f95a23c 9699 PeeringState::DeleteSome())),
11fdf7f2
TL
9700 handle);
9701}
9702
9703
9704
7c673cae
FG
9705// --------------------------------
9706
9707const char** OSD::get_tracked_conf_keys() const
9708{
9709 static const char* KEYS[] = {
9710 "osd_max_backfills",
9711 "osd_min_recovery_priority",
224ce89b
WB
9712 "osd_max_trimming_pgs",
9713 "osd_op_complaint_time",
9714 "osd_op_log_threshold",
9715 "osd_op_history_size",
9716 "osd_op_history_duration",
9717 "osd_op_history_slow_op_size",
9718 "osd_op_history_slow_op_threshold",
7c673cae
FG
9719 "osd_enable_op_tracker",
9720 "osd_map_cache_size",
11fdf7f2 9721 "osd_pg_epoch_max_lag_factor",
7c673cae 9722 "osd_pg_epoch_persisted_max_stale",
f67539c2
TL
9723 "osd_recovery_sleep",
9724 "osd_recovery_sleep_hdd",
9725 "osd_recovery_sleep_ssd",
9726 "osd_recovery_sleep_hybrid",
b3b6e05e
TL
9727 "osd_delete_sleep",
9728 "osd_delete_sleep_hdd",
9729 "osd_delete_sleep_ssd",
9730 "osd_delete_sleep_hybrid",
9731 "osd_snap_trim_sleep",
9732 "osd_snap_trim_sleep_hdd",
9733 "osd_snap_trim_sleep_ssd",
20effc67 9734 "osd_snap_trim_sleep_hybrid",
b3b6e05e 9735 "osd_scrub_sleep",
f67539c2
TL
9736 "osd_recovery_max_active",
9737 "osd_recovery_max_active_hdd",
9738 "osd_recovery_max_active_ssd",
7c673cae
FG
9739 // clog & admin clog
9740 "clog_to_monitors",
9741 "clog_to_syslog",
9742 "clog_to_syslog_facility",
9743 "clog_to_syslog_level",
9744 "osd_objectstore_fuse",
9745 "clog_to_graylog",
9746 "clog_to_graylog_host",
9747 "clog_to_graylog_port",
9748 "host",
9749 "fsid",
9750 "osd_recovery_delay_start",
9751 "osd_client_message_size_cap",
9752 "osd_client_message_cap",
31f18b77
FG
9753 "osd_heartbeat_min_size",
9754 "osd_heartbeat_interval",
9f95a23c 9755 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9756 "osd_scrub_min_interval",
9757 "osd_scrub_max_interval",
7c673cae
FG
9758 NULL
9759 };
9760 return KEYS;
9761}
9762
11fdf7f2 9763void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9764 const std::set <std::string> &changed)
9765{
9f95a23c 9766 std::lock_guard l{osd_lock};
f67539c2
TL
9767
9768 if (changed.count("osd_max_backfills") ||
39ae355f
TL
9769 changed.count("osd_recovery_max_active") ||
9770 changed.count("osd_recovery_max_active_hdd") ||
9771 changed.count("osd_recovery_max_active_ssd")) {
9772 if (!maybe_override_options_for_qos(&changed) &&
9773 changed.count("osd_max_backfills")) {
9774 // Scheduler is not "mclock". Fallback to earlier behavior
9775 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9776 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9777 }
9778 }
9779 if (changed.count("osd_delete_sleep") ||
f67539c2
TL
9780 changed.count("osd_delete_sleep_hdd") ||
9781 changed.count("osd_delete_sleep_ssd") ||
9782 changed.count("osd_delete_sleep_hybrid") ||
9783 changed.count("osd_snap_trim_sleep") ||
9784 changed.count("osd_snap_trim_sleep_hdd") ||
9785 changed.count("osd_snap_trim_sleep_ssd") ||
9786 changed.count("osd_snap_trim_sleep_hybrid") ||
9787 changed.count("osd_scrub_sleep") ||
9788 changed.count("osd_recovery_sleep") ||
9789 changed.count("osd_recovery_sleep_hdd") ||
9790 changed.count("osd_recovery_sleep_ssd") ||
39ae355f
TL
9791 changed.count("osd_recovery_sleep_hybrid")) {
9792 maybe_override_sleep_options_for_qos();
7c673cae 9793 }
1e59de90
TL
9794 if (changed.count("osd_pg_delete_cost")) {
9795 maybe_override_cost_for_qos();
9796 }
7c673cae
FG
9797 if (changed.count("osd_min_recovery_priority")) {
9798 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9799 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9800 }
9801 if (changed.count("osd_max_trimming_pgs")) {
9802 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9803 }
9804 if (changed.count("osd_op_complaint_time") ||
9805 changed.count("osd_op_log_threshold")) {
9806 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9807 cct->_conf->osd_op_log_threshold);
9808 }
9809 if (changed.count("osd_op_history_size") ||
9810 changed.count("osd_op_history_duration")) {
9811 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9812 cct->_conf->osd_op_history_duration);
9813 }
9814 if (changed.count("osd_op_history_slow_op_size") ||
9815 changed.count("osd_op_history_slow_op_threshold")) {
9816 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9817 cct->_conf->osd_op_history_slow_op_threshold);
9818 }
9819 if (changed.count("osd_enable_op_tracker")) {
9820 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9821 }
7c673cae
FG
9822 if (changed.count("osd_map_cache_size")) {
9823 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9824 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9825 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9826 }
9827 if (changed.count("clog_to_monitors") ||
9828 changed.count("clog_to_syslog") ||
9829 changed.count("clog_to_syslog_level") ||
9830 changed.count("clog_to_syslog_facility") ||
9831 changed.count("clog_to_graylog") ||
9832 changed.count("clog_to_graylog_host") ||
9833 changed.count("clog_to_graylog_port") ||
9834 changed.count("host") ||
9835 changed.count("fsid")) {
9836 update_log_config();
9837 }
11fdf7f2
TL
9838 if (changed.count("osd_pg_epoch_max_lag_factor")) {
9839 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
9840 "osd_pg_epoch_max_lag_factor");
9841 }
7c673cae
FG
9842
9843#ifdef HAVE_LIBFUSE
9844 if (changed.count("osd_objectstore_fuse")) {
9845 if (store) {
9846 enable_disable_fuse(false);
9847 }
9848 }
9849#endif
9850
9851 if (changed.count("osd_recovery_delay_start")) {
9852 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9853 service.kick_recovery_queue();
9854 }
9855
9856 if (changed.count("osd_client_message_cap")) {
9857 uint64_t newval = cct->_conf->osd_client_message_cap;
9858 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
20effc67 9859 if (pol.throttler_messages) {
7c673cae
FG
9860 pol.throttler_messages->reset_max(newval);
9861 }
9862 }
9863 if (changed.count("osd_client_message_size_cap")) {
9864 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9865 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
20effc67 9866 if (pol.throttler_bytes) {
7c673cae
FG
9867 pol.throttler_bytes->reset_max(newval);
9868 }
9869 }
9f95a23c
TL
9870 if (changed.count("osd_object_clean_region_max_num_intervals")) {
9871 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
9872 }
7c673cae 9873
494da23a
TL
9874 if (changed.count("osd_scrub_min_interval") ||
9875 changed.count("osd_scrub_max_interval")) {
9876 resched_all_scrubs();
9877 dout(0) << __func__ << ": scrub interval change" << dendl;
9878 }
7c673cae 9879 check_config();
f67539c2
TL
9880 if (changed.count("osd_asio_thread_count")) {
9881 service.poolctx.stop();
9882 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
9883 }
7c673cae
FG
9884}
9885
a4b75251
TL
9886void OSD::maybe_override_max_osd_capacity_for_qos()
9887{
9888 // If the scheduler enabled is mclock, override the default
9889 // osd capacity with the value obtained from running the
9890 // osd bench test. This is later used to setup mclock.
9891 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
20effc67
TL
9892 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
9893 (!unsupported_objstore_for_qos())) {
a4b75251
TL
9894 std::string max_capacity_iops_config;
9895 bool force_run_benchmark =
9896 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
9897
9898 if (store_is_rotational) {
9899 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
9900 } else {
9901 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
9902 }
9903
39ae355f
TL
9904 double default_iops = 0.0;
9905 double cur_iops = 0.0;
a4b75251 9906 if (!force_run_benchmark) {
a4b75251 9907 // Get the current osd iops capacity
39ae355f 9908 cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
a4b75251
TL
9909
9910 // Get the default max iops capacity
9911 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
9912 if (!val.has_value()) {
9913 derr << __func__ << " Unable to determine default value of "
9914 << max_capacity_iops_config << dendl;
9915 // Cannot determine default iops. Force a run of the OSD benchmark.
9916 force_run_benchmark = true;
9917 } else {
9918 // Default iops
9919 default_iops = std::stod(val.value());
9920 }
9921
9922 // Determine if we really need to run the osd benchmark
9923 if (!force_run_benchmark && (default_iops != cur_iops)) {
9924 dout(1) << __func__ << std::fixed << std::setprecision(2)
9925 << " default_iops: " << default_iops
9926 << " cur_iops: " << cur_iops
9927 << ". Skip OSD benchmark test." << dendl;
9928 return;
9929 }
9930 }
9931
9932 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
9933 int64_t count = 12288000; // Count of bytes to write
9934 int64_t bsize = 4096; // Block size
9935 int64_t osize = 4194304; // Object size
9936 int64_t onum = 100; // Count of objects to write
9937 double elapsed = 0.0; // Time taken to complete the test
9938 double iops = 0.0;
9939 stringstream ss;
9940 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
9941 if (ret != 0) {
9942 derr << __func__
9943 << " osd bench err: " << ret
9944 << " osd bench errstr: " << ss.str()
9945 << dendl;
9946 return;
9947 }
9948
9949 double rate = count / elapsed;
9950 iops = rate / bsize;
9951 dout(1) << __func__
9952 << " osd bench result -"
9953 << std::fixed << std::setprecision(3)
9954 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
9955 << " iops: " << iops
9956 << " elapsed_sec: " << elapsed
9957 << dendl;
9958
39ae355f
TL
9959 // Get the threshold IOPS set for the underlying hdd/ssd.
9960 double threshold_iops = 0.0;
9961 if (store_is_rotational) {
9962 threshold_iops = cct->_conf.get_val<double>(
9963 "osd_mclock_iops_capacity_threshold_hdd");
9964 } else {
9965 threshold_iops = cct->_conf.get_val<double>(
9966 "osd_mclock_iops_capacity_threshold_ssd");
9967 }
9968
9969 // Persist the iops value to the MON store or throw cluster warning
9970 // if the measured iops exceeds the set threshold. If the iops exceed
9971 // the threshold, the default value is used.
9972 if (iops > threshold_iops) {
9973 clog->warn() << "OSD bench result of " << std::to_string(iops)
9974 << " IOPS exceeded the threshold limit of "
9975 << std::to_string(threshold_iops) << " IOPS for osd."
9976 << std::to_string(whoami) << ". IOPS capacity is unchanged"
9977 << " at " << std::to_string(cur_iops) << " IOPS. The"
9978 << " recommendation is to establish the osd's IOPS capacity"
9979 << " using other benchmark tools (e.g. Fio) and then"
9980 << " override osd_mclock_max_capacity_iops_[hdd|ssd].";
9981 } else {
9982 mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
a4b75251 9983 }
39ae355f
TL
9984 }
9985}
a4b75251 9986
39ae355f
TL
9987bool OSD::maybe_override_options_for_qos(const std::set<std::string> *changed)
9988{
9989 // Override options only if the scheduler enabled is mclock and the
9990 // underlying objectstore is supported by mclock
9991 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
9992 !unsupported_objstore_for_qos()) {
9993 static const std::map<std::string, uint64_t> recovery_qos_defaults {
9994 {"osd_recovery_max_active", 0},
1e59de90
TL
9995 {"osd_recovery_max_active_hdd", 3},
9996 {"osd_recovery_max_active_ssd", 10},
9997 {"osd_max_backfills", 1},
39ae355f
TL
9998 };
9999
10000 // Check if we were called because of a configuration change
10001 if (changed != nullptr) {
10002 if (cct->_conf.get_val<bool>("osd_mclock_override_recovery_settings")) {
10003 if (changed->count("osd_max_backfills")) {
10004 dout(1) << __func__ << " Set local and remote max backfills to "
10005 << cct->_conf->osd_max_backfills << dendl;
10006 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
10007 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
10008 }
10009 } else {
10010 // Recovery options change was attempted without setting
10011 // the 'osd_mclock_override_recovery_settings' option.
10012 // Find the key to remove from the configuration db.
10013 std::string key;
10014 if (changed->count("osd_max_backfills")) {
10015 key = "osd_max_backfills";
10016 } else if (changed->count("osd_recovery_max_active")) {
10017 key = "osd_recovery_max_active";
10018 } else if (changed->count("osd_recovery_max_active_hdd")) {
10019 key = "osd_recovery_max_active_hdd";
10020 } else if (changed->count("osd_recovery_max_active_ssd")) {
10021 key = "osd_recovery_max_active_ssd";
10022 } else {
10023 // No key that we are interested in. Return.
10024 return true;
10025 }
10026
10027 // Remove the current entry from the configuration if
10028 // different from its default value.
10029 auto val = recovery_qos_defaults.find(key);
10030 if (val != recovery_qos_defaults.end() &&
10031 cct->_conf.get_val<uint64_t>(key) != val->second) {
10032 static const std::vector<std::string> osds = {
10033 "osd",
10034 "osd." + std::to_string(whoami)
10035 };
10036
10037 for (auto osd : osds) {
10038 std::string cmd =
10039 "{"
10040 "\"prefix\": \"config rm\", "
10041 "\"who\": \"" + osd + "\", "
10042 "\"name\": \"" + key + "\""
10043 "}";
10044 vector<std::string> vcmd{cmd};
10045
10046 dout(1) << __func__ << " Removing Key: " << key
10047 << " for " << osd << " from Mon db" << dendl;
10048 monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
10049 }
10050
10051 // Raise a cluster warning indicating that the changes did not
10052 // take effect and indicate the reason why.
10053 clog->warn() << "Change to " << key << " on osd."
10054 << std::to_string(whoami) << " did not take effect."
10055 << " Enable osd_mclock_override_recovery_settings before"
10056 << " setting this option.";
10057 }
10058 }
10059 } else { // if (changed != nullptr) (osd boot-up)
1e59de90
TL
10060 /**
10061 * This section is executed only during osd boot-up.
10062 * Override the default recovery max active (hdd & ssd) and max backfills
10063 * config options to either the mClock defaults or retain their respective
10064 * overridden values before the osd was restarted.
10065 */
39ae355f 10066 for (auto opt : recovery_qos_defaults) {
1e59de90
TL
10067 /**
10068 * Note: set_val_default doesn't overwrite an option if it was earlier
10069 * set at a config level greater than CONF_DEFAULT. It doesn't return
10070 * a status. With get_val(), the config subsystem is guaranteed to
10071 * either return the overridden value (if any) or the default value.
10072 */
39ae355f 10073 cct->_conf.set_val_default(opt.first, std::to_string(opt.second));
1e59de90
TL
10074 auto opt_val = cct->_conf.get_val<uint64_t>(opt.first);
10075 dout(1) << __func__ << " "
10076 << opt.first << " set to " << opt_val
10077 << dendl;
39ae355f 10078 if (opt.first == "osd_max_backfills") {
1e59de90
TL
10079 service.local_reserver.set_max(opt_val);
10080 service.remote_reserver.set_max(opt_val);
39ae355f 10081 }
39ae355f 10082 }
a4b75251 10083 }
39ae355f 10084 return true;
a4b75251 10085 }
39ae355f 10086 return false;
a4b75251
TL
10087}
10088
39ae355f 10089void OSD::maybe_override_sleep_options_for_qos()
b3b6e05e 10090{
39ae355f
TL
10091 // Override options only if the scheduler enabled is mclock and the
10092 // underlying objectstore is supported by mclock
20effc67
TL
10093 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10094 !unsupported_objstore_for_qos()) {
b3b6e05e 10095
39ae355f 10096 // Override the various sleep settings
b3b6e05e
TL
10097 // Disable recovery sleep
10098 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10099 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10100 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10101 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10102
10103 // Disable delete sleep
10104 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10105 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10106 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10107 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10108
10109 // Disable snap trim sleep
10110 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10111 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10112 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10113 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10114
10115 // Disable scrub sleep
10116 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
b3b6e05e 10117 }
b3b6e05e
TL
10118}
10119
1e59de90
TL
10120void OSD::maybe_override_cost_for_qos()
10121{
10122 // If the scheduler enabled is mclock, override the default PG deletion cost
10123 // so that mclock can meet the QoS goals.
10124 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10125 !unsupported_objstore_for_qos()) {
10126 uint64_t pg_delete_cost = 15728640;
10127 cct->_conf.set_val("osd_pg_delete_cost", std::to_string(pg_delete_cost));
10128 }
10129}
10130
39ae355f
TL
10131/**
10132 * A context for receiving status from a background mon command to set
10133 * a config option and optionally apply the changes on each op shard.
10134 */
10135class MonCmdSetConfigOnFinish : public Context {
10136 OSD *osd;
10137 CephContext *cct;
10138 std::string key;
10139 std::string val;
10140 bool update_shard;
10141public:
10142 explicit MonCmdSetConfigOnFinish(
10143 OSD *o,
10144 CephContext *cct,
10145 const std::string &k,
10146 const std::string &v,
10147 const bool s)
10148 : osd(o), cct(cct), key(k), val(v), update_shard(s) {}
10149 void finish(int r) override {
10150 if (r != 0) {
10151 // Fallback to setting the config within the in-memory "values" map.
10152 cct->_conf.set_val_default(key, val);
10153 }
10154
10155 // If requested, apply this option on the
10156 // active scheduler of each op shard.
10157 if (update_shard) {
10158 for (auto& shard : osd->shards) {
10159 shard->update_scheduler_config();
10160 }
10161 }
10162 }
10163};
10164
// Asynchronously persist a config option for this OSD via a
// "config set" mon command; on failure the MonCmdSetConfigOnFinish
// context falls back to an in-memory default.
void OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
{
  // Build the mon command JSON by hand, targeting only this OSD.
  std::string cmd =
    "{"
      "\"prefix\": \"config set\", "
      "\"who\": \"osd." + std::to_string(whoami) + "\", "
      "\"name\": \"" + key + "\", "
      "\"value\": \"" + val + "\""
    "}";
  vector<std::string> vcmd{cmd};

  // List of config options to be distributed across each op shard.
  // Currently limited to a couple of mClock options.
  static const std::vector<std::string> shard_option =
    { "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd" };
  const bool update_shard = std::find(shard_option.begin(),
                                      shard_option.end(),
                                      key) != shard_option.end();

  // on_finish is owned by the mon client machinery and deleted after finish().
  auto on_finish = new MonCmdSetConfigOnFinish(this, cct, key,
                                               val, update_shard);
  dout(10) << __func__ << " Set " << key << " = " << val << dendl;
  monc->start_mon_command(vcmd, {}, nullptr, nullptr, on_finish);
}
10189
20effc67
TL
10190bool OSD::unsupported_objstore_for_qos()
10191{
10192 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10193 return std::find(unsupported_objstores.begin(),
10194 unsupported_objstores.end(),
10195 store->get_type()) != unsupported_objstores.end();
10196}
10197
7c673cae
FG
// Re-parse the cluster-log (clog) client options from the current
// configuration and report the resulting log_to_monitors setting.
void OSD::update_log_config()
{
  auto parsed_options = clog->parse_client_options(cct);
  derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
}
10203
// Emit cluster-log warnings for config combinations that are legal but
// likely misconfigured.  Warnings only; nothing is changed here.
void OSD::check_config()
{
  // some sanity checks
  // NOTE(review): the condition uses "+ 2" headroom but the warning text
  // only says "is not >" — confirm whether the message should mention it.
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  // A negative interval count is meaningless; warn so the operator fixes it.
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
                 << cct->_conf->osd_object_clean_region_max_num_intervals
                 << ") is < 0";
  }
}
10218
7c673cae
FG
10219// --------------------------------
10220
// Block the calling thread until the objecter has fetched the latest
// osdmap from the cluster.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  // NOTE(review): ec is populated by the blocked completion but never
  // inspected — a fetch failure is silently ignored here; confirm intended.
  boost::system::error_code ec;
  service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);

  dout(10) << __func__ << " -- finish" << dendl;
}
10230
10231// --------------------------------
10232
9f95a23c
TL
// Install the mgr-provided set of dynamic perf-metric queries: filter out
// unsupported ones, publish the result under m_perf_queries_lock, then push
// the supported queries down into every PG.
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  // Only queries with a non-empty key descriptor are supported.
  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }
  {
    // Publish the new query set/limits before touching any PG.
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  // Propagate to each PG under its own lock (not under m_perf_queries_lock).
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
10261
9f95a23c
TL
// Collect dynamic perf stats from every PG, merge them, and return the
// per-query reports for the mgr.
MetricPayload OSD::get_perf_reports() {
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by the mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
224ce89b 10285
7c673cae
FG
10286// =============================================================
10287
10288#undef dout_context
11fdf7f2 10289#define dout_context cct
7c673cae 10290#undef dout_prefix
11fdf7f2 10291#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10292
11fdf7f2 10293void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 10294{
11fdf7f2
TL
10295 dout(10) << pg->pg_id << " " << pg << dendl;
10296 slot->pg = pg;
10297 pg->osd_shard = this;
10298 pg->pg_slot = slot;
10299 osd->inc_num_pgs();
10300
10301 slot->epoch = pg->get_osdmap_epoch();
10302 pg_slots_by_epoch.insert(*slot);
10303}
10304
// Unbind a PG from its shard slot (caller holds shard_lock).  Reverses
// _attach_pg: clears both links, drops the OSD's PG count, and removes the
// slot from the by-epoch index, waking any min-epoch waiters since the
// shard's minimum epoch may have advanced.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10319
// Move a slot to epoch e in the by-epoch index.  The intrusive set is keyed
// on slot->epoch, so the slot must be erased before the field is updated and
// re-inserted after.  Wakes min-epoch waiters since the minimum may change.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10335
10336epoch_t OSDShard::get_min_pg_epoch()
10337{
10338 std::lock_guard l(shard_lock);
10339 auto p = pg_slots_by_epoch.begin();
10340 if (p == pg_slots_by_epoch.end()) {
10341 return 0;
10342 }
10343 return p->epoch;
10344}
10345
// Block until every attached PG slot on this shard has reached epoch
// `need` (or the shard has no slots at all).  waiting_for_min_pg_epoch
// tells _detach_pg/update_pg_epoch that someone needs a notify.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  // Predicate is evaluated under shard_lock by the condvar wait.
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      return true;
    } else {
      dout(10) << need << " waiting on "
               << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10363
10364epoch_t OSDShard::get_max_waiting_epoch()
10365{
10366 std::lock_guard l(shard_lock);
10367 epoch_t r = 0;
10368 for (auto& i : pg_slots) {
10369 if (!i.second->waiting_peering.empty()) {
10370 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10371 }
10372 }
10373 return r;
10374}
10375
// Install a new osdmap on this shard and walk every PG slot:
//  - slots blocked on split/merge are left alone;
//  - pending peering items whose epoch is now covered are requeued;
//  - waiting items for PGs that no longer map to us are dropped (their
//    reserved pushes are returned via *pushes_to_free);
//  - fully idle, PG-less slots are pruned.
// Wakes worker threads if anything was requeued.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock guards shard_osdmap for readers that don't take shard_lock.
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  int queued = 0;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // split in progress; leave the slot untouched.
      dout(20) << __func__ << " " << pgid
               << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge scheduled for a future epoch; leave the slot untouched.
      dout(20) << __func__ << " " << pgid
               << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
               << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      // requeue peering work once the new map reaches its first epoch.
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
        dout(20) << __func__ << " " << pgid
                 << " pending_peering first epoch " << first
                 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
        queued += _wake_pg_slot(pgid, slot);
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
        dout(20) << __func__ << " " << pgid << " maps to us, keeping"
                 << dendl;
        ++p;
        continue;
      }
      // PG no longer maps here: drop stale/misdirected items covered by the
      // new map and release any recovery pushes they had reserved.
      while (!slot->waiting.empty() &&
             slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
        auto& qi = slot->waiting.front();
        dout(20) << __func__ << " " << pgid
                 << " waiting item " << qi
                 << " epoch " << qi.get_map_epoch()
                 << " <= " << new_osdmap->get_epoch()
                 << ", "
                 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
                     "misdirected")
                 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
        slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
        slot->num_running == 0 &&
        slot->waiting_for_split.empty() &&
        !slot->pg) {
      // nothing pending, nothing running, no PG: drop the slot entirely.
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    std::lock_guard l{sdata_wait_lock};
    // one item needs one worker; more items wake everyone.
    if (queued == 1)
      sdata_cond.notify_one();
    else
      sdata_cond.notify_all();
  }
}
10463
20effc67 10464int OSDShard::_wake_pg_slot(
11fdf7f2
TL
10465 spg_t pgid,
10466 OSDShardPGSlot *slot)
10467{
20effc67 10468 int count = 0;
11fdf7f2
TL
10469 dout(20) << __func__ << " " << pgid
10470 << " to_process " << slot->to_process
10471 << " waiting " << slot->waiting
10472 << " waiting_peering " << slot->waiting_peering << dendl;
10473 for (auto i = slot->to_process.rbegin();
10474 i != slot->to_process.rend();
10475 ++i) {
9f95a23c 10476 scheduler->enqueue_front(std::move(*i));
20effc67 10477 count++;
11fdf7f2
TL
10478 }
10479 slot->to_process.clear();
10480 for (auto i = slot->waiting.rbegin();
10481 i != slot->waiting.rend();
10482 ++i) {
9f95a23c 10483 scheduler->enqueue_front(std::move(*i));
20effc67 10484 count++;
11fdf7f2
TL
10485 }
10486 slot->waiting.clear();
10487 for (auto i = slot->waiting_peering.rbegin();
10488 i != slot->waiting_peering.rend();
10489 ++i) {
10490 // this is overkill; we requeue everything, even if some of these
10491 // items are waiting for maps we don't have yet. FIXME, maybe,
10492 // someday, if we decide this inefficiency matters
10493 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
9f95a23c 10494 scheduler->enqueue_front(std::move(*j));
20effc67 10495 count++;
11fdf7f2
TL
10496 }
10497 }
10498 slot->waiting_peering.clear();
10499 ++slot->requeue_seq;
20effc67 10500 return count;
11fdf7f2
TL
10501}
10502
// For every slot on this shard, compute the PG splits/merges implied by
// moving from the shard's current osdmap to as_of_osdmap.  Slots with an
// attached PG contribute to both sets; slots still waiting for a split
// contribute to splits only; empty slots contribute nothing.
void OSDShard::identify_splits_and_merges(
  const OSDMapRef& as_of_osdmap,
  set<pair<spg_t,epoch_t>> *split_pgs,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " " << pg_slots.size() << " slots" << dendl;
  if (shard_osdmap) {
    for (auto& i : pg_slots) {
      dout(20) << __func__ << " slot pgid:" << i.first << "slot:" << i.second.get() << dendl;
      const spg_t& pgid = i.first;
      auto *slot = i.second.get();
      if (slot->pg) {
        osd->service.identify_splits_and_merges(
          shard_osdmap, as_of_osdmap, pgid,
          split_pgs, merge_pgs);
      } else if (!slot->waiting_for_split.empty()) {
        // no PG yet, but a split is pending: track further splits only.
        osd->service.identify_splits_and_merges(
          shard_osdmap, as_of_osdmap, pgid,
          split_pgs, nullptr);
      } else {
        dout(20) << __func__ << " slot " << pgid
                 << " has no pg and waiting_for_split " << dendl;
      }
    }
  }
  dout(20) << __func__ << " " << split_pgs->size() << " splits, "
           << merge_pgs->size() << " merges" << dendl;
}
10532
// Pre-create slots for split children in *pgids that hash to this shard
// (consumed entries are erased from the set).  If this shard's osdmap is
// already ahead of as_of_osdmap, also compute and prime any grandchildren
// implied by the newer map.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
                            set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
        as_of_osdmap, shard_osdmap, i.first,
        &newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
             << shard_osdmap->get_epoch() << ", new children " << newer_children
             << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10560
// Create (or update) a waiting_for_split slot for each pgid in *pgids that
// hashes to this shard, erasing consumed entries; pgids for other shards
// are left in the set for their owners.  Caller holds shard_lock.
void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
{
  dout(10) << *pgids << dendl;
  auto p = pgids->begin();
  while (p != pgids->end()) {
    unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
    if (shard_index == shard_id) {
      // emplace with nullptr first; only allocate the slot on real insert.
      auto r = pg_slots.emplace(p->first, nullptr);
      if (r.second) {
        dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
        r.first->second = make_unique<OSDShardPGSlot>();
        r.first->second->waiting_for_split.insert(p->second);
      } else {
        // slot already exists (possibly primed earlier); just add the epoch.
        auto q = r.first;
        ceph_assert(q != pg_slots.end());
        dout(10) << "priming (existing) slot " << p->first << " e" << p->second
                 << dendl;
        q->second->waiting_for_split.insert(p->second);
      }
      p = pgids->erase(p);
    } else {
      ++p;
    }
  }
}
10586
// Prepare this shard's slots for upcoming PG merges listed in *merge_pgs
// (pgid -> merge epoch); consumed entries are erased, others are left for
// their owning shards.  A merge participant that has no PG and no earlier
// pending split gets an empty placeholder PG created so the merge can run.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
                            set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
           << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // belongs to another shard; leave it in the set.
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
               << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
               *slot->waiting_for_split.begin() < epoch) {
      // a split before the merge epoch will produce the PG; don't create one.
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
               << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
               << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      PGCreateInfo cinfo(pgid, epoch - 1,
                         history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      // handle_pg_create_info returned the PG locked; release it now.
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10634
11fdf7f2 10635void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10636{
20effc67 10637 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
11fdf7f2
TL
10638 epoch_t epoch;
10639 {
10640 std::lock_guard l(shard_lock);
20effc67 10641 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
11fdf7f2
TL
10642 auto p = pg_slots.find(pg->pg_id);
10643 ceph_assert(p != pg_slots.end());
10644 auto *slot = p->second.get();
20effc67
TL
10645 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10646 << slot->waiting_for_split << dendl;
11fdf7f2
TL
10647 ceph_assert(!slot->pg);
10648 ceph_assert(!slot->waiting_for_split.empty());
10649 _attach_pg(slot, pg);
10650
10651 epoch = pg->get_osdmap_epoch();
10652 ceph_assert(slot->waiting_for_split.count(epoch));
10653 slot->waiting_for_split.erase(epoch);
10654 if (slot->waiting_for_split.empty()) {
10655 _wake_pg_slot(pg->pg_id, slot);
10656 } else {
10657 dout(10) << __func__ << " still waiting for split on "
10658 << slot->waiting_for_split << dendl;
10659 }
7c673cae 10660 }
11fdf7f2
TL
10661
10662 // kick child to ensure it pulls up to the latest osdmap
10663 osd->enqueue_peering_evt(
10664 pg->pg_id,
10665 PGPeeringEventRef(
10666 std::make_shared<PGPeeringEvent>(
10667 epoch,
10668 epoch,
10669 NullEvt())));
10670
10671 std::lock_guard l{sdata_wait_lock};
10672 sdata_cond.notify_one();
7c673cae
FG
10673}
10674
11fdf7f2 10675void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10676{
11fdf7f2
TL
10677 std::lock_guard l(shard_lock);
10678 vector<spg_t> to_delete;
10679 for (auto& i : pg_slots) {
10680 if (i.first != parent &&
10681 i.first.get_ancestor(old_pg_num) == parent) {
10682 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10683 << dendl;
10684 _wake_pg_slot(i.first, i.second.get());
10685 to_delete.push_back(i.first);
10686 }
10687 }
10688 for (auto pgid : to_delete) {
10689 pg_slots.erase(pgid);
10690 }
10691}
10692
a4b75251
TL
// Ask this shard's op scheduler to re-read its configuration (used after a
// config option it depends on has changed).
void OSDShard::update_scheduler_config()
{
  scheduler->update_configuration();
}
10697
20effc67
TL
10698std::string OSDShard::get_scheduler_type()
10699{
10700 std::ostringstream scheduler_type;
10701 scheduler_type << *scheduler;
10702 return scheduler_type.str();
10703}
10704
9f95a23c
TL
// Construct one op-queue shard for the OSD.  Note: member-initializer order
// matters — the lock name strings are built from shard_name before the
// mutexes that use them, and context_queue captures sdata_wait_lock/
// sdata_cond after they exist.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    // scheduler choice depends on device rotational state and store type.
    scheduler(ceph::osd::scheduler::make_scheduler(
      cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
      osd->store->get_type(), osd->monc)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10725
11fdf7f2
TL
10726
10727// =============================================================
10728
10729#undef dout_context
10730#define dout_context osd->cct
10731#undef dout_prefix
10732#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10733
// Park a queue item on its PG slot until the needed osdmap arrives:
// peering items are bucketed by their map epoch in waiting_peering,
// everything else goes on the slot's plain waiting list.
void OSD::ShardedOpWQ::_add_slot_waiter(
  spg_t pgid,
  OSDShardPGSlot *slot,
  OpSchedulerItem&& qi)
{
  if (qi.is_peering()) {
    dout(20) << __func__ << " " << pgid
             << " peering, item epoch is "
             << qi.get_map_epoch()
             << ", will wait on " << qi << dendl;
    slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
  } else {
    dout(20) << __func__ << " " << pgid
             << " item epoch is "
             << qi.get_map_epoch()
             << ", will wait on " << qi << dendl;
    slot->waiting.push_back(std::move(qi));
  }
}
10753
10754#undef dout_prefix
10755#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10756
10757void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10758{
11fdf7f2
TL
10759 uint32_t shard_index = thread_index % osd->num_shards;
10760 auto& sdata = osd->shards[shard_index];
10761 ceph_assert(sdata);
10762
10763 // If all threads of shards do oncommits, there is a out-of-order
10764 // problem. So we choose the thread which has the smallest
10765 // thread_index(thread_index < num_shards) of shard to do oncommit
10766 // callback.
10767 bool is_smallest_thread_index = thread_index < osd->num_shards;
7c673cae
FG
10768
10769 // peek at spg_t
11fdf7f2 10770 sdata->shard_lock.lock();
9f95a23c 10771 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10772 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10773 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10774 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10775 // we raced with a context_queue addition, don't wait
10776 wait_lock.unlock();
10777 } else if (!sdata->stop_waiting) {
10778 dout(20) << __func__ << " empty q, waiting" << dendl;
10779 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10780 sdata->shard_lock.unlock();
10781 sdata->sdata_cond.wait(wait_lock);
10782 wait_lock.unlock();
10783 sdata->shard_lock.lock();
9f95a23c 10784 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10785 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10786 sdata->shard_lock.unlock();
10787 return;
10788 }
e306af50 10789 // found a work item; reapply default wq timeouts
11fdf7f2 10790 osd->cct->get_heartbeat_map()->reset_timeout(hb,
e306af50 10791 timeout_interval, suicide_interval);
11fdf7f2
TL
10792 } else {
10793 dout(20) << __func__ << " need return immediately" << dendl;
10794 wait_lock.unlock();
10795 sdata->shard_lock.unlock();
7c673cae
FG
10796 return;
10797 }
10798 }
11fdf7f2
TL
10799
10800 list<Context *> oncommits;
9f95a23c
TL
10801 if (is_smallest_thread_index) {
10802 sdata->context_queue.move_to(oncommits);
7c673cae 10803 }
11fdf7f2 10804
f67539c2
TL
10805 WorkItem work_item;
10806 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10807 if (sdata->scheduler->empty()) {
10808 if (osd->is_stopping()) {
10809 sdata->shard_lock.unlock();
10810 for (auto c : oncommits) {
10811 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10812 delete c;
10813 }
10814 return; // OSD shutdown, discard.
10815 }
10816 sdata->shard_lock.unlock();
10817 handle_oncommits(oncommits);
10818 return;
10819 }
10820
10821 work_item = sdata->scheduler->dequeue();
11fdf7f2
TL
10822 if (osd->is_stopping()) {
10823 sdata->shard_lock.unlock();
10824 for (auto c : oncommits) {
f67539c2
TL
10825 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10826 delete c;
11fdf7f2
TL
10827 }
10828 return; // OSD shutdown, discard.
7c673cae 10829 }
7c673cae 10830
f67539c2
TL
10831 // If the work item is scheduled in the future, wait until
10832 // the time returned in the dequeue response before retrying.
10833 if (auto when_ready = std::get_if<double>(&work_item)) {
10834 if (is_smallest_thread_index) {
10835 sdata->shard_lock.unlock();
10836 handle_oncommits(oncommits);
2a845540 10837 sdata->shard_lock.lock();
f67539c2
TL
10838 }
10839 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10840 auto future_time = ceph::real_clock::from_double(*when_ready);
10841 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
a4b75251
TL
10842 // Disable heartbeat timeout until we find a non-future work item to process.
10843 osd->cct->get_heartbeat_map()->clear_timeout(hb);
f67539c2
TL
10844 sdata->shard_lock.unlock();
10845 ++sdata->waiting_threads;
10846 sdata->sdata_cond.wait_until(wait_lock, future_time);
10847 --sdata->waiting_threads;
10848 wait_lock.unlock();
10849 sdata->shard_lock.lock();
a4b75251
TL
10850 // Reapply default wq timeouts
10851 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10852 timeout_interval, suicide_interval);
2a845540
TL
10853 // Populate the oncommits list if there were any additions
10854 // to the context_queue while we were waiting
10855 if (is_smallest_thread_index) {
10856 sdata->context_queue.move_to(oncommits);
10857 }
f67539c2
TL
10858 }
10859 } // while
10860
10861 // Access the stored item
10862 auto item = std::move(std::get<OpSchedulerItem>(work_item));
11fdf7f2
TL
10863 if (osd->is_stopping()) {
10864 sdata->shard_lock.unlock();
10865 for (auto c : oncommits) {
10866 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10867 delete c;
10868 }
10869 return; // OSD shutdown, discard.
10870 }
7c673cae 10871
11fdf7f2
TL
10872 const auto token = item.get_ordering_token();
10873 auto r = sdata->pg_slots.emplace(token, nullptr);
10874 if (r.second) {
10875 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 10876 }
11fdf7f2
TL
10877 OSDShardPGSlot *slot = r.first->second.get();
10878 dout(20) << __func__ << " " << token
10879 << (r.second ? " (new)" : "")
10880 << " to_process " << slot->to_process
10881 << " waiting " << slot->waiting
10882 << " waiting_peering " << slot->waiting_peering
10883 << dendl;
10884 slot->to_process.push_back(std::move(item));
10885 dout(20) << __func__ << " " << slot->to_process.back()
10886 << " queued" << dendl;
7c673cae 10887
11fdf7f2
TL
10888 retry_pg:
10889 PGRef pg = slot->pg;
7c673cae 10890
11fdf7f2
TL
10891 // lock pg (if we have it)
10892 if (pg) {
10893 // note the requeue seq now...
10894 uint64_t requeue_seq = slot->requeue_seq;
10895 ++slot->num_running;
7c673cae 10896
11fdf7f2
TL
10897 sdata->shard_lock.unlock();
10898 osd->service.maybe_inject_dispatch_delay();
10899 pg->lock();
10900 osd->service.maybe_inject_dispatch_delay();
10901 sdata->shard_lock.lock();
7c673cae 10902
11fdf7f2
TL
10903 auto q = sdata->pg_slots.find(token);
10904 if (q == sdata->pg_slots.end()) {
10905 // this can happen if we race with pg removal.
10906 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10907 pg->unlock();
10908 sdata->shard_lock.unlock();
10909 handle_oncommits(oncommits);
10910 return;
10911 }
10912 slot = q->second.get();
10913 --slot->num_running;
7c673cae 10914
11fdf7f2
TL
10915 if (slot->to_process.empty()) {
10916 // raced with _wake_pg_slot or consume_map
10917 dout(20) << __func__ << " " << token
10918 << " nothing queued" << dendl;
7c673cae 10919 pg->unlock();
11fdf7f2
TL
10920 sdata->shard_lock.unlock();
10921 handle_oncommits(oncommits);
10922 return;
7c673cae 10923 }
11fdf7f2
TL
10924 if (requeue_seq != slot->requeue_seq) {
10925 dout(20) << __func__ << " " << token
10926 << " requeue_seq " << slot->requeue_seq << " > our "
10927 << requeue_seq << ", we raced with _wake_pg_slot"
10928 << dendl;
7c673cae 10929 pg->unlock();
11fdf7f2
TL
10930 sdata->shard_lock.unlock();
10931 handle_oncommits(oncommits);
10932 return;
7c673cae 10933 }
11fdf7f2
TL
10934 if (slot->pg != pg) {
10935 // this can happen if we race with pg removal.
10936 dout(20) << __func__ << " slot " << token << " no longer attached to "
10937 << pg << dendl;
7c673cae 10938 pg->unlock();
11fdf7f2 10939 goto retry_pg;
7c673cae 10940 }
7c673cae
FG
10941 }
10942
11fdf7f2
TL
10943 dout(20) << __func__ << " " << token
10944 << " to_process " << slot->to_process
10945 << " waiting " << slot->waiting
10946 << " waiting_peering " << slot->waiting_peering << dendl;
10947
10948 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10949 suicide_interval);
10950
7c673cae 10951 // take next item
11fdf7f2
TL
10952 auto qi = std::move(slot->to_process.front());
10953 slot->to_process.pop_front();
10954 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
10955 set<pair<spg_t,epoch_t>> new_children;
10956 OSDMapRef osdmap;
7c673cae 10957
11fdf7f2 10958 while (!pg) {
7c673cae 10959 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
10960 osdmap = sdata->shard_osdmap;
10961 const PGCreateInfo *create_info = qi.creates_pg();
10962 if (!slot->waiting_for_split.empty()) {
10963 dout(20) << __func__ << " " << token
10964 << " splitting " << slot->waiting_for_split << dendl;
10965 _add_slot_waiter(token, slot, std::move(qi));
10966 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
10967 dout(20) << __func__ << " " << token
10968 << " map " << qi.get_map_epoch() << " > "
10969 << osdmap->get_epoch() << dendl;
10970 _add_slot_waiter(token, slot, std::move(qi));
10971 } else if (qi.is_peering()) {
10972 if (!qi.peering_requires_pg()) {
10973 // for pg-less events, we run them under the ordering lock, since
10974 // we don't have the pg lock to keep them ordered.
10975 qi.run(osd, sdata, pg, tp_handle);
10976 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
10977 if (create_info) {
10978 if (create_info->by_mon &&
10979 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
10980 dout(20) << __func__ << " " << token
10981 << " no pg, no longer primary, ignoring mon create on "
10982 << qi << dendl;
10983 } else {
10984 dout(20) << __func__ << " " << token
10985 << " no pg, should create on " << qi << dendl;
10986 pg = osd->handle_pg_create_info(osdmap, create_info);
10987 if (pg) {
10988 // we created the pg! drop out and continue "normally"!
10989 sdata->_attach_pg(slot, pg.get());
10990 sdata->_wake_pg_slot(token, slot);
10991
10992 // identify split children between create epoch and shard epoch.
10993 osd->service.identify_splits_and_merges(
10994 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
10995 sdata->_prime_splits(&new_children);
10996 // distribute remaining split children to other shards below!
10997 break;
10998 }
10999 dout(20) << __func__ << " ignored create on " << qi << dendl;
11000 }
11001 } else {
11002 dout(20) << __func__ << " " << token
11003 << " no pg, peering, !create, discarding " << qi << dendl;
11004 }
11005 } else {
11006 dout(20) << __func__ << " " << token
11007 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11008 << ", discarding " << qi
11009 << dendl;
11010 }
11011 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11012 dout(20) << __func__ << " " << token
11013 << " no pg, should exist e" << osdmap->get_epoch()
11014 << ", will wait on " << qi << dendl;
11015 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 11016 } else {
11fdf7f2
TL
11017 dout(20) << __func__ << " " << token
11018 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11019 << ", dropping " << qi << dendl;
7c673cae 11020 // share map with client?
9f95a23c
TL
11021 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11022 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11023 sdata->shard_osdmap,
11024 (*_op)->sent_epoch);
7c673cae 11025 }
11fdf7f2 11026 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 11027 if (pushes_to_free > 0) {
11fdf7f2 11028 sdata->shard_lock.unlock();
7c673cae 11029 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 11030 handle_oncommits(oncommits);
7c673cae
FG
11031 return;
11032 }
11033 }
11fdf7f2
TL
11034 sdata->shard_lock.unlock();
11035 handle_oncommits(oncommits);
7c673cae
FG
11036 return;
11037 }
11fdf7f2
TL
11038 if (qi.is_peering()) {
11039 OSDMapRef osdmap = sdata->shard_osdmap;
11040 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11041 _add_slot_waiter(token, slot, std::move(qi));
11042 sdata->shard_lock.unlock();
11043 pg->unlock();
11044 handle_oncommits(oncommits);
11045 return;
11046 }
11047 }
11048 sdata->shard_lock.unlock();
7c673cae 11049
11fdf7f2
TL
11050 if (!new_children.empty()) {
11051 for (auto shard : osd->shards) {
11052 shard->prime_splits(osdmap, &new_children);
11053 }
11054 ceph_assert(new_children.empty());
11055 }
7c673cae
FG
11056
11057 // osd_opwq_process marks the point at which an operation has been dequeued
11058 // and will begin to be handled by a worker thread.
11059 {
11060#ifdef WITH_LTTNG
11061 osd_reqid_t reqid;
9f95a23c 11062 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11063 reqid = (*_op)->get_reqid();
11064 }
11065#endif
11066 tracepoint(osd, opwq_process_start, reqid.name._type,
11067 reqid.name._num, reqid.tid, reqid.inc);
11068 }
11069
11070 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11071 Formatter *f = Formatter::create("json");
11072 f->open_object_section("q");
11073 dump(f);
11074 f->close_section();
11075 f->flush(*_dout);
11076 delete f;
11077 *_dout << dendl;
11078
11fdf7f2 11079 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
11080
11081 {
11082#ifdef WITH_LTTNG
11083 osd_reqid_t reqid;
9f95a23c 11084 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11085 reqid = (*_op)->get_reqid();
11086 }
11087#endif
11088 tracepoint(osd, opwq_process_finish, reqid.name._type,
11089 reqid.name._num, reqid.tid, reqid.inc);
11090 }
11091
11fdf7f2 11092 handle_oncommits(oncommits);
7c673cae
FG
11093}
11094
9f95a23c 11095void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
1d09f67e
TL
11096 if (unlikely(m_fast_shutdown) ) {
11097 // stop enqueing when we are in the middle of a fast shutdown
11098 return;
11099 }
11100
7c673cae 11101 uint32_t shard_index =
11fdf7f2 11102 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 11103
11fdf7f2 11104 OSDShard* sdata = osd->shards[shard_index];
7c673cae 11105 assert (NULL != sdata);
20effc67
TL
11106
11107 dout(20) << __func__ << " " << item << dendl;
7c673cae 11108
9f95a23c
TL
11109 bool empty = true;
11110 {
11111 std::lock_guard l{sdata->shard_lock};
11112 empty = sdata->scheduler->empty();
11113 sdata->scheduler->enqueue(std::move(item));
11114 }
7c673cae 11115
f67539c2 11116 {
9f95a23c 11117 std::lock_guard l{sdata->sdata_wait_lock};
f67539c2
TL
11118 if (empty) {
11119 sdata->sdata_cond.notify_all();
11120 } else if (sdata->waiting_threads) {
11121 sdata->sdata_cond.notify_one();
11122 }
9f95a23c 11123 }
7c673cae
FG
11124}
11125
9f95a23c 11126void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
7c673cae 11127{
1d09f67e
TL
11128 if (unlikely(m_fast_shutdown) ) {
11129 // stop enqueing when we are in the middle of a fast shutdown
11130 return;
11131 }
11132
11fdf7f2
TL
11133 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11134 auto& sdata = osd->shards[shard_index];
11135 ceph_assert(sdata);
11136 sdata->shard_lock.lock();
11137 auto p = sdata->pg_slots.find(item.get_ordering_token());
11138 if (p != sdata->pg_slots.end() &&
11139 !p->second->to_process.empty()) {
7c673cae 11140 // we may be racing with _process, which has dequeued a new item
9f95a23c 11141 // from scheduler, put it on to_process, and is now busy taking the
7c673cae
FG
11142 // pg lock. ensure this old requeued item is ordered before any
11143 // such newer item in to_process.
11fdf7f2
TL
11144 p->second->to_process.push_front(std::move(item));
11145 item = std::move(p->second->to_process.back());
11146 p->second->to_process.pop_back();
11147 dout(20) << __func__
11148 << " " << p->second->to_process.front()
11149 << " shuffled w/ " << item << dendl;
7c673cae 11150 } else {
11fdf7f2 11151 dout(20) << __func__ << " " << item << dendl;
7c673cae 11152 }
9f95a23c 11153 sdata->scheduler->enqueue_front(std::move(item));
11fdf7f2
TL
11154 sdata->shard_lock.unlock();
11155 std::lock_guard l{sdata->sdata_wait_lock};
11156 sdata->sdata_cond.notify_one();
7c673cae
FG
11157}
11158
1d09f67e
TL
11159void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11160{
11161 uint32_t shard_index = 0;
11162 m_fast_shutdown = true;
11163
11164 for (; shard_index < osd->num_shards; shard_index++) {
11165 auto& sdata = osd->shards[shard_index];
11166 ceph_assert(sdata);
11167 sdata->shard_lock.lock();
11168 int work_count = 0;
11169 while(! sdata->scheduler->empty() ) {
11170 auto work_item = sdata->scheduler->dequeue();
11171 work_count++;
11172 }
11173 sdata->shard_lock.unlock();
11174 }
11175}
11176
f67539c2 11177namespace ceph::osd_cmds {
7c673cae 11178
2a845540
TL
11179int heap(CephContext& cct,
11180 const cmdmap_t& cmdmap,
11181 std::ostream& outos,
11182 std::ostream& erros)
7c673cae
FG
11183{
11184 if (!ceph_using_tcmalloc()) {
2a845540 11185 erros << "could not issue heap profiler command -- not using tcmalloc!";
7c673cae
FG
11186 return -EOPNOTSUPP;
11187 }
f67539c2 11188
7c673cae 11189 string cmd;
9f95a23c 11190 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
2a845540 11191 erros << "unable to get value for command \"" << cmd << "\"";
7c673cae 11192 return -EINVAL;
11fdf7f2 11193 }
f67539c2 11194
7c673cae
FG
11195 std::vector<std::string> cmd_vec;
11196 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11197
11198 string val;
9f95a23c 11199 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
11200 cmd_vec.push_back(val);
11201 }
f67539c2 11202
2a845540 11203 ceph_heap_profiler_handle_command(cmd_vec, outos);
f67539c2 11204
7c673cae
FG
11205 return 0;
11206}
f67539c2
TL
11207
11208} // namespace ceph::osd_cmds