// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrubber/scrub_machine.h"
#include "osd/scrubber/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"
#include "include/scope_guard.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#include "osd_tracer.h"


#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;
using TOPNSPC::common::cmd_getval_or;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}


//Initial features in new superblock.
//Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
		   ceph_osd_feature_incompat);
}

//Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  //Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

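// OSDService bundles the state and helpers shared by PGs and other OSD
// components: the objecter, the recovery/backfill reservers, the OSDMap
// caches, the cache-tiering agent, and full/statfs tracking.  Most members
// are wired directly to the owning OSD in the initializer list below.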
OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  m_scrub_queue{cct, *this},
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

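// Walk the recorded pg_num history for this PG's pool between old_map and
// new_map, doing a breadth-first scan starting from `pgid`, and report every
// (child, epoch) produced by a split and, if merge_pgs is non-null, every
// (pg, epoch) involved in a merge.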
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge).  note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch, epoch,
	RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

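// Body of the cache-tiering agent thread: repeatedly take the
// highest-priority tier in agent_queue and ask one of its PGs to do
// flush/evict work, sleeping on agent_cond whenever the queue is empty,
// the agent is inactive, or the op quota is exhausted.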
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	       << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	       << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

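// Recompute promote_probability_millis (0..1000) so that cache promotions
// track the configured objects/sec and bytes/sec targets, smoothing toward
// the new value and correcting for observed skew; also refresh the per-tick
// hard caps promote_max_objects / promote_max_bytes.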
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

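// Map the current usage ratios onto a fullness state (NONE .. FAILSAFE).
// Thresholds come from the OSDMap, clamped so that
// nearfull <= backfillfull <= full <= failsafe; an active injectfull
// setting overrides the computed state for testing.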
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
		       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
				    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
	       << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

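// Return the used/total ratio after optionally subtracting adjust_used from
// the available space and folding in per-PG pending backfill adjustments;
// *pratio reports the unadjusted (physical) ratio.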
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

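// Send a message to an OSD peer over the cluster messenger, using the
// reserved next map to verify the peer is still up as of from_epoch; the
// message is dropped otherwise.  The peer is also offered a newer map via
// maybe_share_map().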
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
	next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
	next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
	  next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
	next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
				    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
			 const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

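// Flush pg_temp_wanted to the monitor.  Forced and non-forced requests are
// batched into separate MOSDPGTemp messages (ms[0]/ms[1]) since `forced` is
// a per-message flag.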
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
				 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
			    const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

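// Ask the monitor to mark this OSD down (and dead) before shutting down,
// then wait up to osd_mon_shutdown_timeout for the acknowledgement that
// arrives via got_stop_ack() before proceeding with shutdown.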
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true,  // request ack
	true   // mark as down and dead
	));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

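// Build an MOSDMap covering (since, to], preferring incremental maps and
// falling back to full maps when an incremental is missing, bounded by
// osd_map_message_max / osd_map_message_max_bytes.  On failure it still
// tries to return something useful (see the panic label).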
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
					       OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
				      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
	   << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
			       osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
	       << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

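// Look up an OSDMap by epoch, first in the map cache and then by loading
// and decoding it from the store; returns a null ref if the map cannot be
// loaded.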
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
				version_t uv,
				vector<pg_log_op_return_item_t> op_returns)
{
  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
				       !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
  reply->set_reply_versions(v, uv);
  reply->set_op_returns(op_returns);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}

void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}

void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}

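// Helpers that queue a scrub-related event of type MSG_TYPE for a PG
// through the op scheduler.  The first overload carries a replica queue
// priority and an activation token; the second is for events that need
// neither.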
1670template <class MSG_TYPE>
1671void OSDService::queue_scrub_event_msg(PG* pg,
1672 Scrub::scrub_prio_t with_priority,
20effc67
TL
1673 unsigned int qu_priority,
1674 Scrub::act_token_t act_token)
11fdf7f2 1675{
11fdf7f2 1676 const auto epoch = pg->get_osdmap_epoch();
20effc67
TL
1677 auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
1678 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
1679 << ". Epoch: " << epoch << " token: " << act_token << dendl;
f67539c2
TL
1680
1681 enqueue_back(OpSchedulerItem(
1682 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1683 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1684}
1685
1686template <class MSG_TYPE>
20effc67
TL
1687void OSDService::queue_scrub_event_msg(PG* pg,
1688 Scrub::scrub_prio_t with_priority)
f67539c2
TL
1689{
1690 const auto epoch = pg->get_osdmap_epoch();
1691 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1692 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1693
1694 enqueue_back(OpSchedulerItem(
1695 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1696 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1697}
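// A minimal sketch of how one more scrub state-machine event could be wired
// through the helpers above (PGScrubExampleEvent and queue_scrub_example are
// hypothetical names used only for illustration; the real wrappers follow
// immediately below):
//
//   void OSDService::queue_scrub_example(PG* pg, Scrub::scrub_prio_t with_priority)
//   {
//     // Resulting scrub event: 'ExampleEvent'
//     queue_scrub_event_msg<PGScrubExampleEvent>(pg, with_priority);
//   }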
1698
1699void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1700{
1701 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1702}
1703
1704void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1705{
1706 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1707}
1708
1709void OSDService::queue_for_rep_scrub(PG* pg,
1710 Scrub::scrub_prio_t with_priority,
20effc67
TL
1711 unsigned int qu_priority,
1712 Scrub::act_token_t act_token)
f67539c2 1713{
20effc67 1714 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
f67539c2
TL
1715}
1716
1717void OSDService::queue_for_rep_scrub_resched(PG* pg,
1718 Scrub::scrub_prio_t with_priority,
20effc67
TL
1719 unsigned int qu_priority,
1720 Scrub::act_token_t act_token)
f67539c2
TL
1721{
1722 // Resulting scrub event: 'SchedReplica'
20effc67
TL
1723 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1724 act_token);
f67539c2
TL
1725}
1726
1727void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1728{
1729 // Resulting scrub event: 'RemotesReserved'
1730 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1731}
1732
1733void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1734{
1735 // Resulting scrub event: 'ReservationFailure'
1736 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1737}
1738
1739void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1740{
1741 // Resulting scrub event: 'InternalSchedScrub'
1742 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1743}
1744
1745void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1746{
1747 // Resulting scrub event: 'ActivePushesUpd'
1748 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1749}
1750
20effc67
TL
1751void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1752{
1753 // Resulting scrub event: 'SelectedChunkFree'
1754 queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1755}
1756
1757void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
1758{
1759 // Resulting scrub event: 'ChunkIsBusy'
1760 queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1761}
1762
f67539c2
TL
1763void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1764{
1765 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1766}
1767
1768void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1769{
1770 // Resulting scrub event: 'Unblocked'
1771 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1772}
1773
1774void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1775{
1776 // Resulting scrub event: 'DigestUpdate'
1777 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1778}
1779
20effc67
TL
1780void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
1781{
1782 // Resulting scrub event: 'IntLocalMapDone'
1783 queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
1784}
1785
f67539c2
TL
1786void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1787{
1788 // Resulting scrub event: 'GotReplicas'
1789 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1790}
1791
20effc67
TL
1792void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
1793{
1794 // Resulting scrub event: 'MapsCompared'
1795 queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
1796}
1797
f67539c2
TL
1798void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1799{
1800 // Resulting scrub event: 'ReplicaPushesUpd'
1801 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
11fdf7f2
TL
1802}
1803
20effc67
TL
1804void OSDService::queue_scrub_is_finished(PG *pg)
1805{
1806 // Resulting scrub event: 'ScrubFinished'
1807 queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1808}
1809
1810void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
1811{
1812 // Resulting scrub event: 'NextChunk'
1813 queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1814}
1815
11fdf7f2
TL
1816void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1817{
1818 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1819 enqueue_back(
9f95a23c
TL
1820 OpSchedulerItem(
1821 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1822 new PGDelete(pgid, e)),
1823 cct->_conf->osd_pg_delete_cost,
1824 cct->_conf->osd_pg_delete_priority,
1825 ceph_clock_now(),
1826 0,
1827 e));
1828}
1829
1830bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1831{
1832 return osd->try_finish_pg_delete(pg, old_pg_num);
1833}
1834
1835// ---
1836
1837void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1838{
1839 std::lock_guard l(merge_lock);
1840 dout(10) << __func__ << " " << pg->pg_id << dendl;
1841 ready_to_merge_source[pg->pg_id.pgid] = version;
1842 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1843 _send_ready_to_merge();
1844}
1845
1846void OSDService::set_ready_to_merge_target(PG *pg,
1847 eversion_t version,
1848 epoch_t last_epoch_started,
1849 epoch_t last_epoch_clean)
1850{
1851 std::lock_guard l(merge_lock);
1852 dout(10) << __func__ << " " << pg->pg_id << dendl;
1853 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1854 make_tuple(version,
1855 last_epoch_started,
1856 last_epoch_clean)));
1857 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1858 _send_ready_to_merge();
1859}
1860
1861void OSDService::set_not_ready_to_merge_source(pg_t source)
1862{
1863 std::lock_guard l(merge_lock);
1864 dout(10) << __func__ << " " << source << dendl;
1865 not_ready_to_merge_source.insert(source);
1866 assert(ready_to_merge_source.count(source) == 0);
1867 _send_ready_to_merge();
1868}
1869
1870void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1871{
1872 std::lock_guard l(merge_lock);
1873 dout(10) << __func__ << " " << target << " source " << source << dendl;
1874 not_ready_to_merge_target[target] = source;
1875 assert(ready_to_merge_target.count(target) == 0);
1876 _send_ready_to_merge();
1877}
1878
1879void OSDService::send_ready_to_merge()
1880{
1881 std::lock_guard l(merge_lock);
1882 _send_ready_to_merge();
1883}
1884
1885void OSDService::_send_ready_to_merge()
1886{
1887 dout(20) << __func__
1888 << " ready_to_merge_source " << ready_to_merge_source
1889 << " not_ready_to_merge_source " << not_ready_to_merge_source
1890 << " ready_to_merge_target " << ready_to_merge_target
1891 << " not_ready_to_merge_target " << not_ready_to_merge_target
1892 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1893 << dendl;
1894 for (auto src : not_ready_to_merge_source) {
1895 if (sent_ready_to_merge_source.count(src) == 0) {
1896 monc->send_mon_message(new MOSDPGReadyToMerge(
1897 src,
1898 {}, {}, 0, 0,
1899 false,
1900 osdmap->get_epoch()));
1901 sent_ready_to_merge_source.insert(src);
1902 }
1903 }
1904 for (auto p : not_ready_to_merge_target) {
1905 if (sent_ready_to_merge_source.count(p.second) == 0) {
1906 monc->send_mon_message(new MOSDPGReadyToMerge(
1907 p.second,
1908 {}, {}, 0, 0,
1909 false,
1910 osdmap->get_epoch()));
1911 sent_ready_to_merge_source.insert(p.second);
1912 }
1913 }
1914 for (auto src : ready_to_merge_source) {
1915 if (not_ready_to_merge_source.count(src.first) ||
1916 not_ready_to_merge_target.count(src.first.get_parent())) {
1917 continue;
1918 }
1919 auto p = ready_to_merge_target.find(src.first.get_parent());
1920 if (p != ready_to_merge_target.end() &&
1921 sent_ready_to_merge_source.count(src.first) == 0) {
1922 monc->send_mon_message(new MOSDPGReadyToMerge(
1923 src.first, // source pgid
1924 src.second, // src version
1925 std::get<0>(p->second), // target version
1926 std::get<1>(p->second), // PG's last_epoch_started
1927 std::get<2>(p->second), // PG's last_epoch_clean
1928 true,
1929 osdmap->get_epoch()));
1930 sent_ready_to_merge_source.insert(src.first);
1931 }
1932 }
1933}
1934
1935void OSDService::clear_ready_to_merge(PG *pg)
1936{
1937 std::lock_guard l(merge_lock);
1938 dout(10) << __func__ << " " << pg->pg_id << dendl;
1939 ready_to_merge_source.erase(pg->pg_id.pgid);
1940 ready_to_merge_target.erase(pg->pg_id.pgid);
1941 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1942 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1943 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1944}
1945
1946void OSDService::clear_sent_ready_to_merge()
1947{
1948 std::lock_guard l(merge_lock);
1949 sent_ready_to_merge_source.clear();
1950}
1951
9f95a23c 1952void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
11fdf7f2
TL
1953{
1954 std::lock_guard l(merge_lock);
1955 auto i = sent_ready_to_merge_source.begin();
1956 while (i != sent_ready_to_merge_source.end()) {
1957 if (!osdmap->pg_exists(*i)) {
1958 dout(10) << __func__ << " " << *i << dendl;
1959 i = sent_ready_to_merge_source.erase(i);
1960 } else {
1961 ++i;
1962 }
1963 }
7c673cae
FG
1964}
1965
11fdf7f2
TL
1966// ---
1967
1968void OSDService::_queue_for_recovery(
1969 std::pair<epoch_t, PGRef> p,
1970 uint64_t reserved_pushes)
1971{
9f95a23c 1972 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
11fdf7f2 1973 enqueue_back(
9f95a23c
TL
1974 OpSchedulerItem(
1975 unique_ptr<OpSchedulerItem::OpQueueable>(
11fdf7f2
TL
1976 new PGRecovery(
1977 p.second->get_pgid(), p.first, reserved_pushes)),
1978 cct->_conf->osd_recovery_cost,
1979 cct->_conf->osd_recovery_priority,
1980 ceph_clock_now(),
1981 0,
1982 p.first));
1983}
7c673cae
FG
1984
1985// ====================================================================
1986// OSD
1987
1988#undef dout_prefix
1989#define dout_prefix *_dout
1990
1991// Commands shared between OSD's console and admin console:
f67539c2 1992namespace ceph::osd_cmds {
7c673cae 1993
2a845540
TL
1994int heap(CephContext& cct,
1995 const cmdmap_t& cmdmap,
1996 std::ostream& outos,
1997 std::ostream& erros);
f67539c2
TL
1998
1999} // namespace ceph::osd_cmds
7c673cae 2000
20effc67
TL
2001int OSD::mkfs(CephContext *cct,
2002 std::unique_ptr<ObjectStore> store,
2003 uuid_d fsid,
2004 int whoami,
2005 string osdspec_affinity)
7c673cae
FG
2006{
2007 int ret;
2008
7c673cae
FG
2009 OSDSuperblock sb;
2010 bufferlist sbbl;
7c673cae
FG
2011 // if we are fed a uuid for this osd, use it.
2012 store->set_fsid(cct->_conf->osd_uuid);
2013
2014 ret = store->mkfs();
2015 if (ret) {
224ce89b
WB
2016 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2017 << cpp_strerror(ret) << dendl;
20effc67 2018 return ret;
7c673cae
FG
2019 }
2020
31f18b77 2021 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
2022
2023 ret = store->mount();
2024 if (ret) {
224ce89b
WB
2025 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2026 << cpp_strerror(ret) << dendl;
20effc67 2027 return ret;
7c673cae
FG
2028 }
2029
20effc67
TL
2030 auto umount_store = make_scope_guard([&] {
2031 store->umount();
2032 });
2033
2034 ObjectStore::CollectionHandle ch =
2035 store->open_collection(coll_t::meta());
11fdf7f2
TL
2036 if (ch) {
2037 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2038 if (ret < 0) {
2039 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
20effc67 2040 return ret;
11fdf7f2 2041 }
7c673cae
FG
2042 /* if we already have a superblock, check its contents */
2043 dout(0) << " have superblock" << dendl;
11fdf7f2
TL
2044 auto p = sbbl.cbegin();
2045 decode(sb, p);
7c673cae
FG
2046 if (whoami != sb.whoami) {
2047 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2048 << dendl;
20effc67 2049 return -EINVAL;
7c673cae
FG
2050 }
2051 if (fsid != sb.cluster_fsid) {
2052 derr << "provided cluster fsid " << fsid
2053 << " != superblock's " << sb.cluster_fsid << dendl;
20effc67 2054 return -EINVAL;
7c673cae
FG
2055 }
2056 } else {
2057 // create superblock
2058 sb.cluster_fsid = fsid;
2059 sb.osd_fsid = store->get_fsid();
2060 sb.whoami = whoami;
2061 sb.compat_features = get_osd_initial_compat_set();
2062
2063 bufferlist bl;
11fdf7f2 2064 encode(sb, bl);
7c673cae 2065
11fdf7f2
TL
2066 ObjectStore::CollectionHandle ch = store->create_new_collection(
2067 coll_t::meta());
7c673cae
FG
2068 ObjectStore::Transaction t;
2069 t.create_collection(coll_t::meta(), 0);
2070 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
11fdf7f2 2071 ret = store->queue_transaction(ch, std::move(t));
7c673cae
FG
2072 if (ret) {
2073 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
11fdf7f2 2074 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
20effc67 2075 return ret;
7c673cae 2076 }
a4b75251 2077 ch->flush();
7c673cae
FG
2078 }
2079
20effc67 2080 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
7c673cae 2081 if (ret) {
224ce89b
WB
2082 derr << "OSD::mkfs: failed to write fsid file: error "
2083 << cpp_strerror(ret) << dendl;
11fdf7f2 2084 }
7c673cae
FG
2085 return ret;
2086}
2087
e306af50 2088int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
7c673cae
FG
2089{
2090 char val[80];
2091 int r;
2092
2093 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2094 r = store->write_meta("magic", val);
2095 if (r < 0)
2096 return r;
2097
2098 snprintf(val, sizeof(val), "%d", whoami);
2099 r = store->write_meta("whoami", val);
2100 if (r < 0)
2101 return r;
2102
2103 cluster_fsid.print(val);
2104 r = store->write_meta("ceph_fsid", val);
2105 if (r < 0)
2106 return r;
2107
11fdf7f2 2108 string key = cct->_conf.get_val<string>("key");
3efd9988
FG
2109 if (key.size()) {
2110 r = store->write_meta("osd_key", key);
2111 if (r < 0)
2112 return r;
b32b8144 2113 } else {
11fdf7f2 2114 string keyfile = cct->_conf.get_val<string>("keyfile");
b32b8144
FG
2115 if (!keyfile.empty()) {
2116 bufferlist keybl;
2117 string err;
11fdf7f2 2118 r = keybl.read_file(keyfile.c_str(), &err);
b32b8144
FG
2119 if (r < 0) {
2120 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2121 << err << ": " << cpp_strerror(r) << dendl;
2122 return r;
2123 }
2124 r = store->write_meta("osd_key", keybl.to_str());
2125 if (r < 0)
2126 return r;
2127 }
3efd9988 2128 }
e306af50
TL
2129 if (!osdspec_affinity.empty()) {
2130 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2131 if (r < 0)
2132 return r;
2133 }
3efd9988 2134
39ae355f
TL
2135 r = store->write_meta("ceph_version_when_created", pretty_version_to_str());
2136 if (r < 0)
2137 return r;
2138
2139 ostringstream created_at;
2140 utime_t now = ceph_clock_now();
2141 now.gmtime(created_at);
2142 r = store->write_meta("created_at", created_at.str());
2143 if (r < 0)
2144 return r;
2145
7c673cae
FG
2146 r = store->write_meta("ready", "ready");
2147 if (r < 0)
2148 return r;
2149
2150 return 0;
2151}
2152
11fdf7f2
TL
2153int OSD::peek_meta(ObjectStore *store,
2154 std::string *magic,
2155 uuid_d *cluster_fsid,
2156 uuid_d *osd_fsid,
2157 int *whoami,
9f95a23c 2158 ceph_release_t *require_osd_release)
7c673cae
FG
2159{
2160 string val;
2161
2162 int r = store->read_meta("magic", &val);
2163 if (r < 0)
2164 return r;
11fdf7f2 2165 *magic = val;
7c673cae
FG
2166
2167 r = store->read_meta("whoami", &val);
2168 if (r < 0)
2169 return r;
11fdf7f2 2170 *whoami = atoi(val.c_str());
7c673cae
FG
2171
2172 r = store->read_meta("ceph_fsid", &val);
2173 if (r < 0)
2174 return r;
11fdf7f2 2175 r = cluster_fsid->parse(val.c_str());
7c673cae
FG
2176 if (!r)
2177 return -EINVAL;
2178
2179 r = store->read_meta("fsid", &val);
2180 if (r < 0) {
11fdf7f2 2181 *osd_fsid = uuid_d();
7c673cae 2182 } else {
11fdf7f2 2183 r = osd_fsid->parse(val.c_str());
7c673cae
FG
2184 if (!r)
2185 return -EINVAL;
2186 }
2187
11fdf7f2
TL
2188 r = store->read_meta("require_osd_release", &val);
2189 if (r >= 0) {
9f95a23c 2190 *require_osd_release = ceph_release_from_name(val);
11fdf7f2
TL
2191 }
2192
7c673cae
FG
2193 return 0;
2194}
2195
2196
2197#undef dout_prefix
2198#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2199
2200// cons/des
2201
20effc67
TL
2202OSD::OSD(CephContext *cct_,
2203 std::unique_ptr<ObjectStore> store_,
7c673cae
FG
2204 int id,
2205 Messenger *internal_messenger,
2206 Messenger *external_messenger,
2207 Messenger *hb_client_front,
2208 Messenger *hb_client_back,
2209 Messenger *hb_front_serverm,
2210 Messenger *hb_back_serverm,
2211 Messenger *osdc_messenger,
2212 MonClient *mc,
f67539c2
TL
2213 const std::string &dev, const std::string &jdev,
2214 ceph::async::io_context_pool& poolctx) :
7c673cae 2215 Dispatcher(cct_),
7c673cae 2216 tick_timer(cct, osd_lock),
7c673cae 2217 tick_timer_without_osd_lock(cct, tick_timer_lock),
11fdf7f2 2218 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
7c673cae
FG
2219 cluster_messenger(internal_messenger),
2220 client_messenger(external_messenger),
2221 objecter_messenger(osdc_messenger),
2222 monc(mc),
9f95a23c 2223 mgrc(cct_, client_messenger, &mc->monmap),
f67539c2
TL
2224 logger(create_logger()),
2225 recoverystate_perf(create_recoverystate_perf()),
20effc67 2226 store(std::move(store_)),
7c673cae
FG
2227 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2228 clog(log_client.create_channel()),
2229 whoami(id),
2230 dev_path(dev), journal_path(jdev),
31f18b77 2231 store_is_rotational(store->is_rotational()),
7c673cae
FG
2232 trace_endpoint("0.0.0.0", 0, "osd"),
2233 asok_hook(NULL),
11fdf7f2
TL
2234 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2235 "osd_pg_epoch_max_lag_factor")),
7c673cae 2236 osd_compat(get_osd_compat_set()),
7c673cae 2237 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 2238 get_num_op_threads()),
7c673cae
FG
2239 heartbeat_stop(false),
2240 heartbeat_need_update(true),
2241 hb_front_client_messenger(hb_client_front),
2242 hb_back_client_messenger(hb_client_back),
2243 hb_front_server_messenger(hb_front_serverm),
2244 hb_back_server_messenger(hb_back_serverm),
2245 daily_loadavg(0.0),
2246 heartbeat_thread(this),
2247 heartbeat_dispatcher(this),
2248 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2249 cct->_conf->osd_num_op_tracker_shard),
2250 test_ops_hook(NULL),
7c673cae 2251 op_shardedwq(
7c673cae 2252 this,
f67539c2
TL
2253 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2254 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
7c673cae 2255 &osd_op_tp),
7c673cae 2256 last_pg_create_epoch(0),
11fdf7f2 2257 boot_finisher(cct),
7c673cae
FG
2258 up_thru_wanted(0),
2259 requested_full_first(0),
2260 requested_full_last(0),
f67539c2 2261 service(this, poolctx)
7c673cae 2262{
11fdf7f2
TL
2263
2264 if (!gss_ktfile_client.empty()) {
f67539c2
TL
2265 // Assert that we can export the environment variable
2266 /*
11fdf7f2
TL
2267 The default client keytab is used, if it is present and readable,
2268 to automatically obtain initial credentials for GSSAPI client
2269 applications. The principal name of the first entry in the client
2270 keytab is used by default when obtaining initial credentials. The keytab itself is located, in order, via:
2271 1. The KRB5_CLIENT_KTNAME environment variable.
2272 2. The default_client_keytab_name profile variable in [libdefaults].
2273 3. The hardcoded default, DEFCKTNAME.
2274 */
f67539c2 2275 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
11fdf7f2
TL
2276 gss_ktfile_client.c_str(), 1));
2277 ceph_assert(set_result == 0);
2278 }
2279
7c673cae
FG
2280 monc->set_messenger(client_messenger);
2281 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2282 cct->_conf->osd_op_log_threshold);
2283 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2284 cct->_conf->osd_op_history_duration);
2285 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2286 cct->_conf->osd_op_history_slow_op_threshold);
9f95a23c 2287 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
7c673cae
FG
2288#ifdef WITH_BLKIN
2289 std::stringstream ss;
2290 ss << "osd." << whoami;
2291 trace_endpoint.copy_name(ss.str());
2292#endif
11fdf7f2
TL
2293
2294 // initialize shards
2295 num_shards = get_num_op_shards();
2296 for (uint32_t i = 0; i < num_shards; i++) {
2297 OSDShard *one_shard = new OSDShard(
2298 i,
2299 cct,
9f95a23c 2300 this);
11fdf7f2
TL
2301 shards.push_back(one_shard);
2302 }
7c673cae
FG
2303}
2304
2305OSD::~OSD()
2306{
11fdf7f2
TL
2307 while (!shards.empty()) {
2308 delete shards.back();
2309 shards.pop_back();
2310 }
7c673cae
FG
2311 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2312 cct->get_perfcounters_collection()->remove(logger);
2313 delete recoverystate_perf;
2314 delete logger;
7c673cae
FG
2315}
2316
91327a77
AA
2317double OSD::get_tick_interval() const
2318{
2319 // vary +/- 5% to avoid scrub scheduling livelocks
2320 constexpr auto delta = 0.05;
91327a77 2321 return (OSD_TICK_INTERVAL *
11fdf7f2 2322 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
91327a77
AA
2323}
2324
7c673cae
FG
2325void OSD::handle_signal(int signum)
2326{
11fdf7f2 2327 ceph_assert(signum == SIGINT || signum == SIGTERM);
7c673cae
FG
2328 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2329 shutdown();
2330}
2331
2332int OSD::pre_init()
2333{
11fdf7f2 2334 std::lock_guard lock(osd_lock);
7c673cae
FG
2335 if (is_stopping())
2336 return 0;
2337
2338 if (store->test_mount_in_use()) {
2339 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2340 << "currently in use. (Is ceph-osd already running?)" << dendl;
2341 return -EBUSY;
2342 }
2343
11fdf7f2
TL
2344 cct->_conf.add_observer(this);
2345 return 0;
2346}
2347
2348int OSD::set_numa_affinity()
2349{
2350 // storage numa node
2351 int store_node = -1;
2352 store->get_numa_node(&store_node, nullptr, nullptr);
2353 if (store_node >= 0) {
2354 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2355 }
2356
2357 // check network numa node(s)
2358 int front_node = -1, back_node = -1;
2359 string front_iface = pick_iface(
2360 cct,
2361 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2362 string back_iface = pick_iface(
2363 cct,
2364 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2365 int r = get_iface_numa_node(front_iface, &front_node);
92f5a8d4 2366 if (r >= 0 && front_node >= 0) {
11fdf7f2 2367 dout(1) << __func__ << " public network " << front_iface << " numa node "
92f5a8d4 2368 << front_node << dendl;
11fdf7f2 2369 r = get_iface_numa_node(back_iface, &back_node);
92f5a8d4 2370 if (r >= 0 && back_node >= 0) {
11fdf7f2
TL
2371 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2372 << back_node << dendl;
2373 if (front_node == back_node &&
2374 front_node == store_node) {
2375 dout(1) << " objectstore and network numa nodes all match" << dendl;
2376 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2377 numa_node = front_node;
2378 }
92f5a8d4
TL
2379 } else if (front_node != back_node) {
2380 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2381 << dendl;
11fdf7f2
TL
2382 } else {
2383 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2384 << dendl;
2385 }
92f5a8d4
TL
2386 } else if (back_node == -2) {
2387 dout(1) << __func__ << " cluster network " << back_iface
2388 << " ports numa nodes do not match" << dendl;
2389 } else {
2390 derr << __func__ << " unable to identify cluster interface '" << back_iface
2391 << "' numa node: " << cpp_strerror(r) << dendl;
11fdf7f2 2392 }
92f5a8d4
TL
2393 } else if (front_node == -2) {
2394 dout(1) << __func__ << " public network " << front_iface
2395 << " ports numa nodes do not match" << dendl;
11fdf7f2
TL
2396 } else {
2397 derr << __func__ << " unable to identify public interface '" << front_iface
2398 << "' numa node: " << cpp_strerror(r) << dendl;
2399 }
2400 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2401 // this takes precedence over the automagic logic above
2402 numa_node = node;
2403 }
2404 if (numa_node >= 0) {
2405 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2406 if (r < 0) {
2407 dout(1) << __func__ << " unable to determine numa node " << numa_node
2408 << " CPUs" << dendl;
2409 numa_node = -1;
2410 } else {
2411 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2412 << " cpus "
2413 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2414 << dendl;
92f5a8d4 2415 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
11fdf7f2
TL
2416 if (r < 0) {
2417 r = -errno;
2418 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2419 << dendl;
2420 numa_node = -1;
2421 }
2422 }
2423 } else {
2424 dout(1) << __func__ << " not setting numa affinity" << dendl;
2425 }
7c673cae
FG
2426 return 0;
2427}
2428
2429// asok
2430
2431class OSDSocketHook : public AdminSocketHook {
2432 OSD *osd;
2433public:
2434 explicit OSDSocketHook(OSD *o) : osd(o) {}
9f95a23c 2435 int call(std::string_view prefix, const cmdmap_t& cmdmap,
39ae355f 2436 const bufferlist& inbl,
9f95a23c
TL
2437 Formatter *f,
2438 std::ostream& ss,
2439 bufferlist& out) override {
2440 ceph_abort("should use async hook");
2441 }
2442 void call_async(
2443 std::string_view prefix,
2444 const cmdmap_t& cmdmap,
2445 Formatter *f,
2446 const bufferlist& inbl,
2447 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
11fdf7f2 2448 try {
9f95a23c
TL
2449 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2450 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2451 bufferlist empty;
2452 on_finish(-EINVAL, e.what(), empty);
11fdf7f2 2453 }
7c673cae
FG
2454 }
2455};
2456
11fdf7f2
TL
2457std::set<int64_t> OSD::get_mapped_pools()
2458{
2459 std::set<int64_t> pools;
2460 std::vector<spg_t> pgids;
2461 _get_pgids(&pgids);
2462 for (const auto &pgid : pgids) {
2463 pools.insert(pgid.pool());
2464 }
2465 return pools;
2466}
2467
20effc67
TL
2468OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2469 stringstream& ss,
2470 bool only_primary)
2471{
2472 string pgidstr;
2473 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2474 ss << "no pgid specified";
2475 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2476 }
2477
2478 pg_t pgid;
2479 if (!pgid.parse(pgidstr.c_str())) {
2480 ss << "couldn't parse pgid '" << pgidstr << "'";
2481 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2482 }
2483
2484 spg_t pcand;
2485 PGRef pg;
2486 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2487 if (pg->is_primary() || !only_primary) {
2488 return OSD::PGRefOrError{pg, 0};
2489 }
2490
2491 ss << "not primary for pgid " << pgid;
2492 pg->unlock();
2493 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2494 } else {
2495 ss << "i don't have pgid " << pgid;
2496 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2497 }
2498}
2499
2500// note that the cmdmap is explicitly copied into asok_route_to_pg()
2501int OSD::asok_route_to_pg(
2502 bool only_primary,
2503 std::string_view prefix,
2504 cmdmap_t cmdmap,
2505 Formatter* f,
2506 stringstream& ss,
2507 const bufferlist& inbl,
2508 bufferlist& outbl,
2509 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2510{
2511 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2512
2513 if (!target_pg.has_value()) {
2514 // 'ss' and 'ret' already contain the error information
2515 on_finish(ret, ss.str(), outbl);
2516 return ret;
2517 }
2518
2519 // the PG was locked by locate_asok_target()
2520 try {
2521 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2522 (*target_pg)->unlock();
2523 return 0; // the pg handler calls on_finish directly
2524 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2525 (*target_pg)->unlock();
2526 ss << e.what();
2527 on_finish(ret, ss.str(), outbl);
2528 return -EINVAL;
2529 }
2530}
2531
9f95a23c
TL
2532void OSD::asok_command(
2533 std::string_view prefix, const cmdmap_t& cmdmap,
2534 Formatter *f,
2535 const bufferlist& inbl,
2536 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 2537{
9f95a23c
TL
2538 int ret = 0;
2539 stringstream ss; // stderr error message stream
2540 bufferlist outbl; // if empty at end, we'll dump formatter as output
2541
2542 // --- PG commands are routed here to PG::do_command ---
2543 if (prefix == "pg" ||
2544 prefix == "query" ||
2545 prefix == "mark_unfound_lost" ||
2546 prefix == "list_unfound" ||
2547 prefix == "scrub" ||
2548 prefix == "deep_scrub"
2549 ) {
2550 string pgidstr;
2551 pg_t pgid;
2552 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2553 ss << "no pgid specified";
2554 ret = -EINVAL;
2555 goto out;
2556 }
2557 if (!pgid.parse(pgidstr.c_str())) {
2558 ss << "couldn't parse pgid '" << pgidstr << "'";
2559 ret = -EINVAL;
2560 goto out;
2561 }
2562 spg_t pcand;
2563 PGRef pg;
2564 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2565 (pg = _lookup_lock_pg(pcand))) {
2566 if (pg->is_primary()) {
2567 cmdmap_t new_cmdmap = cmdmap;
2568 try {
2569 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2570 pg->unlock();
2571 return; // the pg handler calls on_finish directly
2572 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2573 pg->unlock();
2574 ss << e.what();
2575 ret = -EINVAL;
2576 goto out;
2577 }
2578 } else {
2579 ss << "not primary for pgid " << pgid;
2580 // do not reply; they will get newer maps and realize they
2581 // need to resend.
2582 pg->unlock();
2583 ret = -EAGAIN;
2584 goto out;
2585 }
2586 } else {
2587 ss << "i don't have pgid " << pgid;
2588 ret = -ENOENT;
2589 }
2590 }
2591
20effc67
TL
2592 // --- PG commands that will be answered even if !primary ---
2593
2594 else if (prefix == "scrubdebug") {
2595 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2596 return;
2597 }
2598
9f95a23c
TL
2599 // --- OSD commands follow ---
2600
2601 else if (prefix == "status") {
2602 lock_guard l(osd_lock);
7c673cae
FG
2603 f->open_object_section("status");
2604 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2605 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2606 f->dump_unsigned("whoami", superblock.whoami);
2607 f->dump_string("state", get_state_name(get_state()));
2608 f->dump_unsigned("oldest_map", superblock.oldest_map);
2609 f->dump_unsigned("newest_map", superblock.newest_map);
11fdf7f2 2610 f->dump_unsigned("num_pgs", num_pgs);
7c673cae 2611 f->close_section();
9f95a23c 2612 } else if (prefix == "flush_journal") {
7c673cae 2613 store->flush_journal();
9f95a23c
TL
2614 } else if (prefix == "dump_ops_in_flight" ||
2615 prefix == "ops" ||
2616 prefix == "dump_blocked_ops" ||
2617 prefix == "dump_historic_ops" ||
2618 prefix == "dump_historic_ops_by_duration" ||
2619 prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2620
2621 const string error_str = "op_tracker tracking is not currently enabled, so no ops are being tracked, \
2622not even those that get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2623will start to track new ops received afterwards.";
2624
2625 set<string> filters;
2626 vector<string> filter_str;
9f95a23c 2627 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
c07f9fc5
FG
2628 copy(filter_str.begin(), filter_str.end(),
2629 inserter(filters, filters.end()));
2630 }
2631
9f95a23c
TL
2632 if (prefix == "dump_ops_in_flight" ||
2633 prefix == "ops") {
c07f9fc5
FG
2634 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2635 ss << error_str;
9f95a23c
TL
2636 ret = -EINVAL;
2637 goto out;
c07f9fc5
FG
2638 }
2639 }
9f95a23c 2640 if (prefix == "dump_blocked_ops") {
c07f9fc5
FG
2641 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2642 ss << error_str;
9f95a23c
TL
2643 ret = -EINVAL;
2644 goto out;
c07f9fc5
FG
2645 }
2646 }
9f95a23c 2647 if (prefix == "dump_historic_ops") {
c07f9fc5
FG
2648 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2649 ss << error_str;
9f95a23c
TL
2650 ret = -EINVAL;
2651 goto out;
c07f9fc5
FG
2652 }
2653 }
9f95a23c 2654 if (prefix == "dump_historic_ops_by_duration") {
c07f9fc5
FG
2655 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2656 ss << error_str;
9f95a23c
TL
2657 ret = -EINVAL;
2658 goto out;
c07f9fc5
FG
2659 }
2660 }
9f95a23c 2661 if (prefix == "dump_historic_slow_ops") {
c07f9fc5
FG
2662 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2663 ss << error_str;
9f95a23c
TL
2664 ret = -EINVAL;
2665 goto out;
c07f9fc5 2666 }
7c673cae 2667 }
9f95a23c 2668 } else if (prefix == "dump_op_pq_state") {
7c673cae
FG
2669 f->open_object_section("pq");
2670 op_shardedwq.dump(f);
2671 f->close_section();
f67539c2 2672 } else if (prefix == "dump_blocklist") {
7c673cae 2673 list<pair<entity_addr_t,utime_t> > bl;
33c7a0ef 2674 list<pair<entity_addr_t,utime_t> > rbl;
7c673cae 2675 OSDMapRef curmap = service.get_osdmap();
33c7a0ef 2676 curmap->get_blocklist(&bl, &rbl);
7c673cae 2677
f67539c2 2678 f->open_array_section("blocklist");
7c673cae
FG
2679 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2680 it != bl.end(); ++it) {
224ce89b 2681 f->open_object_section("entry");
7c673cae
FG
2682 f->open_object_section("entity_addr_t");
2683 it->first.dump(f);
2684 f->close_section(); //entity_addr_t
2685 it->second.localtime(f->dump_stream("expire_time"));
2686 f->close_section(); //entry
2687 }
f67539c2 2688 f->close_section(); //blocklist
33c7a0ef
TL
2689 f->open_array_section("range_blocklist");
2690 for (list<pair<entity_addr_t,utime_t> >::iterator it = rbl.begin();
2691 it != rbl.end(); ++it) {
2692 f->open_object_section("entry");
2693 f->open_object_section("entity_addr_t");
2694 it->first.dump(f);
2695 f->close_section(); //entity_addr_t
2696 it->second.localtime(f->dump_stream("expire_time"));
2697 f->close_section(); //entry
2698 }
2699 f->close_section(); //range_blocklist
9f95a23c 2700 } else if (prefix == "dump_watchers") {
7c673cae
FG
2701 list<obj_watch_item_t> watchers;
2702 // scan pg's
11fdf7f2
TL
2703 vector<PGRef> pgs;
2704 _get_pgs(&pgs);
2705 for (auto& pg : pgs) {
2706 list<obj_watch_item_t> pg_watchers;
2707 pg->get_watchers(&pg_watchers);
2708 watchers.splice(watchers.end(), pg_watchers);
7c673cae
FG
2709 }
2710
2711 f->open_array_section("watchers");
2712 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2713 it != watchers.end(); ++it) {
2714
224ce89b 2715 f->open_object_section("watch");
7c673cae
FG
2716
2717 f->dump_string("namespace", it->obj.nspace);
2718 f->dump_string("object", it->obj.oid.name);
2719
2720 f->open_object_section("entity_name");
2721 it->wi.name.dump(f);
2722 f->close_section(); //entity_name_t
2723
224ce89b
WB
2724 f->dump_unsigned("cookie", it->wi.cookie);
2725 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2726
2727 f->open_object_section("entity_addr_t");
2728 it->wi.addr.dump(f);
2729 f->close_section(); //entity_addr_t
2730
2731 f->close_section(); //watch
2732 }
2733
2734 f->close_section(); //watchers
9f95a23c 2735 } else if (prefix == "dump_recovery_reservations") {
7c673cae
FG
2736 f->open_object_section("reservations");
2737 f->open_object_section("local_reservations");
2738 service.local_reserver.dump(f);
2739 f->close_section();
2740 f->open_object_section("remote_reservations");
2741 service.remote_reserver.dump(f);
2742 f->close_section();
2743 f->close_section();
9f95a23c 2744 } else if (prefix == "dump_scrub_reservations") {
eafe8130 2745 f->open_object_section("scrub_reservations");
20effc67 2746 service.get_scrub_services().dump_scrub_reservations(f);
eafe8130 2747 f->close_section();
9f95a23c 2748 } else if (prefix == "get_latest_osdmap") {
7c673cae 2749 get_latest_osdmap();
9f95a23c 2750 } else if (prefix == "set_heap_property") {
7c673cae
FG
2751 string property;
2752 int64_t value = 0;
2753 string error;
2754 bool success = false;
9f95a23c 2755 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2756 error = "unable to get property";
2757 success = false;
9f95a23c 2758 } else if (!cmd_getval(cmdmap, "value", value)) {
7c673cae
FG
2759 error = "unable to get value";
2760 success = false;
2761 } else if (value < 0) {
2762 error = "negative value not allowed";
2763 success = false;
2764 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2765 error = "invalid property";
2766 success = false;
2767 } else {
2768 success = true;
2769 }
2770 f->open_object_section("result");
2771 f->dump_string("error", error);
2772 f->dump_bool("success", success);
2773 f->close_section();
9f95a23c 2774 } else if (prefix == "get_heap_property") {
7c673cae
FG
2775 string property;
2776 size_t value = 0;
2777 string error;
2778 bool success = false;
9f95a23c 2779 if (!cmd_getval(cmdmap, "property", property)) {
7c673cae
FG
2780 error = "unable to get property";
2781 success = false;
2782 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2783 error = "invalid property";
2784 success = false;
2785 } else {
2786 success = true;
2787 }
2788 f->open_object_section("result");
2789 f->dump_string("error", error);
2790 f->dump_bool("success", success);
2791 f->dump_int("value", value);
2792 f->close_section();
9f95a23c 2793 } else if (prefix == "dump_objectstore_kv_stats") {
7c673cae 2794 store->get_db_statistics(f);
9f95a23c 2795 } else if (prefix == "dump_scrubs") {
20effc67 2796 service.get_scrub_services().dump_scrubs(f);
9f95a23c 2797 } else if (prefix == "calc_objectstore_db_histogram") {
7c673cae 2798 store->generate_db_histogram(f);
9f95a23c 2799 } else if (prefix == "flush_store_cache") {
11fdf7f2 2800 store->flush_cache(&ss);
39ae355f
TL
2801 } else if (prefix == "rotate-stored-key") {
2802 store->write_meta("osd_key", inbl.to_str());
9f95a23c 2803 } else if (prefix == "dump_pgstate_history") {
7c673cae 2804 f->open_object_section("pgstate_history");
9f95a23c 2805 f->open_array_section("pgs");
11fdf7f2
TL
2806 vector<PGRef> pgs;
2807 _get_pgs(&pgs);
2808 for (auto& pg : pgs) {
9f95a23c 2809 f->open_object_section("pg");
11fdf7f2 2810 f->dump_stream("pg") << pg->pg_id;
9f95a23c 2811 f->dump_string("currently", pg->get_current_state());
11fdf7f2 2812 pg->dump_pgstate_history(f);
9f95a23c 2813 f->close_section();
7c673cae
FG
2814 }
2815 f->close_section();
9f95a23c
TL
2816 f->close_section();
2817 } else if (prefix == "compact") {
224ce89b
WB
2818 dout(1) << "triggering manual compaction" << dendl;
2819 auto start = ceph::coarse_mono_clock::now();
2820 store->compact();
2821 auto end = ceph::coarse_mono_clock::now();
11fdf7f2 2822 double duration = std::chrono::duration<double>(end-start).count();
f67539c2 2823 dout(1) << "finished manual compaction in "
11fdf7f2 2824 << duration
224ce89b
WB
2825 << " seconds" << dendl;
2826 f->open_object_section("compact_result");
11fdf7f2
TL
2827 f->dump_float("elapsed_time", duration);
2828 f->close_section();
9f95a23c 2829 } else if (prefix == "get_mapped_pools") {
11fdf7f2
TL
2830 f->open_array_section("mapped_pools");
2831 set<int64_t> poollist = get_mapped_pools();
2832 for (auto pool : poollist) {
2833 f->dump_int("pool_id", pool);
2834 }
2835 f->close_section();
9f95a23c 2836 } else if (prefix == "smart") {
11fdf7f2 2837 string devid;
9f95a23c
TL
2838 cmd_getval(cmdmap, "devid", devid);
2839 ostringstream out;
2840 probe_smart(devid, out);
2841 outbl.append(out.str());
2842 } else if (prefix == "list_devices") {
11fdf7f2
TL
2843 set<string> devnames;
2844 store->get_devices(&devnames);
9f95a23c 2845 f->open_array_section("list_devices");
11fdf7f2
TL
2846 for (auto dev : devnames) {
2847 if (dev.find("dm-") == 0) {
2848 continue;
2849 }
9f95a23c
TL
2850 string err;
2851 f->open_object_section("device");
11fdf7f2 2852 f->dump_string("device", "/dev/" + dev);
9f95a23c
TL
2853 f->dump_string("device_id", get_device_id(dev, &err));
2854 f->close_section();
11fdf7f2 2855 }
224ce89b 2856 f->close_section();
9f95a23c
TL
2857 } else if (prefix == "send_beacon") {
2858 lock_guard l(osd_lock);
11fdf7f2
TL
2859 if (is_active()) {
2860 send_beacon(ceph::coarse_mono_clock::now());
2861 }
9f95a23c
TL
2862 }
2863
2864 else if (prefix == "cluster_log") {
2865 vector<string> msg;
2866 cmd_getval(cmdmap, "message", msg);
2867 if (msg.empty()) {
2868 ret = -EINVAL;
2869 ss << "ignoring empty log message";
2870 goto out;
2871 }
2872 string message = msg.front();
2873 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2874 message += " " + *a;
2875 string lvl;
2876 cmd_getval(cmdmap, "level", lvl);
2877 clog_type level = string_to_clog_type(lvl);
2878 if (level < 0) {
2879 ret = -EINVAL;
2880 ss << "unknown level '" << lvl << "'";
2881 goto out;
2882 }
2883 clog->do_log(level, message);
2884 }
2885
2886 else if (prefix == "bench") {
9f95a23c 2887 // default count 1G, size 4MB
20effc67
TL
2888 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2889 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2890 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2891 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
a4b75251 2892 double elapsed = 0.0;
9f95a23c 2893
a4b75251
TL
2894 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
2895 if (ret != 0) {
9f95a23c 2896 goto out;
9f95a23c
TL
2897 }
2898
9f95a23c
TL
2899 double rate = count / elapsed;
2900 double iops = rate / bsize;
2901 f->open_object_section("osd_bench_results");
2902 f->dump_int("bytes_written", count);
2903 f->dump_int("blocksize", bsize);
2904 f->dump_float("elapsed_sec", elapsed);
2905 f->dump_float("bytes_per_sec", rate);
2906 f->dump_float("iops", iops);
2907 f->close_section();
2908 }
2909
2910 else if (prefix == "flush_pg_stats") {
2911 mgrc.send_pgstats();
2912 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2913 }
2914
2915 else if (prefix == "heap") {
2a845540
TL
2916 std::stringstream outss;
2917 ret = ceph::osd_cmds::heap(*cct, cmdmap, outss, ss);
2918 outbl.append(outss);
9f95a23c
TL
2919 }
2920
2921 else if (prefix == "debug dump_missing") {
2922 f->open_array_section("pgs");
2923 vector<PGRef> pgs;
2924 _get_pgs(&pgs);
2925 for (auto& pg : pgs) {
2926 string s = stringify(pg->pg_id);
2927 f->open_array_section(s.c_str());
2928 pg->lock();
2929 pg->dump_missing(f);
2930 pg->unlock();
2931 f->close_section();
2932 }
2933 f->close_section();
2934 }
2935
2936 else if (prefix == "debug kick_recovery_wq") {
2937 int64_t delay;
2938 cmd_getval(cmdmap, "delay", delay);
2939 ostringstream oss;
2940 oss << delay;
2941 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2942 if (ret != 0) {
2943 ss << "kick_recovery_wq: error setting "
2944 << "osd_recovery_delay_start to '" << delay << "': error "
2945 << ret;
2946 goto out;
2947 }
2948 cct->_conf.apply_changes(nullptr);
2949 ss << "kicking recovery queue. set osd_recovery_delay_start "
2950 << "to " << cct->_conf->osd_recovery_delay_start;
2951 }
2952
2953 else if (prefix == "cpu_profiler") {
2954 ostringstream ds;
2955 string arg;
2956 cmd_getval(cmdmap, "arg", arg);
2957 vector<string> argvec;
2958 get_str_vec(arg, argvec);
2959 cpu_profiler_handle_command(argvec, ds);
2960 outbl.append(ds.str());
2961 }
2962
2963 else if (prefix == "dump_pg_recovery_stats") {
2964 lock_guard l(osd_lock);
2965 pg_recovery_stats.dump_formatted(f);
2966 }
2967
2968 else if (prefix == "reset_pg_recovery_stats") {
2969 lock_guard l(osd_lock);
2970 pg_recovery_stats.reset();
2971 }
2972
2973 else if (prefix == "perf histogram dump") {
2974 std::string logger;
2975 std::string counter;
2976 cmd_getval(cmdmap, "logger", logger);
2977 cmd_getval(cmdmap, "counter", counter);
2978 cct->get_perfcounters_collection()->dump_formatted_histograms(
2979 f, false, logger, counter);
2980 }
2981
2982 else if (prefix == "cache drop") {
2983 lock_guard l(osd_lock);
2984 dout(20) << "clearing all caches" << dendl;
2985 // Clear the objectstore's cache - onode and buffer for Bluestore,
2986 // system's pagecache for Filestore
2987 ret = store->flush_cache(&ss);
2988 if (ret < 0) {
2989 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2990 goto out;
2991 }
2992 // Clear the objectcontext cache (per PG)
2993 vector<PGRef> pgs;
2994 _get_pgs(&pgs);
2995 for (auto& pg: pgs) {
2996 pg->clear_cache();
2997 }
2998 }
2999
3000 else if (prefix == "cache status") {
3001 lock_guard l(osd_lock);
3002 int obj_ctx_count = 0;
3003 vector<PGRef> pgs;
3004 _get_pgs(&pgs);
3005 for (auto& pg: pgs) {
3006 obj_ctx_count += pg->get_cache_obj_count();
3007 }
3008 f->open_object_section("cache_status");
3009 f->dump_int("object_ctx", obj_ctx_count);
3010 store->dump_cache_stats(f);
3011 f->close_section();
3012 }
3013
3014 else if (prefix == "scrub_purged_snaps") {
3015 lock_guard l(osd_lock);
3016 scrub_purged_snaps();
3017 }
3018
3019 else if (prefix == "dump_osd_network") {
3020 lock_guard l(osd_lock);
3021 int64_t value = 0;
3022 if (!(cmd_getval(cmdmap, "value", value))) {
3023 // Convert milliseconds to microseconds
3024 value = static_cast<double>(g_conf().get_val<double>(
3025 "mon_warn_on_slow_ping_time")) * 1000;
3026 if (value == 0) {
3027 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
3028 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
3029 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
3030 }
3031 } else {
3032 // Convert user input to microseconds
3033 value *= 1000;
3034 }
3035 if (value < 0) value = 0;
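    // A worked example of the fallback path above (a sketch, assuming the
    // stock defaults of mon_warn_on_slow_ping_time = 0, osd_heartbeat_grace
    // = 20 and mon_warn_on_slow_ping_ratio = 0.05):
    // value = 20 * 1000000 * 0.05 = 1000000 us, i.e. only pings slower than
    // one second are reported.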
3036
3037 struct osd_ping_time_t {
3038 uint32_t pingtime;
3039 int to;
3040 bool back;
3041 std::array<uint32_t,3> times;
3042 std::array<uint32_t,3> min;
3043 std::array<uint32_t,3> max;
3044 uint32_t last;
3045 uint32_t last_update;
3046
3047 bool operator<(const osd_ping_time_t& rhs) const {
3048 if (pingtime < rhs.pingtime)
3049 return true;
3050 if (pingtime > rhs.pingtime)
3051 return false;
3052 if (to < rhs.to)
3053 return true;
3054 if (to > rhs.to)
3055 return false;
3056 return back;
3057 }
3058 };
3059
3060 set<osd_ping_time_t> sorted;
3061 // Get pingtimes under lock and not on the stack
eafe8130
TL
3062 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3063 service.get_hb_pingtime(pingtimes);
3064 for (auto j : *pingtimes) {
3065 if (j.second.last_update == 0)
3066 continue;
3067 osd_ping_time_t item;
3068 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3069 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3070 if (item.pingtime >= value) {
3071 item.to = j.first;
3072 item.times[0] = j.second.back_pingtime[0];
3073 item.times[1] = j.second.back_pingtime[1];
3074 item.times[2] = j.second.back_pingtime[2];
3075 item.min[0] = j.second.back_min[0];
3076 item.min[1] = j.second.back_min[1];
3077 item.min[2] = j.second.back_min[2];
3078 item.max[0] = j.second.back_max[0];
3079 item.max[1] = j.second.back_max[1];
3080 item.max[2] = j.second.back_max[2];
3081 item.last = j.second.back_last;
3082 item.back = true;
3083 item.last_update = j.second.last_update;
3084 sorted.emplace(item);
3085 }
3086 if (j.second.front_last == 0)
3087 continue;
3088 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3089 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3090 if (item.pingtime >= value) {
3091 item.to = j.first;
3092 item.times[0] = j.second.front_pingtime[0];
3093 item.times[1] = j.second.front_pingtime[1];
3094 item.times[2] = j.second.front_pingtime[2];
3095 item.min[0] = j.second.front_min[0];
3096 item.min[1] = j.second.front_min[1];
3097 item.min[2] = j.second.front_min[2];
3098 item.max[0] = j.second.front_max[0];
3099 item.max[1] = j.second.front_max[1];
3100 item.max[2] = j.second.front_max[2];
3101 item.last = j.second.front_last;
3102 item.last_update = j.second.last_update;
3103 item.back = false;
3104 sorted.emplace(item);
3105 }
3106 }
3107 delete pingtimes;
3108 //
3109 // Network ping times (1min 5min 15min)
3110 f->open_object_section("network_ping_times");
3111 f->dump_int("threshold", value / 1000);
3112 f->open_array_section("entries");
3113 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3114 ceph_assert(sitem.pingtime >= value);
3115 f->open_object_section("entry");
3116
3117 const time_t lu(sitem.last_update);
3118 char buffer[26];
3119 string lustr(ctime_r(&lu, buffer));
3120 lustr.pop_back(); // Remove trailing \n
3121 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3122 f->dump_string("last update", lustr);
3123 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3124 f->dump_int("from osd", whoami);
3125 f->dump_int("to osd", sitem.to);
3126 f->dump_string("interface", (sitem.back ? "back" : "front"));
3127 f->open_object_section("average");
3128 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3129 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3130 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3131 f->close_section(); // average
3132 f->open_object_section("min");
3133 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
3134 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
3135 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
3136 f->close_section(); // min
3137 f->open_object_section("max");
3138 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3139 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3140 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3141 f->close_section(); // max
3142 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3143 f->close_section(); // entry
3144 }
3145 f->close_section(); // entries
3146 f->close_section(); // network_ping_times
20effc67
TL
3147 } else if (prefix == "dump_pool_statfs") {
3148 lock_guard l(osd_lock);
3149
3150 int64_t p = 0;
3151 if (!(cmd_getval(cmdmap, "poolid", p))) {
3152 ss << "Error dumping pool statfs: no poolid provided";
3153 ret = -EINVAL;
3154 goto out;
3155 }
3156
3157 store_statfs_t st;
3158 bool per_pool_omap_stats = false;
3159
3160 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3161 if (ret < 0) {
3162 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3163 goto out;
3164 } else {
3165 ss << "dumping pool statfs...";
3166 f->open_object_section("pool_statfs");
3167 f->dump_int("poolid", p);
3168 st.dump(f);
3169 f->close_section();
3170 }
7c673cae 3171 } else {
11fdf7f2 3172 ceph_abort_msg("broken asok registration");
7c673cae 3173 }
9f95a23c
TL
3174
3175 out:
3176 on_finish(ret, ss.str(), outbl);
7c673cae
FG
3177}
3178
a4b75251
TL
3179int OSD::run_osd_bench_test(
3180 int64_t count,
3181 int64_t bsize,
3182 int64_t osize,
3183 int64_t onum,
3184 double *elapsed,
3185 ostream &ss)
3186{
3187 int ret = 0;
39ae355f 3188 srand(time(NULL) % (unsigned long) -1);
a4b75251
TL
3189 uint32_t duration = cct->_conf->osd_bench_duration;
3190
3191 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
3192 // let us limit the block size because the next checks rely on it
3193 // having a sane value. If we allow any block size to be set things
3194 // can still go sideways.
3195 ss << "block 'size' values are capped at "
3196 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
3197 << " a higher value, please adjust 'osd_bench_max_block_size'";
3198 ret = -EINVAL;
3199 return ret;
3200 } else if (bsize < (int64_t) (1 << 20)) {
3201 // entering the realm of small block sizes.
3202 // limit the count to a sane value, assuming a configurable amount of
3203 // IOPS and duration, so that the OSD doesn't get hung up on this,
3204 // preventing timeouts from going off
3205 int64_t max_count =
3206 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
3207 if (count > max_count) {
3208 ss << "'count' values greater than " << max_count
3209 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3210 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
3211 << " for " << duration << " seconds,"
3212 << " can cause ill effects on osd. "
3213 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3214 << " value if you wish to use a higher 'count'.";
3215 ret = -EINVAL;
3216 return ret;
3217 }
3218 } else {
3219 // 1MB block sizes are big enough so that we get more stuff done.
3220 // However, to avoid the osd from getting hung on this and having
3221 // timers being triggered, we are going to limit the count assuming
3222 // a configurable throughput and duration.
3223 // NOTE: max_count is the total amount of bytes that we believe we
3224 // will be able to write during 'duration' for the given
3225 // throughput. The block size hardly impacts this unless it's
3226 // way too big. Given we already check how big the block size
3227 // is, it's safe to assume everything will check out.
3228 int64_t max_count =
3229 cct->_conf->osd_bench_large_size_max_throughput * duration;
3230 if (count > max_count) {
3231 ss << "'count' values greater than " << max_count
3232 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3233 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
3234 << " for " << duration << " seconds,"
3235 << " can cause ill effects on osd. "
3236 << " Please adjust 'osd_bench_large_size_max_throughput'"
3237 << " with a higher value if you wish to use a higher 'count'.";
3238 ret = -EINVAL;
3239 return ret;
3240 }
3241 }
3242
3243 if (osize && bsize > osize) {
3244 bsize = osize;
3245 }
3246
3247 dout(1) << " bench count " << count
3248 << " bsize " << byte_u_t(bsize) << dendl;
3249
3250 ObjectStore::Transaction cleanupt;
3251
3252 if (osize && onum) {
3253 bufferlist bl;
3254 bufferptr bp(osize);
20effc67 3255 memset(bp.c_str(), 'a', bp.length());
a4b75251
TL
3256 bl.push_back(std::move(bp));
3257 bl.rebuild_page_aligned();
3258 for (int i=0; i<onum; ++i) {
3259 char nm[30];
3260 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
3261 object_t oid(nm);
3262 hobject_t soid(sobject_t(oid, 0));
3263 ObjectStore::Transaction t;
3264 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
3265 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3266 cleanupt.remove(coll_t(), ghobject_t(soid));
3267 }
3268 }
3269
a4b75251
TL
3270 {
3271 C_SaferCond waiter;
3272 if (!service.meta_ch->flush_commit(&waiter)) {
3273 waiter.wait();
3274 }
3275 }
3276
39ae355f 3277 bufferlist bl;
a4b75251
TL
3278 utime_t start = ceph_clock_now();
3279 for (int64_t pos = 0; pos < count; pos += bsize) {
3280 char nm[30];
3281 unsigned offset = 0;
39ae355f
TL
3282 bufferptr bp(bsize);
3283 memset(bp.c_str(), rand() & 0xff, bp.length());
3284 bl.push_back(std::move(bp));
3285 bl.rebuild_page_aligned();
a4b75251
TL
3286 if (onum && osize) {
3287 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
3288 offset = rand() % (osize / bsize) * bsize;
3289 } else {
3290 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
3291 }
3292 object_t oid(nm);
3293 hobject_t soid(sobject_t(oid, 0));
3294 ObjectStore::Transaction t;
3295 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
3296 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3297 if (!onum || !osize) {
3298 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3299 }
39ae355f 3300 bl.clear();
a4b75251
TL
3301 }
3302
3303 {
3304 C_SaferCond waiter;
3305 if (!service.meta_ch->flush_commit(&waiter)) {
3306 waiter.wait();
3307 }
3308 }
3309 utime_t end = ceph_clock_now();
3310 *elapsed = end - start;
3311
3312 // clean up
3313 store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
3314 {
3315 C_SaferCond waiter;
3316 if (!service.meta_ch->flush_commit(&waiter)) {
3317 waiter.wait();
3318 }
3319 }
3320
3321 return ret;
3322}
3323
7c673cae
FG
3324class TestOpsSocketHook : public AdminSocketHook {
3325 OSDService *service;
3326 ObjectStore *store;
3327public:
3328 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
9f95a23c 3329 int call(std::string_view command, const cmdmap_t& cmdmap,
39ae355f 3330 const bufferlist&,
9f95a23c
TL
3331 Formatter *f,
3332 std::ostream& errss,
3333 bufferlist& out) override {
3334 int r = 0;
3335 stringstream outss;
11fdf7f2 3336 try {
9f95a23c
TL
3337 test_ops(service, store, command, cmdmap, outss);
3338 out.append(outss);
3339 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3340 errss << e.what();
3341 r = -EINVAL;
11fdf7f2 3342 }
9f95a23c 3343 return r;
7c673cae
FG
3344 }
3345 void test_ops(OSDService *service, ObjectStore *store,
11fdf7f2 3346 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
7c673cae
FG
3347
3348};
3349
3350class OSD::C_Tick : public Context {
3351 OSD *osd;
3352 public:
3353 explicit C_Tick(OSD *o) : osd(o) {}
3354 void finish(int r) override {
3355 osd->tick();
3356 }
3357};
3358
3359class OSD::C_Tick_WithoutOSDLock : public Context {
3360 OSD *osd;
3361 public:
3362 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3363 void finish(int r) override {
3364 osd->tick_without_osd_lock();
3365 }
3366};
3367
3368int OSD::enable_disable_fuse(bool stop)
3369{
3370#ifdef HAVE_LIBFUSE
3371 int r;
3372 string mntpath = cct->_conf->osd_data + "/fuse";
3373 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3374 dout(1) << __func__ << " disabling" << dendl;
3375 fuse_store->stop();
3376 delete fuse_store;
3377 fuse_store = NULL;
3378 r = ::rmdir(mntpath.c_str());
7c673cae 3379 if (r < 0) {
c07f9fc5
FG
3380 r = -errno;
3381 derr << __func__ << " failed to rmdir " << mntpath << ": "
3382 << cpp_strerror(r) << dendl;
7c673cae
FG
3383 return r;
3384 }
3385 return 0;
3386 }
3387 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3388 dout(1) << __func__ << " enabling" << dendl;
3389 r = ::mkdir(mntpath.c_str(), 0700);
3390 if (r < 0)
3391 r = -errno;
3392 if (r < 0 && r != -EEXIST) {
3393 derr << __func__ << " unable to create " << mntpath << ": "
3394 << cpp_strerror(r) << dendl;
3395 return r;
3396 }
20effc67 3397 fuse_store = new FuseStore(store.get(), mntpath);
7c673cae
FG
3398 r = fuse_store->start();
3399 if (r < 0) {
3400 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3401 delete fuse_store;
3402 fuse_store = NULL;
3403 return r;
3404 }
3405 }
3406#endif // HAVE_LIBFUSE
3407 return 0;
3408}
3409
9f95a23c
TL
3410size_t OSD::get_num_cache_shards()
3411{
3412 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3413}
3414
31f18b77
FG
3415int OSD::get_num_op_shards()
3416{
3417 if (cct->_conf->osd_op_num_shards)
3418 return cct->_conf->osd_op_num_shards;
3419 if (store_is_rotational)
3420 return cct->_conf->osd_op_num_shards_hdd;
3421 else
3422 return cct->_conf->osd_op_num_shards_ssd;
3423}
3424
3425int OSD::get_num_op_threads()
3426{
3427 if (cct->_conf->osd_op_num_threads_per_shard)
3428 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3429 if (store_is_rotational)
3430 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3431 else
3432 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3433}
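// A worked example for the two helpers above (hypothetical values; the
// real defaults live in the option tables): an HDD-backed OSD with
// osd_op_num_shards = 0, osd_op_num_shards_hdd = 5 and
// osd_op_num_threads_per_shard_hdd = 1 ends up with 5 shards and
// 5 * 1 = 5 op worker threads.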
3434
c07f9fc5
FG
3435float OSD::get_osd_recovery_sleep()
3436{
3437 if (cct->_conf->osd_recovery_sleep)
3438 return cct->_conf->osd_recovery_sleep;
d2e6a577 3439 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 3440 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577 3441 else if (store_is_rotational && !journal_is_rotational)
11fdf7f2 3442 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
d2e6a577
FG
3443 else
3444 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
3445}
3446
11fdf7f2
TL
3447float OSD::get_osd_delete_sleep()
3448{
3449 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3450 if (osd_delete_sleep > 0)
3451 return osd_delete_sleep;
3452 if (!store_is_rotational && !journal_is_rotational)
3453 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3454 if (store_is_rotational && !journal_is_rotational)
3455 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3456 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3457}
3458
9f95a23c
TL
3459int OSD::get_recovery_max_active()
3460{
3461 if (cct->_conf->osd_recovery_max_active)
3462 return cct->_conf->osd_recovery_max_active;
3463 if (store_is_rotational)
3464 return cct->_conf->osd_recovery_max_active_hdd;
3465 else
3466 return cct->_conf->osd_recovery_max_active_ssd;
3467}
3468
494da23a
TL
3469float OSD::get_osd_snap_trim_sleep()
3470{
3471 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3472 if (osd_snap_trim_sleep > 0)
3473 return osd_snap_trim_sleep;
3474 if (!store_is_rotational && !journal_is_rotational)
3475 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3476 if (store_is_rotational && !journal_is_rotational)
3477 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3478 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3479}
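// The get_osd_*_sleep helpers above all follow the same selection pattern:
// an explicitly configured non-zero sleep wins; otherwise the hdd, ssd or
// hybrid variant is chosen from store_is_rotational and
// journal_is_rotational. For instance (illustrative case), a rotational
// data device paired with a non-rotational journal/DB selects the
// "_hybrid" value.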
3480
7c673cae
FG
3481int OSD::init()
3482{
9f95a23c 3483 OSDMapRef osdmap;
7c673cae 3484 CompatSet initial, diff;
11fdf7f2 3485 std::lock_guard lock(osd_lock);
7c673cae
FG
3486 if (is_stopping())
3487 return 0;
20effc67 3488 tracing::osd::tracer.init("osd");
7c673cae
FG
3489 tick_timer.init();
3490 tick_timer_without_osd_lock.init();
3491 service.recovery_request_timer.init();
11fdf7f2
TL
3492 service.sleep_timer.init();
3493
3494 boot_finisher.start();
3495
3496 {
3497 string val;
3498 store->read_meta("require_osd_release", &val);
9f95a23c 3499 last_require_osd_release = ceph_release_from_name(val);
11fdf7f2 3500 }
7c673cae
FG
3501
3502 // mount.
31f18b77
FG
3503 dout(2) << "init " << dev_path
3504 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3505 << dendl;
d2e6a577 3506 dout(2) << "journal " << journal_path << dendl;
11fdf7f2 3507 ceph_assert(store); // call pre_init() first!
7c673cae 3508
9f95a23c 3509 store->set_cache_shards(get_num_cache_shards());
7c673cae 3510
20effc67
TL
3511 int rotating_auth_attempts = 0;
3512 auto rotating_auth_timeout =
3513 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3514
7c673cae
FG
3515 int r = store->mount();
3516 if (r < 0) {
3517 derr << "OSD:init: unable to mount object store" << dendl;
3518 return r;
3519 }
d2e6a577
FG
3520 journal_is_rotational = store->is_journal_rotational();
3521 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3522 << dendl;
7c673cae
FG
3523
3524 enable_disable_fuse(false);
3525
3526 dout(2) << "boot" << dendl;
3527
11fdf7f2 3528 service.meta_ch = store->open_collection(coll_t::meta());
20effc67
TL
3529 if (!service.meta_ch) {
3530 derr << "OSD:init: unable to open meta collection"
3531 << dendl;
3532 r = -ENOENT;
3533 goto out;
3534 }
7c673cae
FG
3535 // initialize the daily loadavg with current 15min loadavg
3536 double loadavgs[3];
3537 if (getloadavg(loadavgs, 3) == 3) {
3538 daily_loadavg = loadavgs[2];
3539 } else {
3540 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3541 daily_loadavg = 1.0;
3542 }
3543
7c673cae
FG
3544 // sanity check long object name handling
3545 {
3546 hobject_t l;
3547 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3548 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3549 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3550 r = store->validate_hobject_key(l);
3551 if (r < 0) {
3552 derr << "backend (" << store->get_type() << ") is unable to support max "
3553 << "object name[space] len" << dendl;
3554 derr << " osd max object name len = "
3555 << cct->_conf->osd_max_object_name_len << dendl;
3556 derr << " osd max object namespace len = "
3557 << cct->_conf->osd_max_object_namespace_len << dendl;
3558 derr << cpp_strerror(r) << dendl;
3559 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3560 goto out;
3561 }
3562 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3563 << dendl;
3564 } else {
3565 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3566 }
3567 }
3568
3569 // read superblock
3570 r = read_superblock();
3571 if (r < 0) {
3572 derr << "OSD::init() : unable to read osd superblock" << dendl;
3573 r = -EINVAL;
3574 goto out;
3575 }
3576
3577 if (osd_compat.compare(superblock.compat_features) < 0) {
3578 derr << "The disk uses features unsupported by the executable." << dendl;
3579 derr << " ondisk features " << superblock.compat_features << dendl;
3580 derr << " daemon features " << osd_compat << dendl;
3581
3582 if (osd_compat.writeable(superblock.compat_features)) {
3583 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3584 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3585 r = -EOPNOTSUPP;
3586 goto out;
3587 }
3588 else {
3589 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3590 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3591 r = -EOPNOTSUPP;
3592 goto out;
3593 }
3594 }
3595
3596 assert_warn(whoami == superblock.whoami);
3597 if (whoami != superblock.whoami) {
3598 derr << "OSD::init: superblock says osd"
3599 << superblock.whoami << " but I am osd." << whoami << dendl;
3600 r = -EINVAL;
3601 goto out;
3602 }
3603
9f95a23c
TL
3604 startup_time = ceph::mono_clock::now();
3605
11fdf7f2 3606 // load up "current" osdmap
9f95a23c
TL
3607 assert_warn(!get_osdmap());
3608 if (get_osdmap()) {
11fdf7f2
TL
3609 derr << "OSD::init: unable to read current osdmap" << dendl;
3610 r = -EINVAL;
3611 goto out;
3612 }
3613 osdmap = get_map(superblock.current_epoch);
9f95a23c 3614 set_osdmap(osdmap);
11fdf7f2
TL
3615
3616 // make sure we don't have legacy pgs deleting
3617 {
3618 vector<coll_t> ls;
3619 int r = store->list_collections(ls);
3620 ceph_assert(r >= 0);
3621 for (auto c : ls) {
3622 spg_t pgid;
3623 if (c.is_pg(&pgid) &&
3624 !osdmap->have_pg_pool(pgid.pool())) {
3625 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3626 if (!store->exists(service.meta_ch, oid)) {
3627 derr << __func__ << " missing pg_pool_t for deleted pool "
3628 << pgid.pool() << " for pg " << pgid
3629 << "; please downgrade to luminous and allow "
3630 << "pg deletion to complete before upgrading" << dendl;
3631 ceph_abort();
3632 }
3633 }
3634 }
3635 }
3636
7c673cae
FG
3637 initial = get_osd_initial_compat_set();
3638 diff = superblock.compat_features.unsupported(initial);
3639 if (superblock.compat_features.merge(initial)) {
9f95a23c
TL
3640 // Are we adding SNAPMAPPER2?
3641 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3642 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3643 << dendl;
3644 auto ch = service.meta_ch;
3645 auto hoid = make_snapmapper_oid();
3646 unsigned max = cct->_conf->osd_target_transaction_size;
20effc67 3647 r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
9f95a23c
TL
3648 if (r < 0)
3649 goto out;
3650 }
7c673cae
FG
3651 // We need to persist the new compat_set before we
3652 // do anything else
3653 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3654 ObjectStore::Transaction t;
3655 write_superblock(t);
11fdf7f2 3656 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3657 if (r < 0)
3658 goto out;
3659 }
3660
3661 // make sure snap mapper object exists
11fdf7f2 3662 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
7c673cae
FG
3663 dout(10) << "init creating/touching snapmapper object" << dendl;
3664 ObjectStore::Transaction t;
3665 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
11fdf7f2 3666 r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
3667 if (r < 0)
3668 goto out;
3669 }
9f95a23c
TL
3670 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3671 dout(10) << "init creating/touching purged_snaps object" << dendl;
3672 ObjectStore::Transaction t;
3673 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3674 r = store->queue_transaction(service.meta_ch, std::move(t));
3675 if (r < 0)
3676 goto out;
3677 }
7c673cae
FG
3678
3679 if (cct->_conf->osd_open_classes_on_start) {
9f95a23c 3680 int r = ClassHandler::get_instance().open_all_classes();
7c673cae
FG
3681 if (r)
3682 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3683 }
3684
11fdf7f2 3685 check_osdmap_features();
7c673cae 3686
7c673cae
FG
3687 {
3688 epoch_t bind_epoch = osdmap->get_epoch();
3689 service.set_epochs(NULL, NULL, &bind_epoch);
3690 }
3691
3692 clear_temp_objects();
3693
d2e6a577 3694 // initialize osdmap references in sharded wq
11fdf7f2
TL
3695 for (auto& shard : shards) {
3696 std::lock_guard l(shard->osdmap_lock);
3697 shard->shard_osdmap = osdmap;
3698 }
d2e6a577 3699
7c673cae
FG
3700 // load up pgs (as they previously existed)
3701 load_pgs();
3702
3703 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
7c673cae 3704
f67539c2
TL
3705 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3706 dout(2) << "compacting object store's omap" << dendl;
3707 store->compact();
3708 }
7c673cae 3709
11fdf7f2
TL
3710 // prime osd stats
3711 {
3712 struct store_statfs_t stbuf;
3713 osd_alert_list_t alerts;
3714 int r = store->statfs(&stbuf, &alerts);
3715 ceph_assert(r == 0);
3716 service.set_statfs(stbuf, alerts);
3717 }
3718
f67539c2 3719 // client_messenger's auth_client will be set up by monc->init() later.
11fdf7f2
TL
3720 for (auto m : { cluster_messenger,
3721 objecter_messenger,
3722 hb_front_client_messenger,
3723 hb_back_client_messenger,
3724 hb_front_server_messenger,
3725 hb_back_server_messenger } ) {
3726 m->set_auth_client(monc);
3727 }
3728 for (auto m : { client_messenger,
3729 cluster_messenger,
3730 hb_front_server_messenger,
3731 hb_back_server_messenger }) {
3732 m->set_auth_server(monc);
3733 }
3734 monc->set_handle_authentication_dispatcher(this);
7c673cae
FG
3735
3736 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3737 | CEPH_ENTITY_TYPE_MGR);
3738 r = monc->init();
3739 if (r < 0)
3740 goto out;
3741
f67539c2 3742 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
11fdf7f2 3743 mgrc.set_perf_metric_query_cb(
9f95a23c
TL
3744 [this](const ConfigPayload &config_payload) {
3745 set_perf_queries(config_payload);
11fdf7f2 3746 },
9f95a23c
TL
3747 [this] {
3748 return get_perf_reports();
11fdf7f2 3749 });
7c673cae 3750 mgrc.init();
7c673cae
FG
3751
3752 // tell monc about log_client so it will know about mon session resets
3753 monc->set_log_client(&log_client);
3754 update_log_config();
3755
11fdf7f2
TL
3756 // i'm ready!
3757 client_messenger->add_dispatcher_tail(&mgrc);
3758 client_messenger->add_dispatcher_tail(this);
3759 cluster_messenger->add_dispatcher_head(this);
3760
3761 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3762 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3763 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3764 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3765
9f95a23c 3766 objecter_messenger->add_dispatcher_head(service.objecter.get());
11fdf7f2 3767
28e407b8
AA
3768 service.init();
3769 service.publish_map(osdmap);
3770 service.publish_superblock(superblock);
3771 service.max_oldest_map = superblock.oldest_map;
3772
11fdf7f2
TL
3773 for (auto& shard : shards) {
3774 // put PGs in a temporary set because we may modify pg_slots
3775 // unordered_map below.
3776 set<PGRef> pgs;
3777 for (auto& i : shard->pg_slots) {
3778 PGRef pg = i.second->pg;
3779 if (!pg) {
3780 continue;
3781 }
3782 pgs.insert(pg);
3783 }
3784 for (auto pg : pgs) {
9f95a23c 3785 std::scoped_lock l{*pg};
11fdf7f2
TL
3786 set<pair<spg_t,epoch_t>> new_children;
3787 set<pair<spg_t,epoch_t>> merge_pgs;
3788 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3789 &new_children, &merge_pgs);
3790 if (!new_children.empty()) {
3791 for (auto shard : shards) {
3792 shard->prime_splits(osdmap, &new_children);
3793 }
3794 assert(new_children.empty());
3795 }
3796 if (!merge_pgs.empty()) {
3797 for (auto shard : shards) {
3798 shard->prime_merges(osdmap, &merge_pgs);
3799 }
3800 assert(merge_pgs.empty());
3801 }
11fdf7f2
TL
3802 }
3803 }
3804
7c673cae 3805 osd_op_tp.start();
7c673cae 3806
7c673cae
FG
3807 // start the heartbeat
3808 heartbeat_thread.create("osd_srv_heartbt");
3809
3810 // tick
91327a77
AA
3811 tick_timer.add_event_after(get_tick_interval(),
3812 new C_Tick(this));
7c673cae 3813 {
11fdf7f2 3814 std::lock_guard l(tick_timer_lock);
91327a77
AA
3815 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3816 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
3817 }
3818
9f95a23c 3819 osd_lock.unlock();
7c673cae
FG
3820
3821 r = monc->authenticate();
3822 if (r < 0) {
c07f9fc5
FG
3823 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3824 << dendl;
11fdf7f2 3825 exit(1);
7c673cae
FG
3826 }
3827
11fdf7f2 3828 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
7c673cae
FG
3829 derr << "unable to obtain rotating service keys; retrying" << dendl;
3830 ++rotating_auth_attempts;
11fdf7f2 3831 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
39ae355f
TL
3832 derr << __func__ << " wait_auth_rotating timed out"
3833 <<" -- maybe I have a clock skew against the monitors?" << dendl;
11fdf7f2 3834 exit(1);
7c673cae
FG
3835 }
3836 }
3837
3838 r = update_crush_device_class();
3839 if (r < 0) {
d2e6a577
FG
3840 derr << __func__ << " unable to update_crush_device_class: "
3841 << cpp_strerror(r) << dendl;
11fdf7f2 3842 exit(1);
7c673cae
FG
3843 }
3844
3845 r = update_crush_location();
3846 if (r < 0) {
d2e6a577 3847 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 3848 << cpp_strerror(r) << dendl;
11fdf7f2 3849 exit(1);
7c673cae
FG
3850 }
3851
9f95a23c 3852 osd_lock.lock();
7c673cae
FG
3853 if (is_stopping())
3854 return 0;
3855
3856 // start objecter *after* we have authenticated, so that we don't ignore
3857 // the OSDMaps it requests.
3858 service.final_init();
3859
3860 check_config();
3861
3862 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3863 consume_map();
7c673cae
FG
3864
3865 dout(0) << "done with init, starting boot process" << dendl;
3866
3867 // subscribe to any pg creations
3868 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3869
3870 // MgrClient needs this (it doesn't have MonClient reference itself)
3871 monc->sub_want("mgrmap", 0, 0);
3872
3873 // we don't need to ask for an osdmap here; objecter will
3874 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3875
3876 monc->renew_subs();
3877
3878 start_boot();
3879
a4b75251 3880 // Override a few options if mclock scheduler is enabled.
39ae355f 3881 maybe_override_sleep_options_for_qos();
a4b75251 3882 maybe_override_options_for_qos();
39ae355f 3883 maybe_override_max_osd_capacity_for_qos();
a4b75251 3884
7c673cae 3885 return 0;
7c673cae
FG
3886
3887out:
3888 enable_disable_fuse(true);
3889 store->umount();
20effc67 3890 store.reset();
7c673cae
FG
3891 return r;
3892}
3893
3894void OSD::final_init()
3895{
3896 AdminSocket *admin_socket = cct->get_admin_socket();
3897 asok_hook = new OSDSocketHook(this);
9f95a23c 3898 int r = admin_socket->register_command("status", asok_hook,
7c673cae 3899 "high-level status of OSD");
11fdf7f2 3900 ceph_assert(r == 0);
9f95a23c 3901 r = admin_socket->register_command("flush_journal",
7c673cae
FG
3902 asok_hook,
3903 "flush the journal to permanent store");
11fdf7f2 3904 ceph_assert(r == 0);
9f95a23c 3905 r = admin_socket->register_command("dump_ops_in_flight " \
c07f9fc5
FG
3906 "name=filterstr,type=CephString,n=N,req=false",
3907 asok_hook,
7c673cae 3908 "show the ops currently in flight");
11fdf7f2 3909 ceph_assert(r == 0);
9f95a23c 3910 r = admin_socket->register_command("ops " \
c07f9fc5
FG
3911 "name=filterstr,type=CephString,n=N,req=false",
3912 asok_hook,
7c673cae 3913 "show the ops currently in flight");
11fdf7f2 3914 ceph_assert(r == 0);
9f95a23c 3915 r = admin_socket->register_command("dump_blocked_ops " \
c07f9fc5
FG
3916 "name=filterstr,type=CephString,n=N,req=false",
3917 asok_hook,
7c673cae 3918 "show the blocked ops currently in flight");
11fdf7f2 3919 ceph_assert(r == 0);
9f95a23c 3920 r = admin_socket->register_command("dump_historic_ops " \
c07f9fc5 3921 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3922 asok_hook,
3923 "show recent ops");
11fdf7f2 3924 ceph_assert(r == 0);
9f95a23c 3925 r = admin_socket->register_command("dump_historic_slow_ops " \
c07f9fc5 3926 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3927 asok_hook,
3928 "show slowest recent ops");
11fdf7f2 3929 ceph_assert(r == 0);
9f95a23c 3930 r = admin_socket->register_command("dump_historic_ops_by_duration " \
c07f9fc5 3931 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
3932 asok_hook,
3933 "show slowest recent ops, sorted by duration");
11fdf7f2 3934 ceph_assert(r == 0);
9f95a23c 3935 r = admin_socket->register_command("dump_op_pq_state",
7c673cae 3936 asok_hook,
20effc67 3937 "dump op queue state");
11fdf7f2 3938 ceph_assert(r == 0);
f67539c2 3939 r = admin_socket->register_command("dump_blocklist",
7c673cae 3940 asok_hook,
f67539c2 3941 "dump blocklisted clients and times");
11fdf7f2 3942 ceph_assert(r == 0);
9f95a23c 3943 r = admin_socket->register_command("dump_watchers",
7c673cae
FG
3944 asok_hook,
3945 "show clients which have active watches,"
3946 " and on which objects");
11fdf7f2 3947 ceph_assert(r == 0);
9f95a23c 3948 r = admin_socket->register_command("dump_recovery_reservations",
7c673cae
FG
3949 asok_hook,
3950 "show recovery reservations");
11fdf7f2 3951 ceph_assert(r == 0);
9f95a23c 3952 r = admin_socket->register_command("dump_scrub_reservations",
eafe8130 3953 asok_hook,
f6b5b4d7 3954 "show scrub reservations");
eafe8130 3955 ceph_assert(r == 0);
9f95a23c 3956 r = admin_socket->register_command("get_latest_osdmap",
7c673cae
FG
3957 asok_hook,
3958 "force osd to update the latest map from "
3959 "the mon");
11fdf7f2 3960 ceph_assert(r == 0);
7c673cae 3961
9f95a23c 3962 r = admin_socket->register_command("set_heap_property " \
7c673cae
FG
3963 "name=property,type=CephString " \
3964 "name=value,type=CephInt",
3965 asok_hook,
3966 "update malloc extension heap property");
11fdf7f2 3967 ceph_assert(r == 0);
7c673cae 3968
9f95a23c 3969 r = admin_socket->register_command("get_heap_property " \
7c673cae
FG
3970 "name=property,type=CephString",
3971 asok_hook,
3972 "get malloc extension heap property");
11fdf7f2 3973 ceph_assert(r == 0);
7c673cae
FG
3974
3975 r = admin_socket->register_command("dump_objectstore_kv_stats",
7c673cae
FG
3976 asok_hook,
3977 "print statistics of kvdb which used by bluestore");
11fdf7f2 3978 ceph_assert(r == 0);
7c673cae
FG
3979
3980 r = admin_socket->register_command("dump_scrubs",
7c673cae
FG
3981 asok_hook,
3982 "print scheduled scrubs");
11fdf7f2 3983 ceph_assert(r == 0);
7c673cae
FG
3984
3985 r = admin_socket->register_command("calc_objectstore_db_histogram",
7c673cae
FG
3986 asok_hook,
3987 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
11fdf7f2 3988 ceph_assert(r == 0);
7c673cae
FG
3989
3990 r = admin_socket->register_command("flush_store_cache",
7c673cae
FG
3991 asok_hook,
3992 "Flush bluestore internal cache");
11fdf7f2 3993 ceph_assert(r == 0);
39ae355f
TL
3994 r = admin_socket->register_command("rotate-stored-key",
3995 asok_hook,
3996 "Update the stored osd_key");
3997 ceph_assert(r == 0);
9f95a23c 3998 r = admin_socket->register_command("dump_pgstate_history",
7c673cae
FG
3999 asok_hook,
4000 "show recent state history");
11fdf7f2 4001 ceph_assert(r == 0);
7c673cae 4002
9f95a23c 4003 r = admin_socket->register_command("compact",
224ce89b
WB
4004 asok_hook,
4005 "Commpact object store's omap."
4006 " WARNING: Compaction probably slows your requests");
11fdf7f2
TL
4007 ceph_assert(r == 0);
4008
9f95a23c 4009 r = admin_socket->register_command("get_mapped_pools",
11fdf7f2
TL
4010 asok_hook,
4011 "dump pools whose PG(s) are mapped to this OSD.");
4012
4013 ceph_assert(r == 0);
4014
9f95a23c 4015 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
11fdf7f2
TL
4016 asok_hook,
4017 "probe OSD devices for SMART data.");
4018
4019 ceph_assert(r == 0);
4020
9f95a23c 4021 r = admin_socket->register_command("list_devices",
11fdf7f2
TL
4022 asok_hook,
4023 "list OSD devices.");
9f95a23c 4024 r = admin_socket->register_command("send_beacon",
11fdf7f2
TL
4025 asok_hook,
4026 "send OSD beacon to mon immediately");
224ce89b 4027
9f95a23c
TL
4028 r = admin_socket->register_command(
4029 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
4030 "Dump osd heartbeat network ping times");
eafe8130
TL
4031 ceph_assert(r == 0);
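// Hedged example (the threshold is illustrative): something like
//   ceph daemon osd.0 dump_osd_network 1000
// is expected to dump only heartbeat peers whose ping times exceed the
// given threshold (in milliseconds).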
4032
20effc67
TL
4033 r = admin_socket->register_command(
4034 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
4035 "Dump store's statistics for the given pool");
4036 ceph_assert(r == 0);
4037
4038 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
7c673cae
FG
4039 // Note: pools are CephString instead of CephPoolname because
4040 // these commands traditionally support both pool names and numbers
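// As a hedged example (pool/object names are made up), either form should
// be accepted through the admin socket:
//   ceph daemon osd.0 setomapval mypool myobject mykey myval
//   ceph daemon osd.0 setomapval 2 myobject mykey myval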
4041 r = admin_socket->register_command(
7c673cae
FG
4042 "setomapval " \
4043 "name=pool,type=CephString " \
4044 "name=objname,type=CephObjectname " \
4045 "name=key,type=CephString "\
4046 "name=val,type=CephString",
4047 test_ops_hook,
4048 "set omap key");
11fdf7f2 4049 ceph_assert(r == 0);
7c673cae 4050 r = admin_socket->register_command(
7c673cae
FG
4051 "rmomapkey " \
4052 "name=pool,type=CephString " \
4053 "name=objname,type=CephObjectname " \
4054 "name=key,type=CephString",
4055 test_ops_hook,
4056 "remove omap key");
11fdf7f2 4057 ceph_assert(r == 0);
7c673cae 4058 r = admin_socket->register_command(
7c673cae
FG
4059 "setomapheader " \
4060 "name=pool,type=CephString " \
4061 "name=objname,type=CephObjectname " \
4062 "name=header,type=CephString",
4063 test_ops_hook,
4064 "set omap header");
11fdf7f2 4065 ceph_assert(r == 0);
7c673cae
FG
4066
4067 r = admin_socket->register_command(
7c673cae
FG
4068 "getomap " \
4069 "name=pool,type=CephString " \
4070 "name=objname,type=CephObjectname",
4071 test_ops_hook,
4072 "output entire object map");
11fdf7f2 4073 ceph_assert(r == 0);
7c673cae
FG
4074
4075 r = admin_socket->register_command(
7c673cae
FG
4076 "truncobj " \
4077 "name=pool,type=CephString " \
4078 "name=objname,type=CephObjectname " \
4079 "name=len,type=CephInt",
4080 test_ops_hook,
4081 "truncate object to length");
11fdf7f2 4082 ceph_assert(r == 0);
7c673cae
FG
4083
4084 r = admin_socket->register_command(
7c673cae
FG
4085 "injectdataerr " \
4086 "name=pool,type=CephString " \
4087 "name=objname,type=CephObjectname " \
4088 "name=shardid,type=CephInt,req=false,range=0|255",
4089 test_ops_hook,
4090 "inject data error to an object");
11fdf7f2 4091 ceph_assert(r == 0);
7c673cae
FG
4092
4093 r = admin_socket->register_command(
7c673cae
FG
4094 "injectmdataerr " \
4095 "name=pool,type=CephString " \
4096 "name=objname,type=CephObjectname " \
4097 "name=shardid,type=CephInt,req=false,range=0|255",
4098 test_ops_hook,
4099 "inject metadata error to an object");
11fdf7f2 4100 ceph_assert(r == 0);
7c673cae 4101 r = admin_socket->register_command(
7c673cae
FG
4102 "set_recovery_delay " \
4103 "name=utime,type=CephInt,req=false",
4104 test_ops_hook,
4105 "Delay osd recovery by specified seconds");
11fdf7f2 4106 ceph_assert(r == 0);
7c673cae 4107 r = admin_socket->register_command(
7c673cae
FG
4108 "injectfull " \
4109 "name=type,type=CephString,req=false " \
4110 "name=count,type=CephInt,req=false ",
4111 test_ops_hook,
4112 "Inject a full disk (optional count times)");
11fdf7f2 4113 ceph_assert(r == 0);
9f95a23c
TL
4114 r = admin_socket->register_command(
4115 "bench " \
4116 "name=count,type=CephInt,req=false " \
4117 "name=size,type=CephInt,req=false " \
4118 "name=object_size,type=CephInt,req=false " \
4119 "name=object_num,type=CephInt,req=false ",
4120 asok_hook,
4121 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4122 "(default count=1G default size=4MB). Results in log.");
4123 ceph_assert(r == 0);
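// A hedged usage sketch (count and size are illustrative): the bench
// command can be driven through the tell/admin socket interface, e.g.
//   ceph tell osd.0 bench 1073741824 4194304
// which writes 1 GiB in 4 MiB blocks through the bench path above and
// reports the measured rate (see the help text above).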
4124 r = admin_socket->register_command(
4125 "cluster_log " \
4126 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4127 "name=message,type=CephString,n=N",
4128 asok_hook,
4129 "log a message to the cluster log");
4130 ceph_assert(r == 0);
4131 r = admin_socket->register_command(
4132 "flush_pg_stats",
4133 asok_hook,
4134 "flush pg stats");
4135 ceph_assert(r == 0);
4136 r = admin_socket->register_command(
4137 "heap " \
4138 "name=heapcmd,type=CephChoices,strings=" \
4139 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4140 "name=value,type=CephString,req=false",
4141 asok_hook,
4142 "show heap usage info (available only if compiled with tcmalloc)");
4143 ceph_assert(r == 0);
4144 r = admin_socket->register_command(
4145 "debug dump_missing " \
4146 "name=filename,type=CephFilepath",
4147 asok_hook,
4148 "dump missing objects to a named file");
4149 ceph_assert(r == 0);
4150 r = admin_socket->register_command(
4151 "debug kick_recovery_wq " \
4152 "name=delay,type=CephInt,range=0",
4153 asok_hook,
4154 "set osd_recovery_delay_start to <val>");
4155 ceph_assert(r == 0);
4156 r = admin_socket->register_command(
4157 "cpu_profiler " \
4158 "name=arg,type=CephChoices,strings=status|flush",
4159 asok_hook,
4160 "run cpu profiling on daemon");
4161 ceph_assert(r == 0);
4162 r = admin_socket->register_command(
4163 "dump_pg_recovery_stats",
4164 asok_hook,
4165 "dump pg recovery statistics");
4166 ceph_assert(r == 0);
4167 r = admin_socket->register_command(
4168 "reset_pg_recovery_stats",
4169 asok_hook,
4170 "reset pg recovery statistics");
4171 ceph_assert(r == 0);
4172 r = admin_socket->register_command(
4173 "cache drop",
4174 asok_hook,
4175 "Drop all OSD caches");
4176 ceph_assert(r == 0);
4177 r = admin_socket->register_command(
4178 "cache status",
4179 asok_hook,
4180 "Get OSD caches statistics");
4181 ceph_assert(r == 0);
4182 r = admin_socket->register_command(
4183 "scrub_purged_snaps",
4184 asok_hook,
4185 "Scrub purged_snaps vs snapmapper index");
4186 ceph_assert(r == 0);
20effc67
TL
4187 r = admin_socket->register_command(
4188 "scrubdebug " \
4189 "name=pgid,type=CephPgid " \
4190 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4191 "name=value,type=CephString,req=false",
4192 asok_hook,
4193 "debug the scrubber");
4194 ceph_assert(r == 0);
7c673cae 4195
9f95a23c
TL
4196 // -- pg commands --
4197 // old form: ceph pg <pgid> command ...
4198 r = admin_socket->register_command(
4199 "pg " \
4200 "name=pgid,type=CephPgid " \
4201 "name=cmd,type=CephChoices,strings=query",
4202 asok_hook,
4203 "");
4204 ceph_assert(r == 0);
4205 r = admin_socket->register_command(
4206 "pg " \
4207 "name=pgid,type=CephPgid " \
4208 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4209 "name=mulcmd,type=CephChoices,strings=revert|delete",
4210 asok_hook,
4211 "");
4212 ceph_assert(r == 0);
4213 r = admin_socket->register_command(
4214 "pg " \
4215 "name=pgid,type=CephPgid " \
4216 "name=cmd,type=CephChoices,strings=list_unfound " \
4217 "name=offset,type=CephString,req=false",
4218 asok_hook,
4219 "");
4220 ceph_assert(r == 0);
4221 r = admin_socket->register_command(
4222 "pg " \
4223 "name=pgid,type=CephPgid " \
4224 "name=cmd,type=CephChoices,strings=scrub " \
4225 "name=time,type=CephInt,req=false",
4226 asok_hook,
4227 "");
4228 ceph_assert(r == 0);
4229 r = admin_socket->register_command(
4230 "pg " \
4231 "name=pgid,type=CephPgid " \
4232 "name=cmd,type=CephChoices,strings=deep_scrub " \
4233 "name=time,type=CephInt,req=false",
4234 asok_hook,
4235 "");
4236 ceph_assert(r == 0);
4237 // new form: tell <pgid> <cmd> for both cli and rest
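// e.g. the same query can be issued either way (the pgid is illustrative):
//   ceph pg 1.0 query     // old form
//   ceph tell 1.0 query   // new form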
4238 r = admin_socket->register_command(
4239 "query",
4240 asok_hook,
4241 "show details of a specific pg");
4242 ceph_assert(r == 0);
4243 r = admin_socket->register_command(
4244 "mark_unfound_lost " \
4245 "name=pgid,type=CephPgid,req=false " \
4246 "name=mulcmd,type=CephChoices,strings=revert|delete",
4247 asok_hook,
4248 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4249 ceph_assert(r == 0);
4250 r = admin_socket->register_command(
4251 "list_unfound " \
4252 "name=pgid,type=CephPgid,req=false " \
4253 "name=offset,type=CephString,req=false",
4254 asok_hook,
4255 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4256 ceph_assert(r == 0);
4257 r = admin_socket->register_command(
4258 "scrub " \
4259 "name=pgid,type=CephPgid,req=false " \
4260 "name=time,type=CephInt,req=false",
4261 asok_hook,
4262 "Trigger a scheduled scrub ");
4263 ceph_assert(r == 0);
4264 r = admin_socket->register_command(
4265 "deep_scrub " \
4266 "name=pgid,type=CephPgid,req=false " \
4267 "name=time,type=CephInt,req=false",
4268 asok_hook,
4269 "Trigger a scheduled deep scrub ");
4270 ceph_assert(r == 0);
4271}
7c673cae 4272
f67539c2 4273PerfCounters* OSD::create_logger()
9f95a23c 4274{
f67539c2 4275 PerfCounters* logger = build_osd_logger(cct);
7c673cae 4276 cct->get_perfcounters_collection()->add(logger);
f67539c2 4277 return logger;
7c673cae
FG
4278}
4279
f67539c2 4280PerfCounters* OSD::create_recoverystate_perf()
7c673cae 4281{
f67539c2 4282 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
7c673cae 4283 cct->get_perfcounters_collection()->add(recoverystate_perf);
f67539c2 4284 return recoverystate_perf;
7c673cae
FG
4285}
4286
4287int OSD::shutdown()
4288{
1d09f67e
TL
4289 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4290 //cct->_conf->osd_fast_shutdown = true;
4291
4292 dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
4293 << cct->_conf->osd_fast_shutdown
4294 << ", null-fm = " << store->has_null_manager() << dendl;
4295
4296 utime_t start_time_func = ceph_clock_now();
4297
92f5a8d4
TL
4298 if (cct->_conf->osd_fast_shutdown) {
4299 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
f67539c2
TL
4300 if (cct->_conf->osd_fast_shutdown_notify_mon)
4301 service.prepare_to_stop();
92f5a8d4 4302
1d09f67e
TL
4303 // There is no state we need to keep when running in NULL-FM mode
4304 if (!store->has_null_manager()) {
4305 cct->_log->flush();
4306 _exit(0);
4307 }
4308 } else if (!service.prepare_to_stop()) {
7c673cae 4309 return 0; // already shutting down
1d09f67e
TL
4310 }
4311
9f95a23c 4312 osd_lock.lock();
7c673cae 4313 if (is_stopping()) {
9f95a23c 4314 osd_lock.unlock();
7c673cae
FG
4315 return 0;
4316 }
7c673cae 4317
1d09f67e
TL
4318 if (!cct->_conf->osd_fast_shutdown) {
4319 dout(0) << "shutdown" << dendl;
4320 }
4321
4322 // don't accept new tasks for this OSD
7c673cae
FG
4323 set_state(STATE_STOPPING);
4324
1d09f67e
TL
4325 // Debugging is disabled during fast-shutdown
4326 if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
11fdf7f2
TL
4327 cct->_conf.set_val("debug_osd", "100");
4328 cct->_conf.set_val("debug_journal", "100");
4329 cct->_conf.set_val("debug_filestore", "100");
4330 cct->_conf.set_val("debug_bluestore", "100");
4331 cct->_conf.set_val("debug_ms", "100");
4332 cct->_conf.apply_changes(nullptr);
3efd9988 4333 }
7c673cae 4334
39ae355f
TL
4335 // stop MgrClient earlier as it's more like an internal consumer of OSD
4336 //
4337 // should occur before unmounting the database in fast-shutdown to avoid
4338 // a race condition (see https://tracker.ceph.com/issues/56101)
4339 mgrc.shutdown();
4340
1d09f67e
TL
4341 if (cct->_conf->osd_fast_shutdown) {
4342 // first, stop new tasks from being taken from op_shardedwq
4343 // and clear all pending tasks
4344 op_shardedwq.stop_for_fast_shutdown();
4345
4346 utime_t start_time_timer = ceph_clock_now();
4347 tick_timer.shutdown();
4348 {
4349 std::lock_guard l(tick_timer_lock);
4350 tick_timer_without_osd_lock.shutdown();
4351 }
4352
4353 osd_lock.unlock();
4354 utime_t start_time_osd_drain = ceph_clock_now();
4355
4356 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4357 osd_op_tp.drain();
4358 osd_op_tp.stop();
4359
4360 utime_t start_time_umount = ceph_clock_now();
4361 store->prepare_for_fast_shutdown();
4362 std::lock_guard lock(osd_lock);
4363 // TBD: assert in allocator that nothing is being added
4364 store->umount();
4365
4366 utime_t end_time = ceph_clock_now();
4367 if (cct->_conf->osd_fast_shutdown_timeout) {
4368 ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4369 }
4370 dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
4371 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
4372 dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
4373 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
4374 cct->_log->flush();
4375
4376 // now it is safe to exit
4377 _exit(0);
4378 }
4379
7c673cae
FG
4380 service.start_shutdown();
4381
4382 // stop sending work to pgs. this just prevents any new work in _process
4383 // from racing with on_shutdown and potentially entering the pg after.
4384 op_shardedwq.drain();
4385
4386 // Shutdown PGs
4387 {
11fdf7f2
TL
4388 vector<PGRef> pgs;
4389 _get_pgs(&pgs);
4390 for (auto pg : pgs) {
4391 pg->shutdown();
7c673cae
FG
4392 }
4393 }
7c673cae
FG
4394
4395 // drain op queue again (in case PGs requeued something)
4396 op_shardedwq.drain();
4397 {
4398 finished.clear(); // zap waiters (bleh, this is messy)
11fdf7f2 4399 waiting_for_osdmap.clear();
7c673cae
FG
4400 }
4401
7c673cae 4402 // unregister commands
11fdf7f2 4403 cct->get_admin_socket()->unregister_commands(asok_hook);
7c673cae
FG
4404 delete asok_hook;
4405 asok_hook = NULL;
4406
11fdf7f2 4407 cct->get_admin_socket()->unregister_commands(test_ops_hook);
7c673cae
FG
4408 delete test_ops_hook;
4409 test_ops_hook = NULL;
4410
9f95a23c 4411 osd_lock.unlock();
7c673cae 4412
9f95a23c
TL
4413 {
4414 std::lock_guard l{heartbeat_lock};
4415 heartbeat_stop = true;
4416 heartbeat_cond.notify_all();
4417 heartbeat_peers.clear();
4418 }
7c673cae
FG
4419 heartbeat_thread.join();
4420
9f95a23c
TL
4421 hb_back_server_messenger->mark_down_all();
4422 hb_front_server_messenger->mark_down_all();
4423 hb_front_client_messenger->mark_down_all();
4424 hb_back_client_messenger->mark_down_all();
4425
7c673cae
FG
4426 osd_op_tp.drain();
4427 osd_op_tp.stop();
4428 dout(10) << "op sharded tp stopped" << dendl;
4429
7c673cae
FG
4430 dout(10) << "stopping agent" << dendl;
4431 service.agent_stop();
4432
11fdf7f2
TL
4433 boot_finisher.wait_for_empty();
4434
9f95a23c 4435 osd_lock.lock();
7c673cae 4436
11fdf7f2 4437 boot_finisher.stop();
494da23a 4438 reset_heartbeat_peers(true);
7c673cae
FG
4439
4440 tick_timer.shutdown();
4441
4442 {
11fdf7f2 4443 std::lock_guard l(tick_timer_lock);
7c673cae
FG
4444 tick_timer_without_osd_lock.shutdown();
4445 }
4446
4447 // note unmount epoch
9f95a23c 4448 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
7c673cae 4449 superblock.mounted = service.get_boot_epoch();
9f95a23c 4450 superblock.clean_thru = get_osdmap_epoch();
7c673cae
FG
4451 ObjectStore::Transaction t;
4452 write_superblock(t);
11fdf7f2 4453 int r = store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4454 if (r) {
4455 derr << "OSD::shutdown: error writing superblock: "
4456 << cpp_strerror(r) << dendl;
4457 }
4458
4459
31f18b77
FG
4460 service.shutdown_reserver();
4461
7c673cae
FG
4462 // Remove PGs
4463#ifdef PG_DEBUG_REFS
4464 service.dump_live_pgids();
4465#endif
11fdf7f2
TL
4466 while (true) {
4467 vector<PGRef> pgs;
4468 _get_pgs(&pgs, true);
4469 if (pgs.empty()) {
4470 break;
4471 }
4472 for (auto& pg : pgs) {
4473 if (pg->is_deleted()) {
4474 continue;
4475 }
4476 dout(20) << " kicking pg " << pg << dendl;
4477 pg->lock();
4478 if (pg->get_num_ref() != 1) {
4479 derr << "pgid " << pg->get_pgid() << " has ref count of "
4480 << pg->get_num_ref() << dendl;
7c673cae 4481#ifdef PG_DEBUG_REFS
11fdf7f2 4482 pg->dump_live_ids();
7c673cae 4483#endif
31f18b77
FG
4484 if (cct->_conf->osd_shutdown_pgref_assert) {
4485 ceph_abort();
4486 }
7c673cae 4487 }
11fdf7f2
TL
4488 pg->ch.reset();
4489 pg->unlock();
7c673cae 4490 }
7c673cae
FG
4491 }
4492#ifdef PG_DEBUG_REFS
4493 service.dump_live_pgids();
4494#endif
f64942e4 4495
9f95a23c 4496 osd_lock.unlock();
11fdf7f2 4497 cct->_conf.remove_observer(this);
9f95a23c 4498 osd_lock.lock();
7c673cae 4499
11fdf7f2
TL
4500 service.meta_ch.reset();
4501
7c673cae
FG
4502 dout(10) << "syncing store" << dendl;
4503 enable_disable_fuse(true);
4504
4505 if (cct->_conf->osd_journal_flush_on_shutdown) {
4506 dout(10) << "flushing journal" << dendl;
4507 store->flush_journal();
4508 }
4509
7c673cae 4510 monc->shutdown();
9f95a23c
TL
4511 osd_lock.unlock();
4512 {
4513 std::unique_lock l{map_lock};
4514 set_osdmap(OSDMapRef());
4515 }
11fdf7f2
TL
4516 for (auto s : shards) {
4517 std::lock_guard l(s->osdmap_lock);
4518 s->shard_osdmap = OSDMapRef();
4519 }
7c673cae 4520 service.shutdown();
11fdf7f2
TL
4521
4522 std::lock_guard lock(osd_lock);
4523 store->umount();
20effc67 4524 store.reset();
11fdf7f2
TL
4525 dout(10) << "Store synced" << dendl;
4526
7c673cae
FG
4527 op_tracker.on_shutdown();
4528
9f95a23c 4529 ClassHandler::get_instance().shutdown();
7c673cae
FG
4530 client_messenger->shutdown();
4531 cluster_messenger->shutdown();
4532 hb_front_client_messenger->shutdown();
4533 hb_back_client_messenger->shutdown();
4534 objecter_messenger->shutdown();
4535 hb_front_server_messenger->shutdown();
4536 hb_back_server_messenger->shutdown();
4537
1d09f67e
TL
4538 utime_t duration = ceph_clock_now() - start_time_func;
4539 dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
4540
20effc67
TL
4541 tracing::osd::tracer.shutdown();
4542
7c673cae
FG
4543 return r;
4544}
4545
4546int OSD::mon_cmd_maybe_osd_create(string &cmd)
4547{
4548 bool created = false;
4549 while (true) {
4550 dout(10) << __func__ << " cmd: " << cmd << dendl;
4551 vector<string> vcmd{cmd};
4552 bufferlist inbl;
4553 C_SaferCond w;
4554 string outs;
4555 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4556 int r = w.wait();
4557 if (r < 0) {
4558 if (r == -ENOENT && !created) {
4559 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4560 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4561 vector<string> vnewcmd{newcmd};
4562 bufferlist inbl;
4563 C_SaferCond w;
4564 string outs;
4565 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4566 int r = w.wait();
4567 if (r < 0) {
4568 derr << __func__ << " fail: osd does not exist and create failed: "
4569 << cpp_strerror(r) << dendl;
4570 return r;
4571 }
4572 created = true;
4573 continue;
4574 }
4575 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4576 return r;
4577 }
4578 break;
4579 }
4580
4581 return 0;
4582}
4583
4584int OSD::update_crush_location()
4585{
4586 if (!cct->_conf->osd_crush_update_on_start) {
4587 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4588 return 0;
4589 }
4590
4591 char weight[32];
4592 if (cct->_conf->osd_crush_initial_weight >= 0) {
4593 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4594 } else {
4595 struct store_statfs_t st;
11fdf7f2
TL
4596 osd_alert_list_t alerts;
4597 int r = store->statfs(&st, &alerts);
7c673cae
FG
4598 if (r < 0) {
4599 derr << "statfs: " << cpp_strerror(r) << dendl;
4600 return r;
4601 }
4602 snprintf(weight, sizeof(weight), "%.4lf",
11fdf7f2
TL
4603 std::max(.00001,
4604 double(st.total) /
4605 double(1ull << 40 /* TB */)));
7c673cae
FG
4606 }
4607
9f95a23c 4608 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
7c673cae
FG
4609
4610 string cmd =
4611 string("{\"prefix\": \"osd crush create-or-move\", ") +
9f95a23c
TL
4612 string("\"id\": ") + stringify(whoami) + ", " +
4613 string("\"weight\":") + weight + ", " +
4614 string("\"args\": [") + stringify(cct->crush_location) + "]}";
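// The assembled cmd is a mon command JSON roughly of the form
//   {"prefix": "osd crush create-or-move", "id": <whoami>,
//    "weight":<weight>, "args": [<stringified crush location>]}
// (shape shown for orientation only; the exact args rendering comes from
// the crush location's operator<<).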
7c673cae
FG
4615 return mon_cmd_maybe_osd_create(cmd);
4616}
4617
4618int OSD::update_crush_device_class()
4619{
224ce89b
WB
4620 if (!cct->_conf->osd_class_update_on_start) {
4621 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4622 return 0;
4623 }
4624
7c673cae
FG
4625 string device_class;
4626 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
4627 if (r < 0 || device_class.empty()) {
4628 device_class = store->get_default_device_class();
4629 }
4630
4631 if (device_class.empty()) {
d2e6a577 4632 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 4633 return 0;
224ce89b 4634 }
7c673cae
FG
4635
4636 string cmd =
4637 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
4638 string("\"class\": \"") + device_class + string("\", ") +
4639 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 4640
224ce89b 4641 r = mon_cmd_maybe_osd_create(cmd);
11fdf7f2
TL
4642 if (r == -EBUSY) {
4643 // good, already bound to a device-class
4644 return 0;
4645 } else {
4646 return r;
4647 }
7c673cae
FG
4648}
4649
4650void OSD::write_superblock(ObjectStore::Transaction& t)
4651{
4652 dout(10) << "write_superblock " << superblock << dendl;
4653
4654 //hack: at minimum it's using the baseline feature set
4655 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4656 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4657
4658 bufferlist bl;
11fdf7f2 4659 encode(superblock, bl);
7c673cae
FG
4660 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4661}
4662
4663int OSD::read_superblock()
4664{
4665 bufferlist bl;
11fdf7f2 4666 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
7c673cae
FG
4667 if (r < 0)
4668 return r;
4669
11fdf7f2
TL
4670 auto p = bl.cbegin();
4671 decode(superblock, p);
7c673cae
FG
4672
4673 dout(10) << "read_superblock " << superblock << dendl;
4674
4675 return 0;
4676}
4677
4678void OSD::clear_temp_objects()
4679{
4680 dout(10) << __func__ << dendl;
4681 vector<coll_t> ls;
4682 store->list_collections(ls);
4683 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4684 spg_t pgid;
4685 if (!p->is_pg(&pgid))
4686 continue;
4687
4688 // list temp objects
4689 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4690
4691 vector<ghobject_t> temps;
4692 ghobject_t next;
4693 while (1) {
4694 vector<ghobject_t> objects;
11fdf7f2
TL
4695 auto ch = store->open_collection(*p);
4696 ceph_assert(ch);
4697 store->collection_list(ch, next, ghobject_t::get_max(),
7c673cae
FG
4698 store->get_ideal_list_max(),
4699 &objects, &next);
4700 if (objects.empty())
4701 break;
4702 vector<ghobject_t>::iterator q;
4703 for (q = objects.begin(); q != objects.end(); ++q) {
4704 // Hammer set pool for temps to -1, so check for clean-up
4705 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4706 temps.push_back(*q);
4707 } else {
4708 break;
4709 }
4710 }
4711 // If we saw a non-temp object and hit the break above we can
4712 // break out of the while loop too.
4713 if (q != objects.end())
4714 break;
4715 }
4716 if (!temps.empty()) {
4717 ObjectStore::Transaction t;
4718 int removed = 0;
4719 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4720 dout(20) << " removing " << *p << " object " << *q << dendl;
4721 t.remove(*p, *q);
4722 if (++removed > cct->_conf->osd_target_transaction_size) {
11fdf7f2 4723 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4724 t = ObjectStore::Transaction();
4725 removed = 0;
4726 }
4727 }
4728 if (removed) {
11fdf7f2 4729 store->queue_transaction(service.meta_ch, std::move(t));
7c673cae
FG
4730 }
4731 }
4732 }
4733}
4734
4735void OSD::recursive_remove_collection(CephContext* cct,
4736 ObjectStore *store, spg_t pgid,
4737 coll_t tmp)
4738{
4739 OSDriver driver(
4740 store,
4741 coll_t(),
4742 make_snapmapper_oid());
4743
11fdf7f2 4744 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
7c673cae
FG
4745 ObjectStore::Transaction t;
4746 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4747
11fdf7f2
TL
4748 ghobject_t next;
4749 int max = cct->_conf->osd_target_transaction_size;
7c673cae 4750 vector<ghobject_t> objects;
11fdf7f2
TL
4751 objects.reserve(max);
4752 while (true) {
4753 objects.clear();
4754 store->collection_list(ch, next, ghobject_t::get_max(),
4755 max, &objects, &next);
4756 generic_dout(10) << __func__ << " " << objects << dendl;
4757 if (objects.empty())
4758 break;
4759 for (auto& p: objects) {
4760 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4761 int r = mapper.remove_oid(p.hobj, &_t);
4762 if (r != 0 && r != -ENOENT)
4763 ceph_abort();
4764 t.remove(tmp, p);
7c673cae 4765 }
11fdf7f2
TL
4766 int r = store->queue_transaction(ch, std::move(t));
4767 ceph_assert(r == 0);
4768 t = ObjectStore::Transaction();
7c673cae
FG
4769 }
4770 t.remove_collection(tmp);
11fdf7f2
TL
4771 int r = store->queue_transaction(ch, std::move(t));
4772 ceph_assert(r == 0);
7c673cae
FG
4773
4774 C_SaferCond waiter;
11fdf7f2 4775 if (!ch->flush_commit(&waiter)) {
7c673cae
FG
4776 waiter.wait();
4777 }
4778}
4779
4780
4781// ======================================================
4782// PG's
4783
7c673cae
FG
4784PG* OSD::_make_pg(
4785 OSDMapRef createmap,
4786 spg_t pgid)
4787{
11fdf7f2
TL
4788 dout(10) << __func__ << " " << pgid << dendl;
4789 pg_pool_t pi;
4790 map<string,string> ec_profile;
4791 string name;
4792 if (createmap->have_pg_pool(pgid.pool())) {
4793 pi = *createmap->get_pg_pool(pgid.pool());
4794 name = createmap->get_pool_name(pgid.pool());
4795 if (pi.is_erasure()) {
4796 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4797 }
4798 } else {
4799 // pool was deleted; grab final pg_pool_t off disk.
4800 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4801 bufferlist bl;
4802 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4803 if (r < 0) {
4804 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4805 << dendl;
4806 return nullptr;
4807 }
4808 ceph_assert(r >= 0);
4809 auto p = bl.cbegin();
4810 decode(pi, p);
4811 decode(name, p);
4812 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4813 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4814 << " tombstone" << dendl;
4815 return nullptr;
4816 }
4817 decode(ec_profile, p);
4818 }
f67539c2 4819 PGPool pool(createmap, pgid.pool(), pi, name);
7c673cae 4820 PG *pg;
11fdf7f2
TL
4821 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4822 pi.type == pg_pool_t::TYPE_ERASURE)
4823 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
7c673cae
FG
4824 else
4825 ceph_abort();
7c673cae
FG
4826 return pg;
4827}
4828
11fdf7f2 4829void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
7c673cae 4830{
11fdf7f2
TL
4831 v->clear();
4832 v->reserve(get_num_pgs());
4833 for (auto& s : shards) {
4834 std::lock_guard l(s->shard_lock);
4835 for (auto& j : s->pg_slots) {
4836 if (j.second->pg &&
4837 !j.second->pg->is_deleted()) {
4838 v->push_back(j.second->pg);
4839 if (clear_too) {
4840 s->_detach_pg(j.second.get());
4841 }
4842 }
7c673cae 4843 }
7c673cae 4844 }
7c673cae
FG
4845}
4846
11fdf7f2 4847void OSD::_get_pgids(vector<spg_t> *v)
7c673cae 4848{
11fdf7f2
TL
4849 v->clear();
4850 v->reserve(get_num_pgs());
4851 for (auto& s : shards) {
4852 std::lock_guard l(s->shard_lock);
4853 for (auto& j : s->pg_slots) {
4854 if (j.second->pg &&
4855 !j.second->pg->is_deleted()) {
4856 v->push_back(j.first);
4857 }
7c673cae
FG
4858 }
4859 }
7c673cae
FG
4860}
4861
11fdf7f2 4862void OSD::register_pg(PGRef pg)
7c673cae 4863{
11fdf7f2
TL
4864 spg_t pgid = pg->get_pgid();
4865 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4866 auto sdata = shards[shard_index];
4867 std::lock_guard l(sdata->shard_lock);
4868 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4869 ceph_assert(r.second);
4870 auto *slot = r.first->second.get();
4871 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4872 sdata->_attach_pg(slot, pg.get());
4873}
7c673cae 4874
11fdf7f2
TL
4875bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4876{
4877 auto sdata = pg->osd_shard;
4878 ceph_assert(sdata);
4879 {
4880 std::lock_guard l(sdata->shard_lock);
4881 auto p = sdata->pg_slots.find(pg->pg_id);
4882 if (p == sdata->pg_slots.end() ||
4883 !p->second->pg) {
4884 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4885 return false;
4886 }
4887 if (p->second->waiting_for_merge_epoch) {
4888 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4889 return false;
4890 }
4891 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4892 sdata->_detach_pg(p->second.get());
4893 }
7c673cae 4894
11fdf7f2
TL
4895 for (auto shard : shards) {
4896 shard->unprime_split_children(pg->pg_id, old_pg_num);
4897 }
7c673cae 4898
11fdf7f2
TL
4899 // update pg count now since we might not get an osdmap any time soon.
4900 if (pg->is_primary())
4901 service.logger->dec(l_osd_pg_primary);
9f95a23c
TL
4902 else if (pg->is_nonprimary())
4903 service.logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
4904 else
4905 service.logger->dec(l_osd_pg_stray);
7c673cae 4906
11fdf7f2 4907 return true;
7c673cae
FG
4908}
4909
11fdf7f2 4910PGRef OSD::_lookup_pg(spg_t pgid)
7c673cae 4911{
11fdf7f2
TL
4912 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4913 auto sdata = shards[shard_index];
4914 std::lock_guard l(sdata->shard_lock);
4915 auto p = sdata->pg_slots.find(pgid);
4916 if (p == sdata->pg_slots.end()) {
7c673cae 4917 return nullptr;
11fdf7f2
TL
4918 }
4919 return p->second->pg;
7c673cae
FG
4920}
4921
11fdf7f2 4922PGRef OSD::_lookup_lock_pg(spg_t pgid)
31f18b77 4923{
11fdf7f2
TL
4924 PGRef pg = _lookup_pg(pgid);
4925 if (!pg) {
4926 return nullptr;
4927 }
4928 pg->lock();
4929 if (!pg->is_deleted()) {
4930 return pg;
4931 }
4932 pg->unlock();
4933 return nullptr;
31f18b77
FG
4934}
4935
11fdf7f2 4936PGRef OSD::lookup_lock_pg(spg_t pgid)
7c673cae 4937{
11fdf7f2 4938 return _lookup_lock_pg(pgid);
7c673cae
FG
4939}
4940
4941void OSD::load_pgs()
4942{
9f95a23c 4943 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 4944 dout(0) << "load_pgs" << dendl;
11fdf7f2 4945
7c673cae 4946 {
11fdf7f2
TL
4947 auto pghist = make_pg_num_history_oid();
4948 bufferlist bl;
4949 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4950 if (r >= 0 && bl.length() > 0) {
4951 auto p = bl.cbegin();
4952 decode(pg_num_history, p);
4953 }
4954 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
7c673cae
FG
4955 }
4956
4957 vector<coll_t> ls;
4958 int r = store->list_collections(ls);
4959 if (r < 0) {
4960 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4961 }
4962
11fdf7f2 4963 int num = 0;
7c673cae
FG
4964 for (vector<coll_t>::iterator it = ls.begin();
4965 it != ls.end();
4966 ++it) {
4967 spg_t pgid;
4968 if (it->is_temp(&pgid) ||
20effc67 4969 (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
11fdf7f2
TL
4970 dout(10) << "load_pgs " << *it
4971 << " removing, legacy or flagged for removal pg" << dendl;
20effc67 4972 recursive_remove_collection(cct, store.get(), pgid, *it);
7c673cae
FG
4973 continue;
4974 }
4975
4976 if (!it->is_pg(&pgid)) {
4977 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4978 continue;
4979 }
4980
7c673cae 4981 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
7c673cae 4982 epoch_t map_epoch = 0;
20effc67 4983 int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
7c673cae
FG
4984 if (r < 0) {
4985 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4986 << dendl;
4987 continue;
4988 }
4989
11fdf7f2 4990 PGRef pg;
7c673cae
FG
4991 if (map_epoch > 0) {
4992 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4993 if (!pgosdmap) {
9f95a23c 4994 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
7c673cae
FG
4995 derr << __func__ << ": could not find map for epoch " << map_epoch
4996 << " on pg " << pgid << ", but the pool is not present in the "
4997 << "current map, so this is probably a result of bug 10617. "
4998 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4999 << "to clean it up later." << dendl;
5000 continue;
5001 } else {
5002 derr << __func__ << ": have pgid " << pgid << " at epoch "
5003 << map_epoch << ", but missing map. Crashing."
5004 << dendl;
11fdf7f2 5005 ceph_abort_msg("Missing map in load_pgs");
7c673cae
FG
5006 }
5007 }
11fdf7f2 5008 pg = _make_pg(pgosdmap, pgid);
7c673cae 5009 } else {
9f95a23c 5010 pg = _make_pg(get_osdmap(), pgid);
7c673cae 5011 }
11fdf7f2 5012 if (!pg) {
20effc67 5013 recursive_remove_collection(cct, store.get(), pgid, *it);
11fdf7f2
TL
5014 continue;
5015 }
5016
5017 // there can be no waiters here, so we don't call _wake_pg_slot
7c673cae 5018
11fdf7f2 5019 pg->lock();
7c673cae
FG
5020 pg->ch = store->open_collection(pg->coll);
5021
5022 // read pg state, log
20effc67 5023 pg->read_state(store.get());
7c673cae 5024
94b18763
FG
5025 if (pg->dne()) {
5026 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
5027 pg->ch = nullptr;
94b18763 5028 pg->unlock();
20effc67 5029 recursive_remove_collection(cct, store.get(), pgid, *it);
94b18763
FG
5030 continue;
5031 }
11fdf7f2
TL
5032 {
5033 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5034 assert(NULL != shards[shard_index]);
5035 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5036 }
7c673cae 5037
11fdf7f2 5038 dout(10) << __func__ << " loaded " << *pg << dendl;
7c673cae 5039 pg->unlock();
7c673cae 5040
11fdf7f2
TL
5041 register_pg(pg);
5042 ++num;
7c673cae 5043 }
11fdf7f2 5044 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
7c673cae
FG
5045}
5046
5047
11fdf7f2
TL
5048PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5049 const PGCreateInfo *info)
5050{
5051 spg_t pgid = info->pgid;
7c673cae 5052
11fdf7f2
TL
5053 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5054 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5055 return nullptr;
5056 }
3efd9988 5057
11fdf7f2 5058 OSDMapRef startmap = get_map(info->epoch);
7c673cae 5059
11fdf7f2
TL
5060 if (info->by_mon) {
5061 int64_t pool_id = pgid.pgid.pool();
5062 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5063 if (!pool) {
5064 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5065 return nullptr;
5066 }
9f95a23c 5067 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
11fdf7f2
TL
5068 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5069 // this ensures we do not process old creating messages after the
5070 // pool's initial pgs have been created (and pgs are subsequently
5071 // allowed to split or merge).
5072 dout(20) << __func__ << " dropping " << pgid
5073 << "create, pool does not have CREATING flag set" << dendl;
5074 return nullptr;
7c673cae
FG
5075 }
5076 }
7c673cae 5077
11fdf7f2
TL
5078 int up_primary, acting_primary;
5079 vector<int> up, acting;
5080 startmap->pg_to_up_acting_osds(
5081 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
7c673cae 5082
11fdf7f2
TL
5083 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5084 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5085 store->get_type() != "bluestore") {
5086 clog->warn() << "pg " << pgid
5087 << " is at risk of silent data corruption: "
5088 << "the pool allows ec overwrites but is not stored in "
5089 << "bluestore, so deep scrubbing will not detect bitrot";
7c673cae 5090 }
20effc67 5091 PeeringCtx rctx;
9f95a23c
TL
5092 create_pg_collection(
5093 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5094 init_pg_ondisk(rctx.transaction, pgid, pp);
7c673cae 5095
9f95a23c 5096 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae 5097
11fdf7f2
TL
5098 PGRef pg = _make_pg(startmap, pgid);
5099 pg->ch = store->create_new_collection(pg->coll);
7c673cae 5100
11fdf7f2
TL
5101 {
5102 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5103 assert(NULL != shards[shard_index]);
5104 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
7c673cae 5105 }
7c673cae 5106
11fdf7f2 5107 pg->lock(true);
7c673cae 5108
11fdf7f2
TL
5109 // we are holding the shard lock
5110 ceph_assert(!pg->is_deleted());
5111
5112 pg->init(
5113 role,
5114 up,
5115 up_primary,
5116 acting,
5117 acting_primary,
5118 info->history,
5119 info->past_intervals,
11fdf7f2 5120 rctx.transaction);
7c673cae 5121
92f5a8d4
TL
5122 pg->init_collection_pool_opts();
5123
11fdf7f2 5124 if (pg->is_primary()) {
9f95a23c 5125 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
5126 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5127 }
7c673cae 5128
9f95a23c
TL
5129 pg->handle_initialize(rctx);
5130 pg->handle_activate_map(rctx);
7c673cae 5131
11fdf7f2 5132 dispatch_context(rctx, pg.get(), osdmap, nullptr);
7c673cae 5133
11fdf7f2
TL
5134 dout(10) << __func__ << " new pg " << *pg << dendl;
5135 return pg;
7c673cae
FG
5136}
5137
11fdf7f2
TL
5138bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
5139 spg_t pgid,
5140 bool is_mon_create)
3efd9988
FG
5141{
5142 const auto max_pgs_per_osd =
11fdf7f2
TL
5143 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5144 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
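  // Worked example (an assumption, not from the code: takes mon_max_pg_per_osd=250
  // and osd_max_pg_per_osd_hard_ratio=3.0 as typical defaults): the hard cap would
  // be 250 * 3.0 = 750 PGs on this OSD before new creations are withheld below.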
3efd9988 5145
11fdf7f2 5146 if (num_pgs < max_pgs_per_osd) {
3efd9988
FG
5147 return false;
5148 }
11fdf7f2
TL
5149
5150 std::lock_guard l(pending_creates_lock);
3efd9988
FG
5151 if (is_mon_create) {
5152 pending_creates_from_mon++;
5153 } else {
9f95a23c
TL
5154 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
5155 pending_creates_from_osd.emplace(pgid, is_primary);
3efd9988 5156 }
1adf2230 5157 dout(1) << __func__ << " withhold creation of pg " << pgid
11fdf7f2 5158 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
3efd9988
FG
5159 return true;
5160}
5161
5162// to re-trigger peering, we have to twiddle the pg mapping a little bit; see
5163// PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls back to
5164// the up set if pg_temp is empty, so an empty pg_temp won't work.
5165static vector<int32_t> twiddle(const vector<int>& acting) {
5166 if (acting.size() > 1) {
5167 return {acting[0]};
5168 } else {
5169 vector<int32_t> twiddled(acting.begin(), acting.end());
5170 twiddled.push_back(-1);
5171 return twiddled;
5172 }
5173}
5174
5175void OSD::resume_creating_pg()
5176{
5177 bool do_sub_pg_creates = false;
b32b8144 5178 bool have_pending_creates = false;
3efd9988
FG
5179 {
5180 const auto max_pgs_per_osd =
11fdf7f2
TL
5181 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5182 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5183 if (max_pgs_per_osd <= num_pgs) {
3efd9988
FG
5184 // this could happen if the admin decreases this setting before a PG is removed
5185 return;
5186 }
11fdf7f2
TL
5187 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
5188 std::lock_guard l(pending_creates_lock);
3efd9988 5189 if (pending_creates_from_mon > 0) {
11fdf7f2
TL
5190 dout(20) << __func__ << " pending_creates_from_mon "
5191 << pending_creates_from_mon << dendl;
3efd9988
FG
5192 do_sub_pg_creates = true;
5193 if (pending_creates_from_mon >= spare_pgs) {
5194 spare_pgs = pending_creates_from_mon = 0;
5195 } else {
5196 spare_pgs -= pending_creates_from_mon;
5197 pending_creates_from_mon = 0;
5198 }
5199 }
5200 auto pg = pending_creates_from_osd.cbegin();
5201 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
94b18763 5202 dout(20) << __func__ << " pg " << pg->first << dendl;
3efd9988 5203 vector<int> acting;
9f95a23c
TL
5204 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5205 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
3efd9988 5206 pg = pending_creates_from_osd.erase(pg);
94b18763 5207 do_sub_pg_creates = true;
3efd9988
FG
5208 spare_pgs--;
5209 }
b32b8144
FG
5210 have_pending_creates = (pending_creates_from_mon > 0 ||
5211 !pending_creates_from_osd.empty());
3efd9988 5212 }
b32b8144
FG
5213
5214 bool do_renew_subs = false;
3efd9988
FG
5215 if (do_sub_pg_creates) {
5216 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5217 dout(4) << __func__ << ": resolicit pg creates from mon since "
5218 << last_pg_create_epoch << dendl;
b32b8144 5219 do_renew_subs = true;
3efd9988
FG
5220 }
5221 }
9f95a23c 5222 version_t start = get_osdmap_epoch() + 1;
b32b8144
FG
5223 if (have_pending_creates) {
5224 // don't miss any new osdmap that deletes PGs
5225 if (monc->sub_want("osdmap", start, 0)) {
5226 dout(4) << __func__ << ": resolicit osdmap from mon since "
5227 << start << dendl;
5228 do_renew_subs = true;
5229 }
94b18763 5230 } else if (do_sub_pg_creates) {
b32b8144
FG
5231 // no need to subscribe the osdmap continuously anymore
5232 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5233 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
11fdf7f2 5234 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
b32b8144
FG
5235 << start << dendl;
5236 do_renew_subs = true;
5237 }
5238 }
5239
5240 if (do_renew_subs) {
5241 monc->renew_subs();
5242 }
5243
94b18763 5244 service.send_pg_temp();
3efd9988 5245}
7c673cae
FG
5246
5247void OSD::build_initial_pg_history(
5248 spg_t pgid,
5249 epoch_t created,
5250 utime_t created_stamp,
5251 pg_history_t *h,
5252 PastIntervals *pi)
5253{
5254 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
9f95a23c 5255 *h = pg_history_t(created, created_stamp);
7c673cae
FG
5256
5257 OSDMapRef lastmap = service.get_map(created);
5258 int up_primary, acting_primary;
5259 vector<int> up, acting;
5260 lastmap->pg_to_up_acting_osds(
5261 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5262
5263 ostringstream debug;
9f95a23c 5264 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
7c673cae
FG
5265 OSDMapRef osdmap = service.get_map(e);
5266 int new_up_primary, new_acting_primary;
5267 vector<int> new_up, new_acting;
5268 osdmap->pg_to_up_acting_osds(
5269 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5270
5271 // this is a bit imprecise, but sufficient?
5272 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5273 const pg_pool_t *pi;
5274 bool operator()(const set<pg_shard_t> &have) const {
5275 return have.size() >= pi->min_size;
5276 }
11fdf7f2 5277 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
7c673cae
FG
5278 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
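    // In other words: an interval is treated as possibly-recoverable when at
    // least pool min_size shards were available in it. As the note above says,
    // this is imprecise but sufficient; it only counts shards, it does not
    // consult the PG log.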
5279
5280 bool new_interval = PastIntervals::check_new_interval(
5281 acting_primary,
5282 new_acting_primary,
5283 acting, new_acting,
5284 up_primary,
5285 new_up_primary,
5286 up, new_up,
5287 h->same_interval_since,
5288 h->last_epoch_clean,
9f95a23c
TL
5289 osdmap.get(),
5290 lastmap.get(),
7c673cae 5291 pgid.pgid,
9f95a23c 5292 min_size_predicate,
7c673cae
FG
5293 pi,
5294 &debug);
5295 if (new_interval) {
5296 h->same_interval_since = e;
181888fb
FG
5297 if (up != new_up) {
5298 h->same_up_since = e;
5299 }
5300 if (acting_primary != new_acting_primary) {
5301 h->same_primary_since = e;
5302 }
5303 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5304 osdmap->get_pg_num(pgid.pgid.pool()),
5305 nullptr)) {
5306 h->last_epoch_split = e;
5307 }
5308 up = new_up;
5309 acting = new_acting;
5310 up_primary = new_up_primary;
5311 acting_primary = new_acting_primary;
c07f9fc5 5312 }
7c673cae
FG
5313 lastmap = osdmap;
5314 }
5315 dout(20) << __func__ << " " << debug.str() << dendl;
5316 dout(10) << __func__ << " " << *h << " " << *pi
5317 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5318 pi->get_bounds()) << ")"
5319 << dendl;
5320}
5321
7c673cae
FG
5322void OSD::_add_heartbeat_peer(int p)
5323{
5324 if (p == whoami)
5325 return;
5326 HeartbeatInfo *hi;
5327
5328 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5329 if (i == heartbeat_peers.end()) {
9f95a23c 5330 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
7c673cae
FG
5331 if (!cons.first)
5332 return;
9f95a23c
TL
5333 assert(cons.second);
5334
7c673cae
FG
5335 hi = &heartbeat_peers[p];
5336 hi->peer = p;
9f95a23c
TL
5337
5338 auto stamps = service.get_hb_stamps(p);
5339
5340 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5341 sb->peer = p;
5342 sb->stamps = stamps;
eafe8130 5343 hi->hb_interval_start = ceph_clock_now();
7c673cae 5344 hi->con_back = cons.first.get();
9f95a23c
TL
5345 hi->con_back->set_priv(sb);
5346
5347 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5348 sf->peer = p;
5349 sf->stamps = stamps;
5350 hi->con_front = cons.second.get();
5351 hi->con_front->set_priv(sf);
5352
5353 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5354 << " " << hi->con_back->get_peer_addr()
5355 << " " << hi->con_front->get_peer_addr()
5356 << dendl;
7c673cae
FG
5357 } else {
5358 hi = &i->second;
5359 }
9f95a23c 5360 hi->epoch = get_osdmap_epoch();
7c673cae
FG
5361}
5362
5363void OSD::_remove_heartbeat_peer(int n)
5364{
5365 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
11fdf7f2 5366 ceph_assert(q != heartbeat_peers.end());
7c673cae
FG
5367 dout(20) << " removing heartbeat peer osd." << n
5368 << " " << q->second.con_back->get_peer_addr()
5369 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5370 << dendl;
9f95a23c 5371 q->second.clear_mark_down();
7c673cae
FG
5372 heartbeat_peers.erase(q);
5373}
5374
5375void OSD::need_heartbeat_peer_update()
5376{
5377 if (is_stopping())
5378 return;
5379 dout(20) << "need_heartbeat_peer_update" << dendl;
5380 heartbeat_set_peers_need_update();
5381}
5382
5383void OSD::maybe_update_heartbeat_peers()
5384{
9f95a23c 5385 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5386
11fdf7f2 5387 if (is_waiting_for_healthy() || is_active()) {
7c673cae
FG
5388 utime_t now = ceph_clock_now();
5389 if (last_heartbeat_resample == utime_t()) {
5390 last_heartbeat_resample = now;
5391 heartbeat_set_peers_need_update();
5392 } else if (!heartbeat_peers_need_update()) {
5393 utime_t dur = now - last_heartbeat_resample;
5394 if (dur > cct->_conf->osd_heartbeat_grace) {
5395 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5396 heartbeat_set_peers_need_update();
5397 last_heartbeat_resample = now;
494da23a
TL
5398 // automatically clean up any stale heartbeat peers
5399 // if we are unhealthy, then clean all
5400 reset_heartbeat_peers(is_waiting_for_healthy());
7c673cae
FG
5401 }
5402 }
5403 }
5404
5405 if (!heartbeat_peers_need_update())
5406 return;
5407 heartbeat_clear_peers_need_update();
5408
11fdf7f2 5409 std::lock_guard l(heartbeat_lock);
7c673cae
FG
5410
5411 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5412
5413
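  // Rough shape of the update below (a reading of the code, not a spec):
  //  1. add heartbeat peers of all our PGs that are still up
  //  2. add our next/previous up OSDs plus a random per-subtree sample ("want")
  //  3. drop peers that are down; note stale-epoch peers as "extras"
  //  4. top up with random up OSDs until osd_heartbeat_min_peers is reached
  //  5. trim surplus "extras" back down toward osd_heartbeat_min_peers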
5414 // build heartbeat from set
5415 if (is_active()) {
11fdf7f2
TL
5416 vector<PGRef> pgs;
5417 _get_pgs(&pgs);
5418 for (auto& pg : pgs) {
5419 pg->with_heartbeat_peers([&](int peer) {
9f95a23c 5420 if (get_osdmap()->is_up(peer)) {
11fdf7f2
TL
5421 _add_heartbeat_peer(peer);
5422 }
5423 });
7c673cae
FG
5424 }
5425 }
5426
5427 // include next and previous up osds to ensure we have a fully-connected set
5428 set<int> want, extras;
9f95a23c 5429 const int next = get_osdmap()->get_next_up_osd_after(whoami);
7c673cae
FG
5430 if (next >= 0)
5431 want.insert(next);
9f95a23c 5432 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
7c673cae
FG
5433 if (prev >= 0 && prev != next)
5434 want.insert(prev);
5435
11fdf7f2
TL
5436 // make sure we have at least **min_down** osds coming from different
5437 // subtree level (e.g., hosts) for fast failure detection.
5438 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5439 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
9f95a23c
TL
5440 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5441 get_osdmap()->get_random_up_osds_by_subtree(
5442 whoami, subtree, limit, want, &want);
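  // e.g. with hypothetical defaults mon_osd_min_down_reporters=2 and
  // osd_heartbeat_min_peers=10, limit=10 peers are drawn from distinct
  // subtrees (hosts), so one failed host cannot silence all reporters.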
11fdf7f2 5443
7c673cae
FG
5444 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5445 dout(10) << " adding neighbor peer osd." << *p << dendl;
5446 extras.insert(*p);
5447 _add_heartbeat_peer(*p);
5448 }
5449
5450 // remove down peers; enumerate extras
5451 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5452 while (p != heartbeat_peers.end()) {
9f95a23c 5453 if (!get_osdmap()->is_up(p->first)) {
7c673cae
FG
5454 int o = p->first;
5455 ++p;
5456 _remove_heartbeat_peer(o);
5457 continue;
5458 }
9f95a23c 5459 if (p->second.epoch < get_osdmap_epoch()) {
7c673cae
FG
5460 extras.insert(p->first);
5461 }
5462 ++p;
5463 }
5464
5465 // too few?
11fdf7f2 5466 for (int n = next; n >= 0; ) {
7c673cae
FG
5467 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5468 break;
5469 if (!extras.count(n) && !want.count(n) && n != whoami) {
5470 dout(10) << " adding random peer osd." << n << dendl;
5471 extras.insert(n);
5472 _add_heartbeat_peer(n);
5473 }
9f95a23c 5474 n = get_osdmap()->get_next_up_osd_after(n);
11fdf7f2 5475 if (n == next)
7c673cae
FG
5476 break; // came full circle; stop
5477 }
5478
5479 // too many?
5480 for (set<int>::iterator p = extras.begin();
5481 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5482 ++p) {
5483 if (want.count(*p))
5484 continue;
5485 _remove_heartbeat_peer(*p);
5486 }
5487
5488 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
9f95a23c
TL
5489
5490 // clean up stale failure pending
5491 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5492 if (heartbeat_peers.count(it->first) == 0) {
5493 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5494 failure_pending.erase(it++);
5495 } else {
5496 it++;
5497 }
5498 }
7c673cae
FG
5499}
5500
494da23a 5501void OSD::reset_heartbeat_peers(bool all)
7c673cae 5502{
9f95a23c 5503 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae 5504 dout(10) << "reset_heartbeat_peers" << dendl;
494da23a
TL
5505 utime_t stale = ceph_clock_now();
5506 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
11fdf7f2 5507 std::lock_guard l(heartbeat_lock);
494da23a 5508 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
b3b6e05e 5509 auto& [peer, hi] = *it;
494da23a 5510 if (all || hi.is_stale(stale)) {
9f95a23c 5511 hi.clear_mark_down();
494da23a 5512 // stop sending failure_report to mon too
b3b6e05e
TL
5513 failure_queue.erase(peer);
5514 failure_pending.erase(peer);
5515 it = heartbeat_peers.erase(it);
494da23a 5516 } else {
b3b6e05e 5517 ++it;
7c673cae 5518 }
7c673cae 5519 }
7c673cae
FG
5520}
5521
5522void OSD::handle_osd_ping(MOSDPing *m)
5523{
5524 if (superblock.cluster_fsid != m->fsid) {
5525 dout(20) << "handle_osd_ping from " << m->get_source_inst()
9f95a23c
TL
5526 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5527 << dendl;
7c673cae
FG
5528 m->put();
5529 return;
5530 }
5531
5532 int from = m->get_source().num();
5533
9f95a23c 5534 heartbeat_lock.lock();
7c673cae 5535 if (is_stopping()) {
9f95a23c 5536 heartbeat_lock.unlock();
7c673cae
FG
5537 m->put();
5538 return;
5539 }
5540
9f95a23c
TL
5541 utime_t now = ceph_clock_now();
5542 auto mnow = service.get_mnow();
5543 ConnectionRef con(m->get_connection());
7c673cae 5544 OSDMapRef curmap = service.get_osdmap();
c07f9fc5 5545 if (!curmap) {
9f95a23c 5546 heartbeat_lock.unlock();
c07f9fc5
FG
5547 m->put();
5548 return;
5549 }
7c673cae 5550
9f95a23c
TL
5551 auto sref = con->get_priv();
5552 Session *s = static_cast<Session*>(sref.get());
5553 if (!s) {
5554 heartbeat_lock.unlock();
5555 m->put();
5556 return;
5557 }
5558 if (!s->stamps) {
5559 s->peer = from;
5560 s->stamps = service.get_hb_stamps(from);
5561 }
5562
7c673cae
FG
5563 switch (m->op) {
5564
5565 case MOSDPing::PING:
5566 {
5567 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5568 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5569 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5570 if (heartbeat_drop->second == 0) {
5571 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5572 } else {
5573 --heartbeat_drop->second;
5574 dout(5) << "Dropping heartbeat from " << from
5575 << ", " << heartbeat_drop->second
5576 << " remaining to drop" << dendl;
5577 break;
5578 }
5579 } else if (cct->_conf->osd_debug_drop_ping_probability >
5580 ((((double)(rand()%100))/100.0))) {
5581 heartbeat_drop =
5582 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5583 cct->_conf->osd_debug_drop_ping_duration)).first;
5584 dout(5) << "Dropping heartbeat from " << from
5585 << ", " << heartbeat_drop->second
5586 << " remaining to drop" << dendl;
5587 break;
5588 }
5589 }
5590
9f95a23c
TL
5591 ceph::signedspan sender_delta_ub{};
5592 s->stamps->got_ping(
5593 m->up_from,
5594 mnow,
5595 m->mono_send_stamp,
5596 m->delta_ub,
5597 &sender_delta_ub);
5598 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5599
7c673cae 5600 if (!cct->get_heartbeat_map()->is_healthy()) {
9f95a23c
TL
5601 dout(10) << "internal heartbeat not healthy, dropping ping request"
5602 << dendl;
7c673cae
FG
5603 break;
5604 }
5605
5606 Message *r = new MOSDPing(monc->get_fsid(),
5607 curmap->get_epoch(),
9f95a23c
TL
5608 MOSDPing::PING_REPLY,
5609 m->ping_stamp,
5610 m->mono_ping_stamp,
5611 mnow,
5612 service.get_up_epoch(),
5613 cct->_conf->osd_heartbeat_min_size,
5614 sender_delta_ub);
5615 con->send_message(r);
7c673cae
FG
5616
5617 if (curmap->is_up(from)) {
7c673cae 5618 if (is_active()) {
9f95a23c
TL
5619 ConnectionRef cluster_con = service.get_con_osd_cluster(
5620 from, curmap->get_epoch());
5621 if (cluster_con) {
5622 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5623 }
5624 }
5625 } else if (!curmap->exists(from) ||
5626 curmap->get_down_at(from) > m->map_epoch) {
5627 // tell them they have died
5628 Message *r = new MOSDPing(monc->get_fsid(),
5629 curmap->get_epoch(),
5630 MOSDPing::YOU_DIED,
9f95a23c
TL
5631 m->ping_stamp,
5632 m->mono_ping_stamp,
5633 mnow,
5634 service.get_up_epoch(),
31f18b77 5635 cct->_conf->osd_heartbeat_min_size);
9f95a23c 5636 con->send_message(r);
7c673cae
FG
5637 }
5638 }
5639 break;
5640
5641 case MOSDPing::PING_REPLY:
5642 {
5643 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5644 if (i != heartbeat_peers.end()) {
9f95a23c 5645 auto acked = i->second.ping_history.find(m->ping_stamp);
11fdf7f2 5646 if (acked != i->second.ping_history.end()) {
11fdf7f2 5647 int &unacknowledged = acked->second.second;
9f95a23c 5648 if (con == i->second.con_back) {
11fdf7f2
TL
5649 dout(25) << "handle_osd_ping got reply from osd." << from
5650 << " first_tx " << i->second.first_tx
5651 << " last_tx " << i->second.last_tx
9f95a23c
TL
5652 << " last_rx_back " << i->second.last_rx_back
5653 << " -> " << now
11fdf7f2
TL
5654 << " last_rx_front " << i->second.last_rx_front
5655 << dendl;
5656 i->second.last_rx_back = now;
5657 ceph_assert(unacknowledged > 0);
5658 --unacknowledged;
5659 // if there is no front con, set both stamps.
5660 if (i->second.con_front == NULL) {
5661 i->second.last_rx_front = now;
5662 ceph_assert(unacknowledged > 0);
5663 --unacknowledged;
5664 }
9f95a23c 5665 } else if (con == i->second.con_front) {
11fdf7f2
TL
5666 dout(25) << "handle_osd_ping got reply from osd." << from
5667 << " first_tx " << i->second.first_tx
5668 << " last_tx " << i->second.last_tx
5669 << " last_rx_back " << i->second.last_rx_back
9f95a23c
TL
5670 << " last_rx_front " << i->second.last_rx_front
5671 << " -> " << now
11fdf7f2
TL
5672 << dendl;
5673 i->second.last_rx_front = now;
5674 ceph_assert(unacknowledged > 0);
5675 --unacknowledged;
5676 }
7c673cae 5677
11fdf7f2
TL
5678 if (unacknowledged == 0) {
5679 // succeeded in getting all replies
5680 dout(25) << "handle_osd_ping got all replies from osd." << from
9f95a23c 5681 << " , erase pending ping(sent at " << m->ping_stamp << ")"
11fdf7f2
TL
5682 << " and older pending ping(s)"
5683 << dendl;
eafe8130
TL
5684
5685#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
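// e.g. ROUND_S_TO_USEC(0.0123) == 12300: seconds (as a double) converted to
// microseconds, rounded to nearest rather than truncated.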
5686 ++i->second.hb_average_count;
9f95a23c 5687 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
eafe8130
TL
5688 i->second.hb_total_back += back_pingtime;
5689 if (back_pingtime < i->second.hb_min_back)
5690 i->second.hb_min_back = back_pingtime;
5691 if (back_pingtime > i->second.hb_max_back)
5692 i->second.hb_max_back = back_pingtime;
9f95a23c 5693 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
eafe8130
TL
5694 i->second.hb_total_front += front_pingtime;
5695 if (front_pingtime < i->second.hb_min_front)
5696 i->second.hb_min_front = front_pingtime;
5697 if (front_pingtime > i->second.hb_max_front)
5698 i->second.hb_max_front = front_pingtime;
5699
5700 ceph_assert(i->second.hb_interval_start != utime_t());
5701 if (i->second.hb_interval_start == utime_t())
5702 i->second.hb_interval_start = now;
5703 int64_t hb_avg_time_period = 60;
5704 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5705 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5706 }
5707 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5708 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5709 uint32_t back_min = i->second.hb_min_back;
5710 uint32_t back_max = i->second.hb_max_back;
5711 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5712 uint32_t front_min = i->second.hb_min_front;
5713 uint32_t front_max = i->second.hb_max_front;
5714
5715 // Reset for new interval
5716 i->second.hb_average_count = 0;
5717 i->second.hb_interval_start = now;
5718 i->second.hb_total_back = i->second.hb_max_back = 0;
5719 i->second.hb_min_back = UINT_MAX;
5720 i->second.hb_total_front = i->second.hb_max_front = 0;
5721 i->second.hb_min_front = UINT_MAX;
5722
5723 // Record per-osd interface ping times
5724 // Based on osd_heartbeat_interval, ignoring that the actual interval is randomly shorter than this
5725 if (i->second.hb_back_pingtime.size() == 0) {
5726 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5727 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5728 i->second.hb_back_pingtime.push_back(back_avg);
5729 i->second.hb_back_min.push_back(back_min);
5730 i->second.hb_back_max.push_back(back_max);
5731 i->second.hb_front_pingtime.push_back(front_avg);
5732 i->second.hb_front_min.push_back(front_min);
5733 i->second.hb_front_max.push_back(front_max);
5734 ++i->second.hb_index;
5735 }
5736 } else {
5737 int index = i->second.hb_index & (hb_vector_size - 1);
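	    // ring-buffer wrap via bitmask; this only equals "% hb_vector_size"
	    // when hb_vector_size is a power of two (which it presumably is)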
5738 i->second.hb_back_pingtime[index] = back_avg;
5739 i->second.hb_back_min[index] = back_min;
5740 i->second.hb_back_max[index] = back_max;
5741 i->second.hb_front_pingtime[index] = front_avg;
5742 i->second.hb_front_min[index] = front_min;
5743 i->second.hb_front_max[index] = front_max;
5744 ++i->second.hb_index;
5745 }
5746
5747 {
5748 std::lock_guard l(service.stat_lock);
5749 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5750 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5751
5752 uint32_t total = 0;
5753 uint32_t min = UINT_MAX;
5754 uint32_t max = 0;
5755 uint32_t count = 0;
5756 uint32_t which = 0;
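	    // The loop below walks the ring buffer newest-to-oldest and snapshots
	    // averages after 1, 5 and 15 samples; with the default 60s interval
	    // that approximates 1/5/15-minute ping times, loadavg-style (an
	    // interpretation, not something the code states).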
5757 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5758 for (int32_t k = size - 1 ; k >= 0; --k) {
5759 ++count;
5760 int index = (i->second.hb_index + k) % size;
5761 total += i->second.hb_back_pingtime[index];
5762 if (i->second.hb_back_min[index] < min)
5763 min = i->second.hb_back_min[index];
5764 if (i->second.hb_back_max[index] > max)
5765 max = i->second.hb_back_max[index];
5766 if (count == 1 || count == 5 || count == 15) {
5767 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5768 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5769 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5770 which++;
5771 if (count == 15)
5772 break;
5773 }
5774 }
5775
5776 if (i->second.con_front != NULL) {
5777 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5778
5779 total = 0;
5780 min = UINT_MAX;
5781 max = 0;
5782 count = 0;
5783 which = 0;
5784 for (int32_t k = size - 1 ; k >= 0; --k) {
5785 ++count;
5786 int index = (i->second.hb_index + k) % size;
5787 total += i->second.hb_front_pingtime[index];
5788 if (i->second.hb_front_min[index] < min)
5789 min = i->second.hb_front_min[index];
5790 if (i->second.hb_front_max[index] > max)
5791 max = i->second.hb_front_max[index];
5792 if (count == 1 || count == 5 || count == 15) {
5793 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5794 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5795 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5796 which++;
5797 if (count == 15)
5798 break;
5799 }
5800 }
5801 }
5802 }
5803 } else {
5804 std::lock_guard l(service.stat_lock);
5805 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5806 if (i->second.con_front != NULL)
5807 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5808 }
11fdf7f2 5809 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
7c673cae
FG
5810 }
5811
11fdf7f2
TL
5812 if (i->second.is_healthy(now)) {
5813 // Cancel false reports
5814 auto failure_queue_entry = failure_queue.find(from);
5815 if (failure_queue_entry != failure_queue.end()) {
5816 dout(10) << "handle_osd_ping canceling queued "
5817 << "failure report for osd." << from << dendl;
5818 failure_queue.erase(failure_queue_entry);
5819 }
5820
5821 auto failure_pending_entry = failure_pending.find(from);
5822 if (failure_pending_entry != failure_pending.end()) {
5823 dout(10) << "handle_osd_ping canceling in-flight "
5824 << "failure report for osd." << from << dendl;
5825 send_still_alive(curmap->get_epoch(),
5826 from,
5827 failure_pending_entry->second.second);
5828 failure_pending.erase(failure_pending_entry);
5829 }
7c673cae 5830 }
11fdf7f2
TL
5831 } else {
5832 // old replies, deprecated by newly sent pings.
9f95a23c 5833 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
11fdf7f2
TL
5834 << ") is found, treat as covered by newly sent pings "
5835 << "and ignore"
5836 << dendl;
7c673cae
FG
5837 }
5838 }
5839
5840 if (m->map_epoch &&
5841 curmap->is_up(from)) {
7c673cae 5842 if (is_active()) {
9f95a23c
TL
5843 ConnectionRef cluster_con = service.get_con_osd_cluster(
5844 from, curmap->get_epoch());
5845 if (cluster_con) {
5846 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
7c673cae
FG
5847 }
5848 }
5849 }
9f95a23c
TL
5850
5851 s->stamps->got_ping_reply(
5852 mnow,
5853 m->mono_send_stamp,
5854 m->delta_ub);
5855 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
7c673cae
FG
5856 }
5857 break;
5858
5859 case MOSDPing::YOU_DIED:
5860 dout(10) << "handle_osd_ping " << m->get_source_inst()
5861 << " says i am down in " << m->map_epoch << dendl;
5862 osdmap_subscribe(curmap->get_epoch()+1, false);
5863 break;
5864 }
5865
9f95a23c 5866 heartbeat_lock.unlock();
7c673cae
FG
5867 m->put();
5868}
5869
5870void OSD::heartbeat_entry()
5871{
9f95a23c 5872 std::unique_lock l(heartbeat_lock);
7c673cae
FG
5873 if (is_stopping())
5874 return;
5875 while (!heartbeat_stop) {
5876 heartbeat();
5877
eafe8130
TL
5878 double wait;
5879 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5880 wait = (float)cct->_conf->osd_heartbeat_interval;
5881 } else {
5882 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5883 }
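    // e.g. with osd_heartbeat_interval=6 (a typical default; treat as an
    // assumption) the randomized wait lands in {0.5, 1.1, 1.7, ..., 5.9}s,
    // spreading peers' pings so they do not all fire in lockstep.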
9f95a23c 5884 auto w = ceph::make_timespan(wait);
7c673cae 5885 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
9f95a23c 5886 heartbeat_cond.wait_for(l, w);
7c673cae
FG
5887 if (is_stopping())
5888 return;
5889 dout(30) << "heartbeat_entry woke up" << dendl;
5890 }
5891}
5892
5893void OSD::heartbeat_check()
5894{
9f95a23c 5895 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
7c673cae
FG
5896 utime_t now = ceph_clock_now();
5897
11fdf7f2 5898 // check for incoming heartbeats (move me elsewhere?)
7c673cae
FG
5899 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5900 p != heartbeat_peers.end();
5901 ++p) {
5902
5903 if (p->second.first_tx == utime_t()) {
5904 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
11fdf7f2 5905 << " yet, skipping" << dendl;
7c673cae
FG
5906 continue;
5907 }
5908
5909 dout(25) << "heartbeat_check osd." << p->first
5910 << " first_tx " << p->second.first_tx
5911 << " last_tx " << p->second.last_tx
5912 << " last_rx_back " << p->second.last_rx_back
5913 << " last_rx_front " << p->second.last_rx_front
5914 << dendl;
11fdf7f2
TL
5915 if (p->second.is_unhealthy(now)) {
5916 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
7c673cae
FG
5917 if (p->second.last_rx_back == utime_t() ||
5918 p->second.last_rx_front == utime_t()) {
11fdf7f2
TL
5919 derr << "heartbeat_check: no reply from "
5920 << p->second.con_front->get_peer_addr().get_sockaddr()
5921 << " osd." << p->first
5922 << " ever on either front or back, first ping sent "
5923 << p->second.first_tx
5924 << " (oldest deadline " << oldest_deadline << ")"
5925 << dendl;
7c673cae 5926 // fail
11fdf7f2 5927 failure_queue[p->first] = p->second.first_tx;
7c673cae 5928 } else {
11fdf7f2
TL
5929 derr << "heartbeat_check: no reply from "
5930 << p->second.con_front->get_peer_addr().get_sockaddr()
7c673cae
FG
5931 << " osd." << p->first << " since back " << p->second.last_rx_back
5932 << " front " << p->second.last_rx_front
11fdf7f2
TL
5933 << " (oldest deadline " << oldest_deadline << ")"
5934 << dendl;
7c673cae 5935 // fail
11fdf7f2 5936 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
7c673cae
FG
5937 }
5938 }
5939 }
5940}
5941
5942void OSD::heartbeat()
5943{
9f95a23c 5944 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
7c673cae
FG
5945 dout(30) << "heartbeat" << dendl;
5946
20effc67
TL
5947 auto load_for_logger = service.get_scrub_services().update_load_average();
5948 if (load_for_logger) {
5949 logger->set(l_osd_loadavg, load_for_logger.value());
7c673cae 5950 }
7c673cae
FG
5951 dout(30) << "heartbeat checking stats" << dendl;
5952
11fdf7f2 5953 // refresh peer list and osd stats
7c673cae
FG
5954 vector<int> hb_peers;
5955 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5956 p != heartbeat_peers.end();
5957 ++p)
5958 hb_peers.push_back(p->first);
7c673cae 5959
11fdf7f2
TL
5960 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5961 dout(5) << __func__ << " " << new_stat << dendl;
5962 ceph_assert(new_stat.statfs.total);
5963
5964 float pratio;
5965 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5966
5967 service.check_full_status(ratio, pratio);
7c673cae
FG
5968
5969 utime_t now = ceph_clock_now();
9f95a23c 5970 auto mnow = service.get_mnow();
11fdf7f2
TL
5971 utime_t deadline = now;
5972 deadline += cct->_conf->osd_heartbeat_grace;
7c673cae
FG
5973
5974 // send heartbeats
5975 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5976 i != heartbeat_peers.end();
5977 ++i) {
5978 int peer = i->first;
f67539c2
TL
5979 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5980 if (!s) {
5981 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5982 continue;
5983 }
9f95a23c
TL
5984 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5985
7c673cae
FG
5986 i->second.last_tx = now;
5987 if (i->second.first_tx == utime_t())
5988 i->second.first_tx = now;
11fdf7f2
TL
5989 i->second.ping_history[now] = make_pair(deadline,
5990 HeartbeatInfo::HEARTBEAT_MAX_CONN);
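    // each outstanding ping records its deadline plus an ack counter
    // (HEARTBEAT_MAX_CONN, presumably one ack per back/front connection);
    // handle_osd_ping() decrements it and erases the entry once it reaches zero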
eafe8130
TL
5991 if (i->second.hb_interval_start == utime_t())
5992 i->second.hb_interval_start = now;
9f95a23c 5993
9f95a23c
TL
5994 std::optional<ceph::signedspan> delta_ub;
5995 s->stamps->sent_ping(&delta_ub);
5996
5997 i->second.con_back->send_message(
5998 new MOSDPing(monc->get_fsid(),
5999 service.get_osdmap_epoch(),
6000 MOSDPing::PING,
6001 now,
6002 mnow,
6003 mnow,
6004 service.get_up_epoch(),
6005 cct->_conf->osd_heartbeat_min_size,
6006 delta_ub));
7c673cae
FG
6007
6008 if (i->second.con_front)
9f95a23c
TL
6009 i->second.con_front->send_message(
6010 new MOSDPing(monc->get_fsid(),
6011 service.get_osdmap_epoch(),
6012 MOSDPing::PING,
6013 now,
6014 mnow,
6015 mnow,
6016 service.get_up_epoch(),
6017 cct->_conf->osd_heartbeat_min_size,
6018 delta_ub));
7c673cae
FG
6019 }
6020
6021 logger->set(l_osd_hb_to, heartbeat_peers.size());
6022
6023 // hmm.. am i all alone?
6024 dout(30) << "heartbeat lonely?" << dendl;
6025 if (heartbeat_peers.empty()) {
6026 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
6027 last_mon_heartbeat = now;
6028 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
9f95a23c 6029 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
6030 }
6031 }
6032
6033 dout(30) << "heartbeat done" << dendl;
6034}
6035
6036bool OSD::heartbeat_reset(Connection *con)
6037{
11fdf7f2
TL
6038 std::lock_guard l(heartbeat_lock);
6039 auto s = con->get_priv();
9f95a23c 6040 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
11fdf7f2 6041 con->set_priv(nullptr);
7c673cae 6042 if (s) {
7c673cae 6043 if (is_stopping()) {
7c673cae
FG
6044 return true;
6045 }
9f95a23c
TL
6046 auto session = static_cast<Session*>(s.get());
6047 auto p = heartbeat_peers.find(session->peer);
7c673cae
FG
6048 if (p != heartbeat_peers.end() &&
6049 (p->second.con_back == con ||
6050 p->second.con_front == con)) {
6051 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6052 << ", reopening" << dendl;
9f95a23c 6053 p->second.clear_mark_down(con);
7c673cae
FG
6054 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
6055 if (newcon.first) {
6056 p->second.con_back = newcon.first.get();
11fdf7f2 6057 p->second.con_back->set_priv(s);
7c673cae
FG
6058 if (newcon.second) {
6059 p->second.con_front = newcon.second.get();
11fdf7f2 6060 p->second.con_front->set_priv(s);
7c673cae 6061 }
11fdf7f2 6062 p->second.ping_history.clear();
7c673cae
FG
6063 } else {
6064 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6065 << ", raced with osdmap update, closing out peer" << dendl;
6066 heartbeat_peers.erase(p);
6067 }
6068 } else {
6069 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
6070 }
7c673cae
FG
6071 }
6072 return true;
6073}
6074
6075
6076
6077// =========================================
6078
6079void OSD::tick()
6080{
9f95a23c 6081 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6082 dout(10) << "tick" << dendl;
6083
9f95a23c
TL
6084 utime_t now = ceph_clock_now();
6085 // throw out any obsolete markdown log
6086 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6087 while (!osd_markdown_log.empty() &&
6088 osd_markdown_log.front() + grace < now)
6089 osd_markdown_log.pop_front();
6090
7c673cae
FG
6091 if (is_active() || is_waiting_for_healthy()) {
6092 maybe_update_heartbeat_peers();
6093 }
6094
6095 if (is_waiting_for_healthy()) {
6096 start_boot();
494da23a
TL
6097 }
6098
6099 if (is_waiting_for_healthy() || is_booting()) {
6100 std::lock_guard l(heartbeat_lock);
494da23a
TL
6101 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
6102 last_mon_heartbeat = now;
6103 dout(1) << __func__ << " checking mon for new map" << dendl;
9f95a23c 6104 osdmap_subscribe(get_osdmap_epoch() + 1, false);
11fdf7f2 6105 }
7c673cae
FG
6106 }
6107
6108 do_waiters();
6109
9f95a23c
TL
6110 // scrub purged_snaps every deep scrub interval
6111 {
6112 const utime_t last = superblock.last_purged_snaps_scrub;
6113 utime_t next = last;
6114 next += cct->_conf->osd_scrub_min_interval;
6115 std::mt19937 rng;
6116 // use a seed that is stable for each scrub interval, but varies
6117 // by OSD to avoid any herds.
6118 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
20effc67 6119 double r = (rng() % 1024) / 1024.0;
9f95a23c
TL
6120 next +=
6121 cct->_conf->osd_scrub_min_interval *
6122 cct->_conf->osd_scrub_interval_randomize_ratio * r;
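    // e.g. with hypothetical osd_scrub_min_interval=24h and
    // osd_scrub_interval_randomize_ratio=0.5, the next purged_snaps scrub lands
    // 24h..36h after the last one, with the offset stable per OSD until the
    // next scrub changes the seed.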
6123 if (next < ceph_clock_now()) {
6124 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6125 << " next " << next << " ... now" << dendl;
6126 scrub_purged_snaps();
6127 } else {
6128 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6129 << " next " << next << dendl;
6130 }
6131 }
6132
91327a77 6133 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
7c673cae
FG
6134}
6135
6136void OSD::tick_without_osd_lock()
6137{
9f95a23c 6138 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
7c673cae
FG
6139 dout(10) << "tick_without_osd_lock" << dendl;
6140
f67539c2
TL
6141 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
6142 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
6143 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
11fdf7f2
TL
6144
6145 // refresh osd stats
6146 struct store_statfs_t stbuf;
6147 osd_alert_list_t alerts;
6148 int r = store->statfs(&stbuf, &alerts);
6149 ceph_assert(r == 0);
6150 service.set_statfs(stbuf, alerts);
7c673cae
FG
6151
6152 // osd_lock is not being held, which means the OSD state
6153 // might change when doing the monitor report
6154 if (is_active() || is_waiting_for_healthy()) {
9f95a23c
TL
6155 {
6156 std::lock_guard l{heartbeat_lock};
6157 heartbeat_check();
6158 }
6159 map_lock.lock_shared();
11fdf7f2 6160 std::lock_guard l(mon_report_lock);
7c673cae
FG
6161
6162 // mon report?
7c673cae 6163 utime_t now = ceph_clock_now();
11fdf7f2
TL
6164 if (service.need_fullness_update() ||
6165 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
7c673cae 6166 last_mon_report = now;
7c673cae
FG
6167 send_full_update();
6168 send_failures();
7c673cae 6169 }
9f95a23c 6170 map_lock.unlock_shared();
11fdf7f2
TL
6171
6172 epoch_t max_waiting_epoch = 0;
6173 for (auto s : shards) {
6174 max_waiting_epoch = std::max(max_waiting_epoch,
6175 s->get_max_waiting_epoch());
6176 }
6177 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6178 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6179 << ", requesting new map" << dendl;
6180 osdmap_subscribe(superblock.newest_map + 1, false);
6181 }
7c673cae
FG
6182 }
6183
6184 if (is_active()) {
6185 if (!scrub_random_backoff()) {
6186 sched_scrub();
6187 }
6188 service.promote_throttle_recalibrate();
3efd9988 6189 resume_creating_pg();
224ce89b
WB
6190 bool need_send_beacon = false;
6191 const auto now = ceph::coarse_mono_clock::now();
6192 {
6193 // borrow the lec (last_epoch_clean) lock to protect last_sent_beacon from changing
11fdf7f2 6194 std::lock_guard l{min_last_epoch_clean_lock};
224ce89b 6195 const auto elapsed = now - last_sent_beacon;
f67539c2 6196 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
224ce89b
WB
6197 cct->_conf->osd_beacon_report_interval) {
6198 need_send_beacon = true;
6199 }
6200 }
6201 if (need_send_beacon) {
6202 send_beacon(now);
6203 }
7c673cae
FG
6204 }
6205
11fdf7f2 6206 mgrc.update_daemon_health(get_health_metrics());
7c673cae 6207 service.kick_recovery_queue();
91327a77
AA
6208 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6209 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
6210}
6211
7c673cae
FG
6212// Usage:
6213// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6214// rmomapkey <pool-id> [namespace/]<obj-name> <key>
6215// setomapheader <pool-id> [namespace/]<obj-name> <header>
6216// getomap <pool> [namespace/]<obj-name>
6217// truncobj <pool-id> [namespace/]<obj-name> <newlen>
6218// injectmdataerr [namespace/]<obj-name> [shardid]
6219// injectdataerr [namespace/]<obj-name> [shardid]
6220//
6221// set_recovery_delay [utime]
6222void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
11fdf7f2
TL
6223 std::string_view command,
6224 const cmdmap_t& cmdmap, ostream &ss)
7c673cae
FG
6225{
6226 //Test support
6227 //Support changing the omap on a single osd by using the Admin Socket to
6228 //directly request the osd make a change.
6229 if (command == "setomapval" || command == "rmomapkey" ||
6230 command == "setomapheader" || command == "getomap" ||
6231 command == "truncobj" || command == "injectmdataerr" ||
6232 command == "injectdataerr"
6233 ) {
6234 pg_t rawpg;
6235 int64_t pool;
6236 OSDMapRef curmap = service->get_osdmap();
6237 int r = -1;
6238
6239 string poolstr;
6240
9f95a23c 6241 cmd_getval(cmdmap, "pool", poolstr);
7c673cae
FG
6242 pool = curmap->lookup_pg_pool_name(poolstr);
6243 //If we can't find it by name then maybe id specified
6244 if (pool < 0 && isdigit(poolstr[0]))
6245 pool = atoll(poolstr.c_str());
6246 if (pool < 0) {
b5b8bbf5 6247 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
6248 return;
6249 }
6250
6251 string objname, nspace;
9f95a23c 6252 cmd_getval(cmdmap, "objname", objname);
7c673cae
FG
6253 std::size_t found = objname.find_first_of('/');
6254 if (found != string::npos) {
6255 nspace = objname.substr(0, found);
6256 objname = objname.substr(found+1);
6257 }
6258 object_locator_t oloc(pool, nspace);
6259 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6260
6261 if (r < 0) {
6262 ss << "Invalid namespace/objname";
6263 return;
6264 }
6265
20effc67 6266 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
7c673cae
FG
6267 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6268 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6269 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6270 if (curmap->pg_is_ec(rawpg)) {
6271 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6272 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
6273 return;
6274 }
6275 }
6276
6277 ObjectStore::Transaction t;
6278
6279 if (command == "setomapval") {
6280 map<string, bufferlist> newattrs;
6281 bufferlist val;
6282 string key, valstr;
9f95a23c
TL
6283 cmd_getval(cmdmap, "key", key);
6284 cmd_getval(cmdmap, "val", valstr);
7c673cae
FG
6285
6286 val.append(valstr);
6287 newattrs[key] = val;
6288 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
11fdf7f2 6289 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6290 if (r < 0)
6291 ss << "error=" << r;
6292 else
6293 ss << "ok";
6294 } else if (command == "rmomapkey") {
6295 string key;
9f95a23c 6296 cmd_getval(cmdmap, "key", key);
7c673cae 6297
9f95a23c 6298 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
11fdf7f2 6299 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6300 if (r < 0)
6301 ss << "error=" << r;
6302 else
6303 ss << "ok";
6304 } else if (command == "setomapheader") {
6305 bufferlist newheader;
6306 string headerstr;
6307
9f95a23c 6308 cmd_getval(cmdmap, "header", headerstr);
7c673cae
FG
6309 newheader.append(headerstr);
6310 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
11fdf7f2 6311 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6312 if (r < 0)
6313 ss << "error=" << r;
6314 else
6315 ss << "ok";
6316 } else if (command == "getomap") {
6317 //Debug: Output entire omap
6318 bufferlist hdrbl;
6319 map<string, bufferlist> keyvals;
11fdf7f2
TL
6320 auto ch = store->open_collection(coll_t(pgid));
6321 if (!ch) {
6322 ss << "unable to open collection for " << pgid;
6323 r = -ENOENT;
6324 } else {
6325 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6326 if (r >= 0) {
7c673cae
FG
6327 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6328 for (map<string, bufferlist>::iterator it = keyvals.begin();
11fdf7f2 6329 it != keyvals.end(); ++it)
7c673cae
FG
6330 ss << " key=" << (*it).first << " val="
6331 << string((*it).second.c_str(), (*it).second.length());
11fdf7f2 6332 } else {
7c673cae 6333 ss << "error=" << r;
11fdf7f2 6334 }
7c673cae
FG
6335 }
6336 } else if (command == "truncobj") {
6337 int64_t trunclen;
9f95a23c 6338 cmd_getval(cmdmap, "len", trunclen);
7c673cae 6339 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
11fdf7f2 6340 r = store->queue_transaction(service->meta_ch, std::move(t));
7c673cae
FG
6341 if (r < 0)
6342 ss << "error=" << r;
6343 else
6344 ss << "ok";
6345 } else if (command == "injectdataerr") {
6346 store->inject_data_error(gobj);
6347 ss << "ok";
6348 } else if (command == "injectmdataerr") {
6349 store->inject_mdata_error(gobj);
6350 ss << "ok";
6351 }
6352 return;
6353 }
6354 if (command == "set_recovery_delay") {
20effc67 6355 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
7c673cae
FG
6356 ostringstream oss;
6357 oss << delay;
11fdf7f2 6358 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
7c673cae
FG
6359 oss.str().c_str());
6360 if (r != 0) {
6361 ss << "set_recovery_delay: error setting "
6362 << "osd_recovery_delay_start to '" << delay << "': error "
6363 << r;
6364 return;
6365 }
11fdf7f2 6366 service->cct->_conf.apply_changes(nullptr);
7c673cae
FG
6367 ss << "set_recovery_delay: set osd_recovery_delay_start "
6368 << "to " << service->cct->_conf->osd_recovery_delay_start;
6369 return;
6370 }
7c673cae 6371 if (command == "injectfull") {
20effc67
TL
6372 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6373 string type = cmd_getval_or<string>(cmdmap, "type", "full");
7c673cae 6374 OSDService::s_names state;
20effc67 6375
7c673cae
FG
6376 if (type == "none" || count == 0) {
6377 type = "none";
6378 count = 0;
6379 }
6380 state = service->get_full_state(type);
6381 if (state == OSDService::s_names::INVALID) {
6382 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6383 return;
6384 }
6385 service->set_injectfull(state, count);
6386 return;
6387 }
6388 ss << "Internal error - command=" << command;
6389}
6390
7c673cae
FG
6391// =========================================
6392
6393void OSD::ms_handle_connect(Connection *con)
6394{
6395 dout(10) << __func__ << " con " << con << dendl;
6396 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
11fdf7f2 6397 std::lock_guard l(osd_lock);
7c673cae
FG
6398 if (is_stopping())
6399 return;
6400 dout(10) << __func__ << " on mon" << dendl;
6401
6402 if (is_preboot()) {
6403 start_boot();
6404 } else if (is_booting()) {
6405 _send_boot(); // resend boot message
6406 } else {
9f95a23c 6407 map_lock.lock_shared();
11fdf7f2 6408 std::lock_guard l2(mon_report_lock);
7c673cae
FG
6409
6410 utime_t now = ceph_clock_now();
6411 last_mon_report = now;
6412
6413 // resend everything, it's a new session
6414 send_full_update();
6415 send_alive();
6416 service.requeue_pg_temp();
11fdf7f2 6417 service.clear_sent_ready_to_merge();
7c673cae 6418 service.send_pg_temp();
11fdf7f2
TL
6419 service.send_ready_to_merge();
6420 service.send_pg_created();
7c673cae
FG
6421 requeue_failures();
6422 send_failures();
7c673cae 6423
9f95a23c 6424 map_lock.unlock_shared();
7c673cae
FG
6425 if (is_active()) {
6426 send_beacon(ceph::coarse_mono_clock::now());
6427 }
6428 }
6429
6430 // full map requests may happen while active or pre-boot
6431 if (requested_full_first) {
6432 rerequest_full_maps();
6433 }
6434 }
6435}
6436
6437void OSD::ms_handle_fast_connect(Connection *con)
6438{
6439 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6440 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6441 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6442 s = ceph::make_ref<Session>(cct, con);
6443 con->set_priv(s);
7c673cae
FG
6444 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6445 << " addr=" << s->con->get_peer_addr() << dendl;
6446 // we don't connect to clients
11fdf7f2 6447 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6448 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6449 }
7c673cae
FG
6450 }
6451}
6452
6453void OSD::ms_handle_fast_accept(Connection *con)
6454{
6455 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6456 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
9f95a23c
TL
6457 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6458 s = ceph::make_ref<Session>(cct, con);
6459 con->set_priv(s);
7c673cae
FG
6460 dout(10) << "new session (incoming)" << s << " con=" << con
6461 << " addr=" << con->get_peer_addr()
6462 << " must have raced with connect" << dendl;
11fdf7f2 6463 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
7c673cae
FG
6464 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6465 }
7c673cae
FG
6466 }
6467}
6468
6469bool OSD::ms_handle_reset(Connection *con)
6470{
9f95a23c
TL
6471 auto session = ceph::ref_cast<Session>(con->get_priv());
6472 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
7c673cae
FG
6473 if (!session)
6474 return false;
6475 session->wstate.reset(con);
11fdf7f2
TL
6476 session->con->set_priv(nullptr);
6477 session->con.reset(); // break con <-> session ref cycle
7c673cae
FG
6478 // note that we break session->con *before* the session_handle_reset
6479 // cleanup below. this avoids a race between us and
6480 // PG::add_backoff, Session::check_backoff, etc.
9f95a23c 6481 session_handle_reset(session);
7c673cae
FG
6482 return true;
6483}
6484
6485bool OSD::ms_handle_refused(Connection *con)
6486{
6487 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6488 return false;
6489
9f95a23c
TL
6490 auto session = ceph::ref_cast<Session>(con->get_priv());
6491 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
7c673cae
FG
6492 if (!session)
6493 return false;
6494 int type = con->get_peer_type();
6495 // handle only OSD failures here
6496 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6497 OSDMapRef osdmap = get_osdmap();
6498 if (osdmap) {
6499 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6500 if (id >= 0 && osdmap->is_up(id)) {
6501 // I'm cheating mon heartbeat grace logic, because we know it's not going
6502 // to respawn alone. +1 so we won't hit any boundary case.
11fdf7f2
TL
6503 monc->send_mon_message(
6504 new MOSDFailure(
6505 monc->get_fsid(),
6506 id,
6507 osdmap->get_addrs(id),
6508 cct->_conf->osd_heartbeat_grace + 1,
6509 osdmap->get_epoch(),
6510 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6511 ));
7c673cae
FG
6512 }
6513 }
6514 }
7c673cae
FG
6515 return true;
6516}
6517
f67539c2 6518struct CB_OSD_GetVersion {
7c673cae 6519 OSD *osd;
f67539c2
TL
6520 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6521 void operator ()(boost::system::error_code ec, version_t newest,
6522 version_t oldest) {
6523 if (!ec)
7c673cae
FG
6524 osd->_got_mon_epochs(oldest, newest);
6525 }
6526};
6527
6528void OSD::start_boot()
6529{
6530 if (!_is_healthy()) {
6531 // if we are not healthy, do not mark ourselves up (yet)
6532 dout(1) << "not healthy; waiting to boot" << dendl;
6533 if (!is_waiting_for_healthy())
6534 start_waiting_for_healthy();
6535 // send pings sooner rather than later
6536 heartbeat_kick();
6537 return;
6538 }
6539 dout(1) << __func__ << dendl;
6540 set_state(STATE_PREBOOT);
6541 dout(10) << "start_boot - have maps " << superblock.oldest_map
6542 << ".." << superblock.newest_map << dendl;
f67539c2 6543 monc->get_version("osdmap", CB_OSD_GetVersion(this));
7c673cae
FG
6544}
6545
6546void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6547{
11fdf7f2 6548 std::lock_guard l(osd_lock);
7c673cae
FG
6549 if (is_preboot()) {
6550 _preboot(oldest, newest);
6551 }
6552}
6553
6554void OSD::_preboot(epoch_t oldest, epoch_t newest)
6555{
11fdf7f2 6556 ceph_assert(is_preboot());
7c673cae
FG
6557 dout(10) << __func__ << " _preboot mon has osdmaps "
6558 << oldest << ".." << newest << dendl;
6559
6560 // ensure our local fullness awareness is accurate
81eedcae
TL
6561 {
6562 std::lock_guard l(heartbeat_lock);
6563 heartbeat();
6564 }
7c673cae 6565
9f95a23c
TL
6566 const auto& monmap = monc->monmap;
6567 const auto osdmap = get_osdmap();
7c673cae 6568 // if our map within recent history, try to add ourselves to the osdmap.
31f18b77
FG
6569 if (osdmap->get_epoch() == 0) {
6570 derr << "waiting for initial osdmap" << dendl;
c07f9fc5 6571 } else if (osdmap->is_destroyed(whoami)) {
b32b8144
FG
6572 derr << "osdmap says I am destroyed" << dendl;
6573 // provide a small margin so we don't livelock seeing if we
6574 // un-destroyed ourselves.
6575 if (osdmap->get_epoch() > newest - 1) {
6576 exit(0);
6577 }
81eedcae 6578 } else if (osdmap->is_noup(whoami)) {
7c673cae
FG
6579 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6580 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6581 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6582 << dendl;
7c673cae
FG
6583 } else if (service.need_fullness_update()) {
6584 derr << "osdmap fullness state needs update" << dendl;
6585 send_full_update();
9f95a23c
TL
6586 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6587 superblock.purged_snaps_last < superblock.current_epoch) {
6588 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6589 << " < newest_map " << superblock.current_epoch << dendl;
6590 _get_purged_snaps();
7c673cae
FG
6591 } else if (osdmap->get_epoch() >= oldest - 1 &&
6592 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
11fdf7f2
TL
6593
6594 // wait for pgs to fully catch up in a different thread, since
6595 // this thread might be required for splitting and merging PGs to
6596 // make progress.
6597 boot_finisher.queue(
9f95a23c 6598 new LambdaContext(
11fdf7f2 6599 [this](int r) {
9f95a23c 6600 std::unique_lock l(osd_lock);
11fdf7f2
TL
6601 if (is_preboot()) {
6602 dout(10) << __func__ << " waiting for peering work to drain"
6603 << dendl;
9f95a23c 6604 l.unlock();
11fdf7f2 6605 for (auto shard : shards) {
9f95a23c 6606 shard->wait_min_pg_epoch(get_osdmap_epoch());
11fdf7f2 6607 }
9f95a23c 6608 l.lock();
11fdf7f2
TL
6609 }
6610 if (is_preboot()) {
6611 _send_boot();
6612 }
6613 }));
6614 return;
7c673cae
FG
6615 }
6616
6617 // get all the latest maps
6618 if (osdmap->get_epoch() + 1 >= oldest)
6619 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6620 else
6621 osdmap_subscribe(oldest - 1, true);
6622}
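// A brief orientation note on the ladder above: every branch that cannot
// proceed (no initial map, destroyed, NOUP, missing SORTBITWISE, a pending
// fullness update, or purged-snaps catch-up) falls through to the
// osdmap_subscribe() calls at the end, so preboot is simply re-evaluated as
// newer maps and mon replies arrive; only the "map is recent enough" branch
// returns early after queueing _send_boot() on the boot_finisher.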
6623
9f95a23c
TL
6624void OSD::_get_purged_snaps()
6625{
6626 // NOTE: this is a naive, stateless implementation. it may send multiple
6627 // overlapping requests to the mon, which will be somewhat inefficient, but
6628 // it should be reliable.
6629 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6630 << ", newest_map " << superblock.current_epoch << dendl;
6631 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6632 superblock.purged_snaps_last + 1,
6633 superblock.current_epoch + 1);
6634 monc->send_mon_message(m);
6635}
6636
6637void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6638{
6639 dout(10) << __func__ << " " << *m << dendl;
6640 ObjectStore::Transaction t;
6641 if (!is_preboot() ||
6642 m->last < superblock.purged_snaps_last) {
6643 goto out;
6644 }
20effc67 6645 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
9f95a23c
TL
6646 make_purged_snaps_oid(), &t,
6647 m->purged_snaps);
6648 superblock.purged_snaps_last = m->last;
6649 write_superblock(t);
6650 store->queue_transaction(
6651 service.meta_ch,
6652 std::move(t));
6653 service.publish_superblock(superblock);
6654 if (m->last < superblock.current_epoch) {
6655 _get_purged_snaps();
6656 } else {
6657 start_boot();
6658 }
6659out:
6660 m->put();
6661}
6662
7c673cae
FG
6663void OSD::send_full_update()
6664{
6665 if (!service.need_fullness_update())
6666 return;
6667 unsigned state = 0;
6668 if (service.is_full()) {
6669 state = CEPH_OSD_FULL;
6670 } else if (service.is_backfillfull()) {
6671 state = CEPH_OSD_BACKFILLFULL;
6672 } else if (service.is_nearfull()) {
6673 state = CEPH_OSD_NEARFULL;
6674 }
6675 set<string> s;
6676 OSDMap::calc_state_set(state, s);
6677 dout(10) << __func__ << " want state " << s << dendl;
9f95a23c 6678 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
7c673cae
FG
6679}
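// The fullness report always carries a single state flag; the set form is
// only built for the log line above. A minimal sketch of that translation,
// assuming the OSD is merely nearfull:
//
//   unsigned state = CEPH_OSD_NEARFULL;
//   set<string> names;
//   OSDMap::calc_state_set(state, names);   // e.g. {"nearfull"}
//   dout(10) << "would report " << names << dendl;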
6680
6681void OSD::start_waiting_for_healthy()
6682{
6683 dout(1) << "start_waiting_for_healthy" << dendl;
6684 set_state(STATE_WAITING_FOR_HEALTHY);
6685 last_heartbeat_resample = utime_t();
181888fb
FG
6686
6687 // subscribe to osdmap updates, in case our peers really are known to be dead
9f95a23c 6688 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
6689}
6690
6691bool OSD::_is_healthy()
6692{
6693 if (!cct->get_heartbeat_map()->is_healthy()) {
6694 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6695 return false;
6696 }
6697
6698 if (is_waiting_for_healthy()) {
11fdf7f2 6699 utime_t now = ceph_clock_now();
9f95a23c
TL
6700 if (osd_markdown_log.empty()) {
6701 dout(5) << __func__ << " force returning true since last markdown"
6702 << " was " << cct->_conf->osd_max_markdown_period
6703 << "s ago" << dendl;
11fdf7f2
TL
6704 return true;
6705 }
6706 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6707 int num = 0, up = 0;
6708 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6709 p != heartbeat_peers.end();
6710 ++p) {
11fdf7f2 6711 if (p->second.is_healthy(now))
7c673cae
FG
6712 ++up;
6713 ++num;
6714 }
6715 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6716 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6717 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6718 return false;
6719 }
6720 }
6721
6722 return true;
6723}
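// Worked example of the ratio check above, assuming
// osd_heartbeat_min_healthy_ratio is 0.33: with 9 heartbeat peers, seeing
// only 2 healthy fails (2 < 9 * 0.33 = 2.97) and keeps us waiting, while 3
// healthy peers (3 >= 2.97) lets _is_healthy() return true.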
6724
6725void OSD::_send_boot()
6726{
6727 dout(10) << "_send_boot" << dendl;
11fdf7f2
TL
6728 Connection *local_connection =
6729 cluster_messenger->get_loopback_connection().get();
6730 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6731 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6732 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6733 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6734
6735 dout(20) << " initial client_addrs " << client_addrs
6736 << ", cluster_addrs " << cluster_addrs
6737 << ", hb_back_addrs " << hb_back_addrs
6738 << ", hb_front_addrs " << hb_front_addrs
6739 << dendl;
6740 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6741 dout(10) << " assuming cluster_addrs match client_addrs "
6742 << client_addrs << dendl;
6743 cluster_addrs = cluster_messenger->get_myaddrs();
6744 }
6745 if (auto session = local_connection->get_priv(); !session) {
6746 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6747 }
6748
7c673cae 6749 local_connection = hb_back_server_messenger->get_loopback_connection().get();
11fdf7f2
TL
6750 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6751 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6752 << cluster_addrs << dendl;
6753 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
7c673cae 6754 }
11fdf7f2
TL
6755 if (auto session = local_connection->get_priv(); !session) {
6756 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
7c673cae
FG
6757 }
6758
11fdf7f2
TL
6759 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6760 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6761 dout(10) << " assuming hb_front_addrs match client_addrs "
6762 << client_addrs << dendl;
6763 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6764 }
6765 if (auto session = local_connection->get_priv(); !session) {
6766 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6767 }
6768
6769 // we now know what our front and back addrs will be, and we are
6770 // about to tell the mon what our metadata (including numa bindings)
6771 // are, so now is a good time!
6772 set_numa_affinity();
6773
6774 MOSDBoot *mboot = new MOSDBoot(
6775 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6776 hb_back_addrs, hb_front_addrs, cluster_addrs,
6777 CEPH_FEATURES_ALL);
6778 dout(10) << " final client_addrs " << client_addrs
6779 << ", cluster_addrs " << cluster_addrs
6780 << ", hb_back_addrs " << hb_back_addrs
6781 << ", hb_front_addrs " << hb_front_addrs
7c673cae
FG
6782 << dendl;
6783 _collect_metadata(&mboot->metadata);
6784 monc->send_mon_message(mboot);
6785 set_state(STATE_BOOTING);
6786}
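// The address learning above cascades: unknown cluster addrs are assumed to
// match the client addrs, unknown hb_back addrs to match the (possibly just
// learned) cluster addrs, and unknown hb_front addrs to match the client
// addrs. In effect a single-network deployment can boot with only the
// public/client address explicitly bound.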
6787
6788void OSD::_collect_metadata(map<string,string> *pm)
6789{
6790 // config info
6791 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
6792 if (store->get_type() == "filestore") {
6793 // not applicable for bluestore
6794 (*pm)["osd_journal"] = journal_path;
6795 }
11fdf7f2
TL
6796 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6797 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6798 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6799 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
7c673cae
FG
6800
6801 // backend
6802 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 6803 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 6804 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 6805 (*pm)["default_device_class"] = store->get_default_device_class();
f6b5b4d7
TL
6806 string osdspec_affinity;
6807 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6808 if (r < 0 || osdspec_affinity.empty()) {
6809 osdspec_affinity = "";
6810 }
6811 (*pm)["osdspec_affinity"] = osdspec_affinity;
39ae355f
TL
6812 string ceph_version_when_created;
6813 r = store->read_meta("ceph_version_when_created", &ceph_version_when_created);
6814 if (r < 0 || ceph_version_when_created.empty()) {
6815 ceph_version_when_created = "";
6816 }
6817 (*pm)["ceph_version_when_created"] = ceph_version_when_created;
6818 string created_at;
6819 r = store->read_meta("created_at", &created_at);
6820 if (r < 0 || created_at.empty()) {
6821 created_at = "";
6822 }
6823 (*pm)["created_at"] = created_at;
7c673cae
FG
6824 store->collect_metadata(pm);
6825
6826 collect_sys_info(pm, cct);
6827
11fdf7f2
TL
6828 (*pm)["front_iface"] = pick_iface(
6829 cct,
6830 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6831 (*pm)["back_iface"] = pick_iface(
6832 cct,
6833 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6834
6835 // network numa
6836 {
6837 int node = -1;
6838 set<int> nodes;
6839 set<string> unknown;
6840 for (auto nm : { "front_iface", "back_iface" }) {
6841 if (!(*pm)[nm].size()) {
6842 unknown.insert(nm);
6843 continue;
6844 }
6845 int n = -1;
6846 int r = get_iface_numa_node((*pm)[nm], &n);
6847 if (r < 0) {
6848 unknown.insert((*pm)[nm]);
6849 continue;
6850 }
6851 nodes.insert(n);
6852 if (node < 0) {
6853 node = n;
6854 }
6855 }
6856 if (unknown.size()) {
6857 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6858 }
6859 if (!nodes.empty()) {
6860 (*pm)["network_numa_nodes"] = stringify(nodes);
6861 }
6862 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6863 (*pm)["network_numa_node"] = stringify(node);
6864 }
6865 }
6866
6867 if (numa_node >= 0) {
6868 (*pm)["numa_node"] = stringify(numa_node);
6869 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6870 &numa_cpu_set);
6871 }
6872
6873 set<string> devnames;
6874 store->get_devices(&devnames);
9f95a23c
TL
6875 map<string,string> errs;
6876 get_device_metadata(devnames, pm, &errs);
6877 for (auto& i : errs) {
6878 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
11fdf7f2 6879 }
7c673cae
FG
6880 dout(10) << __func__ << " " << *pm << dendl;
6881}
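// For illustration only, a hypothetical slice of the metadata map assembled
// above (the keys are ones set here; the values are invented):
//
//   "osd_objectstore"   -> "bluestore"
//   "rotational"        -> "0"
//   "numa_node"         -> "0"
//   "network_numa_node" -> "0"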
6882
6883void OSD::queue_want_up_thru(epoch_t want)
6884{
9f95a23c
TL
6885 std::shared_lock map_locker{map_lock};
6886 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6887 std::lock_guard report_locker(mon_report_lock);
7c673cae
FG
6888 if (want > up_thru_wanted) {
6889 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6890 << ", currently " << cur
6891 << dendl;
6892 up_thru_wanted = want;
6893 send_alive();
6894 } else {
6895 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6896 << ", currently " << cur
6897 << dendl;
6898 }
7c673cae
FG
6899}
6900
6901void OSD::send_alive()
6902{
9f95a23c
TL
6903 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6904 const auto osdmap = get_osdmap();
7c673cae
FG
6905 if (!osdmap->exists(whoami))
6906 return;
6907 epoch_t up_thru = osdmap->get_up_thru(whoami);
6908 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6909 if (up_thru_wanted > up_thru) {
6910 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6911 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6912 }
6913}
6914
6915void OSD::request_full_map(epoch_t first, epoch_t last)
6916{
6917 dout(10) << __func__ << " " << first << ".." << last
6918 << ", previously requested "
6919 << requested_full_first << ".." << requested_full_last << dendl;
9f95a23c 6920 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
6921 ceph_assert(first > 0 && last > 0);
6922 ceph_assert(first <= last);
6923 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
7c673cae
FG
6924 if (requested_full_first == 0) {
6925 // first request
6926 requested_full_first = first;
6927 requested_full_last = last;
6928 } else if (last <= requested_full_last) {
6929 // dup
6930 return;
6931 } else {
6932 // additional request
6933 first = requested_full_last + 1;
6934 requested_full_last = last;
6935 }
6936 MMonGetOSDMap *req = new MMonGetOSDMap;
6937 req->request_full(first, last);
6938 monc->send_mon_message(req);
6939}
6940
6941void OSD::got_full_map(epoch_t e)
6942{
11fdf7f2 6943 ceph_assert(requested_full_first <= requested_full_last);
9f95a23c 6944 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
6945 if (requested_full_first == 0) {
6946 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6947 return;
6948 }
6949 if (e < requested_full_first) {
6950 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6951 << ".." << requested_full_last
6952 << ", ignoring" << dendl;
6953 return;
6954 }
6955 if (e >= requested_full_last) {
6956 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6957 << ".." << requested_full_last << ", resetting" << dendl;
6958 requested_full_first = requested_full_last = 0;
6959 return;
6960 }
f67539c2 6961
7c673cae
FG
6962 requested_full_first = e + 1;
6963
6964 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6965 << ".." << requested_full_last
6966 << ", still need more" << dendl;
6967}
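// Sketch of the window bookkeeping above: after request_full_map(10, 15),
// receiving full map 10 advances requested_full_first to 11 (still waiting
// on 11..15); receiving map 15 (>= requested_full_last) resets the window
// back to 0..0.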
6968
6969void OSD::requeue_failures()
6970{
11fdf7f2 6971 std::lock_guard l(heartbeat_lock);
7c673cae
FG
6972 unsigned old_queue = failure_queue.size();
6973 unsigned old_pending = failure_pending.size();
11fdf7f2 6974 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
7c673cae
FG
6975 failure_queue[p->first] = p->second.first;
6976 failure_pending.erase(p++);
6977 }
6978 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6979 << failure_queue.size() << dendl;
6980}
6981
6982void OSD::send_failures()
6983{
9f95a23c
TL
6984 ceph_assert(ceph_mutex_is_locked(map_lock));
6985 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
11fdf7f2 6986 std::lock_guard l(heartbeat_lock);
7c673cae 6987 utime_t now = ceph_clock_now();
9f95a23c 6988 const auto osdmap = get_osdmap();
7c673cae
FG
6989 while (!failure_queue.empty()) {
6990 int osd = failure_queue.begin()->first;
7c673cae
FG
6991 if (!failure_pending.count(osd)) {
6992 int failed_for = (int)(double)(now - failure_queue.begin()->second);
11fdf7f2
TL
6993 monc->send_mon_message(
6994 new MOSDFailure(
6995 monc->get_fsid(),
6996 osd,
6997 osdmap->get_addrs(osd),
6998 failed_for,
6999 osdmap->get_epoch()));
7000 failure_pending[osd] = make_pair(failure_queue.begin()->second,
7001 osdmap->get_addrs(osd));
7c673cae
FG
7002 }
7003 failure_queue.erase(osd);
7004 }
7005}
7006
11fdf7f2 7007void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
7c673cae 7008{
11fdf7f2
TL
7009 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
7010 MOSDFailure::FLAG_ALIVE);
7c673cae
FG
7011 monc->send_mon_message(m);
7012}
7013
11fdf7f2 7014void OSD::cancel_pending_failures()
7c673cae 7015{
11fdf7f2
TL
7016 std::lock_guard l(heartbeat_lock);
7017 auto it = failure_pending.begin();
7018 while (it != failure_pending.end()) {
7019 dout(10) << __func__ << " canceling in-flight failure report for osd."
7020 << it->first << dendl;
9f95a23c 7021 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
11fdf7f2 7022 failure_pending.erase(it++);
7c673cae 7023 }
7c673cae
FG
7024}
7025
7026void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
7027{
7028 const auto& monmap = monc->monmap;
7029 // send a beacon to the mon; if we have only just connected and the monmap
7030 // is not initialized yet, skip it for now.
7031 if (monmap.epoch > 0 &&
7032 monmap.get_required_features().contains_all(
7033 ceph::features::mon::FEATURE_LUMINOUS)) {
7034 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
7035 MOSDBeacon* beacon = nullptr;
7036 {
11fdf7f2 7037 std::lock_guard l{min_last_epoch_clean_lock};
9f95a23c
TL
7038 beacon = new MOSDBeacon(get_osdmap_epoch(),
7039 min_last_epoch_clean,
f67539c2
TL
7040 superblock.last_purged_snaps_scrub,
7041 cct->_conf->osd_beacon_report_interval);
494da23a 7042 beacon->pgs = min_last_epoch_clean_pgs;
224ce89b 7043 last_sent_beacon = now;
7c673cae
FG
7044 }
7045 monc->send_mon_message(beacon);
7046 } else {
7047 dout(20) << __func__ << " not sending" << dendl;
7048 }
7049}
7050
7c673cae
FG
7051void OSD::handle_command(MCommand *m)
7052{
7053 ConnectionRef con = m->get_connection();
9f95a23c 7054 auto session = ceph::ref_cast<Session>(con->get_priv());
7c673cae 7055 if (!session) {
9f95a23c 7056 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
7057 m->put();
7058 return;
7059 }
9f95a23c
TL
7060 if (!session->caps.allow_all()) {
7061 con->send_message(new MCommandReply(m, -EACCES));
7c673cae
FG
7062 m->put();
7063 return;
7064 }
9f95a23c 7065 cct->get_admin_socket()->queue_tell_command(m);
7c673cae
FG
7066 m->put();
7067}
7068
f64942e4
AA
7069namespace {
7070 class unlock_guard {
9f95a23c 7071 ceph::mutex& m;
f64942e4 7072 public:
9f95a23c 7073 explicit unlock_guard(ceph::mutex& mutex)
f64942e4
AA
7074 : m(mutex)
7075 {
11fdf7f2 7076 m.unlock();
f64942e4
AA
7077 }
7078 unlock_guard(unlock_guard&) = delete;
7079 ~unlock_guard() {
11fdf7f2 7080 m.lock();
f64942e4
AA
7081 }
7082 };
7083}
7084
9f95a23c 7085void OSD::scrub_purged_snaps()
7c673cae 7086{
9f95a23c
TL
7087 dout(10) << __func__ << dendl;
7088 ceph_assert(ceph_mutex_is_locked(osd_lock));
20effc67 7089 SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
9f95a23c
TL
7090 make_snapmapper_oid(),
7091 make_purged_snaps_oid());
7092 clog->debug() << "purged_snaps scrub starts";
7093 osd_lock.unlock();
7094 s.run();
7095 if (s.stray.size()) {
7096 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
7097 } else {
7098 clog->debug() << "purged_snaps scrub ok";
224ce89b 7099 }
9f95a23c
TL
7100 set<pair<spg_t,snapid_t>> queued;
7101 for (auto& [pool, snap, hash, shard] : s.stray) {
7102 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
7103 if (!pi) {
7104 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
7105 continue;
11fdf7f2 7106 }
9f95a23c
TL
7107 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
7108 spg_t spgid(pgid, shard);
7109 pair<spg_t,snapid_t> p(spgid, snap);
7110 if (queued.count(p)) {
7111 dout(20) << __func__ << " pg " << spgid << " snap " << snap
7112 << " already queued" << dendl;
7113 continue;
11fdf7f2 7114 }
9f95a23c
TL
7115 PGRef pg = lookup_lock_pg(spgid);
7116 if (!pg) {
7117 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
7118 continue;
11fdf7f2 7119 }
9f95a23c
TL
7120 queued.insert(p);
7121 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
7122 << snap << dendl;
7123 pg->queue_snap_retrim(snap);
7124 pg->unlock();
7c673cae 7125 }
9f95a23c
TL
7126 osd_lock.lock();
7127 if (is_stopping()) {
7128 return;
7129 }
7130 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
7131 ObjectStore::Transaction t;
7132 superblock.last_purged_snaps_scrub = ceph_clock_now();
7133 write_superblock(t);
7134 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7135 ceph_assert(tr == 0);
7136 if (is_active()) {
7137 send_beacon(ceph::coarse_mono_clock::now());
7138 }
7139 dout(10) << __func__ << " done" << dendl;
11fdf7f2
TL
7140}
7141
7142void OSD::probe_smart(const string& only_devid, ostream& ss)
7143{
7144 set<string> devnames;
7145 store->get_devices(&devnames);
7146 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7147 "osd_smart_report_timeout");
7148
7149 // == typedef std::map<std::string, mValue> mObject;
7150 json_spirit::mObject json_map;
7151
7152 for (auto dev : devnames) {
7153 // smartctl works only on physical devices; filter out any logical device
7154 if (dev.find("dm-") == 0) {
7155 continue;
7156 }
7157
7158 string err;
7159 string devid = get_device_id(dev, &err);
7160 if (devid.size() == 0) {
7161 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7162 << err << "), skipping" << dendl;
7163 continue;
7164 }
7165 if (only_devid.size() && devid != only_devid) {
7166 continue;
7167 }
7168
7169 json_spirit::mValue smart_json;
7170 if (block_device_get_metrics(dev, smart_timeout,
7171 &smart_json)) {
7172 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7173 continue;
7174 }
7175 json_map[devid] = smart_json;
7c673cae 7176 }
11fdf7f2 7177 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7c673cae
FG
7178}
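// The JSON written above maps each device id to whatever
// block_device_get_metrics() returned for it; a purely hypothetical shape:
//
//   {
//     "SAMSUNG_MZ7KM480_S2HTNX0J": { "smart_status": { "passed": true }, ... }
//   }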
7179
7180bool OSD::heartbeat_dispatch(Message *m)
7181{
7182 dout(30) << "heartbeat_dispatch " << m << dendl;
7183 switch (m->get_type()) {
7184
7185 case CEPH_MSG_PING:
7186 dout(10) << "ping from " << m->get_source_inst() << dendl;
7187 m->put();
7188 break;
7189
7190 case MSG_OSD_PING:
7191 handle_osd_ping(static_cast<MOSDPing*>(m));
7192 break;
7193
7194 default:
7195 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7196 m->put();
7197 }
7198
7199 return true;
7200}
7201
7202bool OSD::ms_dispatch(Message *m)
7203{
7204 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7205 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7206 service.got_stop_ack();
7207 m->put();
7208 return true;
7209 }
7210
7211 // lock!
7212
9f95a23c 7213 osd_lock.lock();
7c673cae 7214 if (is_stopping()) {
9f95a23c 7215 osd_lock.unlock();
7c673cae
FG
7216 m->put();
7217 return true;
7218 }
7219
7220 do_waiters();
7221 _dispatch(m);
7222
9f95a23c 7223 osd_lock.unlock();
7c673cae
FG
7224
7225 return true;
7226}
7227
9f95a23c
TL
7228void OSDService::maybe_share_map(
7229 Connection *con,
7230 const OSDMapRef& osdmap,
7231 epoch_t peer_epoch_lb)
7c673cae 7232{
9f95a23c
TL
7233 // NOTE: we assume the caller holds something that keeps the Connection itself
7234 // pinned (e.g., an OpRequest's MessageRef).
7235 auto session = ceph::ref_cast<Session>(con->get_priv());
7236 if (!session) {
7c673cae
FG
7237 return;
7238 }
7c673cae 7239
9f95a23c
TL
7240 // assume the peer has the newer of the op's sent_epoch and what
7241 // we think we sent them.
7c673cae 7242 session->sent_epoch_lock.lock();
9f95a23c
TL
7243 if (peer_epoch_lb > session->last_sent_epoch) {
7244 dout(10) << __func__ << " con " << con
7245 << " " << con->get_peer_addr()
7246 << " map epoch " << session->last_sent_epoch
7247 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7248 session->last_sent_epoch = peer_epoch_lb;
7249 }
7250 epoch_t last_sent_epoch = session->last_sent_epoch;
7c673cae
FG
7251 session->sent_epoch_lock.unlock();
7252
9f95a23c
TL
7253 if (osdmap->get_epoch() <= last_sent_epoch) {
7254 return;
7255 }
11fdf7f2 7256
9f95a23c
TL
7257 send_incremental_map(last_sent_epoch, con, osdmap);
7258 last_sent_epoch = osdmap->get_epoch();
7c673cae
FG
7259
7260 session->sent_epoch_lock.lock();
7261 if (session->last_sent_epoch < last_sent_epoch) {
9f95a23c
TL
7262 dout(10) << __func__ << " con " << con
7263 << " " << con->get_peer_addr()
7264 << " map epoch " << session->last_sent_epoch
7265 << " -> " << last_sent_epoch << " (shared)" << dendl;
7c673cae
FG
7266 session->last_sent_epoch = last_sent_epoch;
7267 }
7268 session->sent_epoch_lock.unlock();
7c673cae
FG
7269}
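// A hedged sketch of a typical call site (variable names assumed): after
// handling a message received on `con` whose sender advertised `peer_epoch`,
// a caller can nudge that peer forward with
//
//   service.maybe_share_map(con, get_osdmap(), peer_epoch);
//
// last_sent_epoch is re-checked under sent_epoch_lock after the send because
// another thread may have shared an even newer map concurrently.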
7270
9f95a23c 7271void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7c673cae 7272{
9f95a23c 7273 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7c673cae
FG
7274
7275 auto i = session->waiting_on_map.begin();
7276 while (i != session->waiting_on_map.end()) {
7277 OpRequestRef op = &(*i);
11fdf7f2 7278 ceph_assert(ms_can_fast_dispatch(op->get_req()));
9f95a23c 7279 auto m = op->get_req<MOSDFastDispatchOp>();
7c673cae
FG
7280 if (m->get_min_epoch() > osdmap->get_epoch()) {
7281 break;
7282 }
7283 session->waiting_on_map.erase(i++);
7284 op->put();
7285
7286 spg_t pgid;
7287 if (m->get_type() == CEPH_MSG_OSD_OP) {
7288 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7289 static_cast<const MOSDOp*>(m)->get_pg());
7290 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7291 continue;
7292 }
7293 } else {
7294 pgid = m->get_spg();
7295 }
11fdf7f2 7296 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7c673cae
FG
7297 }
7298
7299 if (session->waiting_on_map.empty()) {
7300 clear_session_waiting_on_map(session);
7301 } else {
7302 register_session_waiting_on_map(session);
7303 }
7304}
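// Ops whose MOSDFastDispatchOp::get_min_epoch() is still ahead of the map we
// were handed stay parked on session->waiting_on_map, and
// register_session_waiting_on_map() ensures the session is revisited once a
// newer map has been consumed.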
7305
7306void OSD::ms_fast_dispatch(Message *m)
7307{
20effc67 7308 auto dispatch_span = tracing::osd::tracer.start_trace(__func__);
11fdf7f2 7309 FUNCTRACE(cct);
7c673cae
FG
7310 if (service.is_stopping()) {
7311 m->put();
7312 return;
7313 }
11fdf7f2
TL
7314 // peering event?
7315 switch (m->get_type()) {
7316 case CEPH_MSG_PING:
7317 dout(10) << "ping from " << m->get_source() << dendl;
7318 m->put();
7319 return;
11fdf7f2
TL
7320 case MSG_OSD_FORCE_RECOVERY:
7321 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7322 return;
7323 case MSG_OSD_SCRUB2:
7324 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7325 return;
11fdf7f2
TL
7326 case MSG_OSD_PG_CREATE2:
7327 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
11fdf7f2
TL
7328 case MSG_OSD_PG_NOTIFY:
7329 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7330 case MSG_OSD_PG_INFO:
7331 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7332 case MSG_OSD_PG_REMOVE:
7333 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
11fdf7f2
TL
7334 // these are single-pg messages that handle themselves
7335 case MSG_OSD_PG_LOG:
7336 case MSG_OSD_PG_TRIM:
9f95a23c
TL
7337 case MSG_OSD_PG_NOTIFY2:
7338 case MSG_OSD_PG_QUERY2:
7339 case MSG_OSD_PG_INFO2:
11fdf7f2
TL
7340 case MSG_OSD_BACKFILL_RESERVE:
7341 case MSG_OSD_RECOVERY_RESERVE:
9f95a23c
TL
7342 case MSG_OSD_PG_LEASE:
7343 case MSG_OSD_PG_LEASE_ACK:
11fdf7f2
TL
7344 {
7345 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7346 if (require_osd_peer(pm)) {
7347 enqueue_peering_evt(
7348 pm->get_spg(),
7349 PGPeeringEventRef(pm->get_event()));
7350 }
7351 pm->put();
7352 return;
7353 }
7354 }
7355
7c673cae
FG
7356 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7357 {
7358#ifdef WITH_LTTNG
7359 osd_reqid_t reqid = op->get_reqid();
7360#endif
7361 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7362 reqid.name._num, reqid.tid, reqid.inc);
7363 }
20effc67
TL
7364 op->osd_parent_span = tracing::osd::tracer.add_span("op-request-created", dispatch_span);
7365
7c673cae
FG
7366 if (m->trace)
7367 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7368
11fdf7f2 7369 // note sender epoch, min req's epoch
7c673cae
FG
7370 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7371 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
11fdf7f2 7372 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7c673cae
FG
7373
7374 service.maybe_inject_dispatch_delay();
7375
7376 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7377 m->get_type() != CEPH_MSG_OSD_OP) {
7378 // queue it directly
7379 enqueue_op(
7380 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
11fdf7f2 7381 std::move(op),
7c673cae
FG
7382 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7383 } else {
7384 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7385 // message that didn't have an explicit spg_t); we need to map
7386 // them to an spg_t while preserving delivery order.
11fdf7f2
TL
7387 auto priv = m->get_connection()->get_priv();
7388 if (auto session = static_cast<Session*>(priv.get()); session) {
7389 std::lock_guard l{session->session_dispatch_lock};
7390 op->get();
7391 session->waiting_on_map.push_back(*op);
7392 OSDMapRef nextmap = service.get_nextmap_reserved();
7393 dispatch_session_waiting(session, nextmap);
7394 service.release_map(nextmap);
7c673cae
FG
7395 }
7396 }
f67539c2 7397 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7c673cae
FG
7398}
7399
11fdf7f2 7400int OSD::ms_handle_authentication(Connection *con)
7c673cae 7401{
11fdf7f2 7402 int ret = 0;
9f95a23c 7403 auto s = ceph::ref_cast<Session>(con->get_priv());
11fdf7f2 7404 if (!s) {
9f95a23c
TL
7405 s = ceph::make_ref<Session>(cct, con);
7406 con->set_priv(s);
11fdf7f2
TL
7407 s->entity_name = con->get_peer_entity_name();
7408 dout(10) << __func__ << " new session " << s << " con " << s->con
7409 << " entity " << s->entity_name
7410 << " addr " << con->get_peer_addrs() << dendl;
7411 } else {
7412 dout(10) << __func__ << " existing session " << s << " con " << s->con
7413 << " entity " << s->entity_name
7414 << " addr " << con->get_peer_addrs() << dendl;
7c673cae
FG
7415 }
7416
11fdf7f2 7417 AuthCapsInfo &caps_info = con->get_peer_caps_info();
9f95a23c 7418 if (caps_info.allow_all) {
11fdf7f2 7419 s->caps.set_allow_all();
9f95a23c 7420 } else if (caps_info.caps.length() > 0) {
11fdf7f2
TL
7421 bufferlist::const_iterator p = caps_info.caps.cbegin();
7422 string str;
7423 try {
7424 decode(str, p);
7425 }
f67539c2 7426 catch (ceph::buffer::error& e) {
11fdf7f2
TL
7427 dout(10) << __func__ << " session " << s << " " << s->entity_name
7428 << " failed to decode caps string" << dendl;
9f95a23c 7429 ret = -EACCES;
11fdf7f2
TL
7430 }
7431 if (!ret) {
7c673cae 7432 bool success = s->caps.parse(str);
11fdf7f2
TL
7433 if (success) {
7434 dout(10) << __func__ << " session " << s
7435 << " " << s->entity_name
7436 << " has caps " << s->caps << " '" << str << "'" << dendl;
7437 ret = 1;
7438 } else {
7439 dout(10) << __func__ << " session " << s << " " << s->entity_name
7440 << " failed to parse caps '" << str << "'" << dendl;
9f95a23c 7441 ret = -EACCES;
11fdf7f2 7442 }
7c673cae 7443 }
7c673cae 7444 }
11fdf7f2 7445 return ret;
7c673cae
FG
7446}
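// The decoded caps string follows the usual OSD cap grammar; a hypothetical
// value that s->caps.parse() would accept for a client confined to a single
// pool:
//
//   "allow rwx pool=rbd"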
7447
7448void OSD::do_waiters()
7449{
9f95a23c 7450 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7451
7452 dout(10) << "do_waiters -- start" << dendl;
7453 while (!finished.empty()) {
7454 OpRequestRef next = finished.front();
7455 finished.pop_front();
7456 dispatch_op(next);
7457 }
7458 dout(10) << "do_waiters -- finish" << dendl;
7459}
7460
7461void OSD::dispatch_op(OpRequestRef op)
7462{
7463 switch (op->get_req()->get_type()) {
7464
7465 case MSG_OSD_PG_CREATE:
7466 handle_pg_create(op);
7467 break;
7c673cae
FG
7468 }
7469}
7470
7471void OSD::_dispatch(Message *m)
7472{
9f95a23c 7473 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
7474 dout(20) << "_dispatch " << m << " " << *m << dendl;
7475
7476 switch (m->get_type()) {
7c673cae
FG
7477 // -- don't need OSDMap --
7478
7479 // map and replication
7480 case CEPH_MSG_OSD_MAP:
7481 handle_osd_map(static_cast<MOSDMap*>(m));
7482 break;
9f95a23c
TL
7483 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7484 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7485 break;
7c673cae
FG
7486
7487 // osd
7c673cae
FG
7488 case MSG_OSD_SCRUB:
7489 handle_scrub(static_cast<MOSDScrub*>(m));
7490 break;
7491
11fdf7f2
TL
7492 case MSG_COMMAND:
7493 handle_command(static_cast<MCommand*>(m));
7494 return;
c07f9fc5 7495
7c673cae
FG
7496 // -- need OSDMap --
7497
7498 case MSG_OSD_PG_CREATE:
7c673cae
FG
7499 {
7500 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7501 if (m->trace)
7502 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7503 // no map? starting up?
9f95a23c 7504 if (!get_osdmap()) {
7c673cae
FG
7505 dout(7) << "no OSDMap, not booted" << dendl;
7506 logger->inc(l_osd_waiting_for_map);
7507 waiting_for_osdmap.push_back(op);
7508 op->mark_delayed("no osdmap");
7509 break;
7510 }
7511
7512 // need OSDMap
7513 dispatch_op(op);
7514 }
7515 }
7516}
7517
11fdf7f2 7518// remove me post-nautilus
7c673cae
FG
7519void OSD::handle_scrub(MOSDScrub *m)
7520{
7521 dout(10) << "handle_scrub " << *m << dendl;
7522 if (!require_mon_or_mgr_peer(m)) {
7523 m->put();
7524 return;
7525 }
7526 if (m->fsid != monc->get_fsid()) {
11fdf7f2
TL
7527 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7528 << dendl;
7c673cae
FG
7529 m->put();
7530 return;
7531 }
7532
11fdf7f2
TL
7533 vector<spg_t> spgs;
7534 _get_pgids(&spgs);
7535
7536 if (!m->scrub_pgs.empty()) {
7537 vector<spg_t> v;
7538 for (auto pgid : m->scrub_pgs) {
7c673cae 7539 spg_t pcand;
9f95a23c 7540 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
11fdf7f2
TL
7541 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7542 v.push_back(pcand);
7c673cae
FG
7543 }
7544 }
11fdf7f2
TL
7545 spgs.swap(v);
7546 }
7547
7548 for (auto pgid : spgs) {
7549 enqueue_peering_evt(
7550 pgid,
7551 PGPeeringEventRef(
7552 std::make_shared<PGPeeringEvent>(
7553 get_osdmap_epoch(),
7554 get_osdmap_epoch(),
9f95a23c 7555 PeeringState::RequestScrub(m->deep, m->repair))));
7c673cae
FG
7556 }
7557
7558 m->put();
7559}
7560
11fdf7f2
TL
7561void OSD::handle_fast_scrub(MOSDScrub2 *m)
7562{
7563 dout(10) << __func__ << " " << *m << dendl;
7564 if (!require_mon_or_mgr_peer(m)) {
7565 m->put();
7566 return;
7567 }
7568 if (m->fsid != monc->get_fsid()) {
7569 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7570 << dendl;
7571 m->put();
7572 return;
7573 }
7574 for (auto pgid : m->scrub_pgs) {
7575 enqueue_peering_evt(
7576 pgid,
7577 PGPeeringEventRef(
7578 std::make_shared<PGPeeringEvent>(
7579 m->epoch,
7580 m->epoch,
9f95a23c 7581 PeeringState::RequestScrub(m->deep, m->repair))));
11fdf7f2
TL
7582 }
7583 m->put();
7584}
7585
7c673cae
FG
7586bool OSD::scrub_random_backoff()
7587{
7588 bool coin_flip = (rand() / (double)RAND_MAX >=
7589 cct->_conf->osd_scrub_backoff_ratio);
7590 if (!coin_flip) {
7591 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7592 return true;
7593 }
7594 return false;
7595}
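// Assuming osd_scrub_backoff_ratio is 0.66, roughly two out of three calls
// lose the coin flip and return true (back off), so only about a third of
// scheduling attempts fall through to actually picking a PG to scrub.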
7596
7c673cae 7597
20effc67 7598void OSD::sched_scrub()
f67539c2 7599{
20effc67 7600 auto& scrub_scheduler = service.get_scrub_services();
f67539c2 7601
20effc67
TL
7602 // fail fast if no resources are available
7603 if (!scrub_scheduler.can_inc_scrubs()) {
7604 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7605 return;
f67539c2 7606 }
f67539c2 7607
20effc67
TL
7608 // if there is a PG that is just now trying to reserve scrub replica resources -
7609 // we should wait and not initiate a new scrub
7610 if (scrub_scheduler.is_reserving_now()) {
7611 dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
7612 return;
9f95a23c 7613 }
9f95a23c 7614
20effc67 7615 Scrub::ScrubPreconds env_conditions;
28e407b8 7616
20effc67
TL
7617 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7618 if (!cct->_conf->osd_repair_during_recovery) {
7619 dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
7620 << dendl;
7621 return;
28e407b8 7622 }
20effc67
TL
7623 dout(10) << __func__
7624 << " will only schedule explicitly requested repair due to active recovery"
7625 << dendl;
7626 env_conditions.allow_requested_repair_only = true;
28e407b8
AA
7627 }
7628
20effc67
TL
7629 if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
7630 dout(20) << __func__ << " sched_scrub starts" << dendl;
7631 auto all_jobs = scrub_scheduler.list_registered_jobs();
7632 for (const auto& sj : all_jobs) {
7633 dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
7c673cae
FG
7634 }
7635 }
20effc67
TL
7636
7637 auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
7638 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
7639 << ")" << dendl;
7c673cae
FG
7640}
7641
20effc67
TL
7642Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
7643 bool allow_requested_repair_only)
7c673cae 7644{
20effc67 7645 dout(20) << __func__ << " trying " << pgid << dendl;
7c673cae 7646
20effc67
TL
7647 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7648 // allowed
7c673cae 7649
20effc67
TL
7650 PGRef pg = osd->lookup_lock_pg(pgid);
7651 if (!pg) {
7652 // the PG was dequeued in the short timespan between creating the candidates list
7653 // (collect_ripe_jobs()) and here
7654 dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
7655 return Scrub::schedule_result_t::no_such_pg;
7c673cae
FG
7656 }
7657
20effc67
TL
7658 // This has already started, so go on to the next scrub job
7659 if (pg->is_scrub_queued_or_active()) {
7660 pg->unlock();
7661 dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
7662 return Scrub::schedule_result_t::already_started;
7c673cae 7663 }
20effc67
TL
7664 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7665 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7666 pg->unlock();
7667 dout(10) << __func__ << " skip " << pgid
7668 << " because repairing is not explicitly requested on it" << dendl;
7669 return Scrub::schedule_result_t::preconditions;
b5b8bbf5
FG
7670 }
7671
20effc67
TL
7672 auto scrub_attempt = pg->sched_scrub();
7673 pg->unlock();
7674 return scrub_attempt;
7c673cae
FG
7675}
7676
494da23a
TL
7677void OSD::resched_all_scrubs()
7678{
7679 dout(10) << __func__ << ": start" << dendl;
20effc67
TL
7680 auto all_jobs = service.get_scrub_services().list_registered_jobs();
7681 for (auto& e : all_jobs) {
7682
7683 auto& job = *e;
7684 dout(20) << __func__ << ": examine " << job.pgid << dendl;
7685
7686 PGRef pg = _lookup_lock_pg(job.pgid);
7687 if (!pg)
7688 continue;
7689
7690 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7691 dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
7692 pg->reschedule_scrub();
7693 }
7694 pg->unlock();
494da23a
TL
7695 }
7696 dout(10) << __func__ << ": done" << dendl;
7697}
7698
11fdf7f2
TL
7699MPGStats* OSD::collect_pg_stats()
7700{
20effc67 7701 dout(15) << __func__ << dendl;
11fdf7f2
TL
7702 // This implementation unconditionally sends every is_primary PG's
7703 // stats every time we're called. This has equivalent cost to the
7704 // previous implementation's worst case where all PGs are busy and
7705 // their stats are always enqueued for sending.
9f95a23c 7706 std::shared_lock l{map_lock};
11fdf7f2 7707
11fdf7f2
TL
7708 osd_stat_t cur_stat = service.get_osd_stat();
7709 cur_stat.os_perf_stat = store->get_cur_stats();
7710
9f95a23c 7711 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
11fdf7f2
TL
7712 m->osd_stat = cur_stat;
7713
7714 std::lock_guard lec{min_last_epoch_clean_lock};
9f95a23c 7715 min_last_epoch_clean = get_osdmap_epoch();
11fdf7f2
TL
7716 min_last_epoch_clean_pgs.clear();
7717
7718 std::set<int64_t> pool_set;
7719 vector<PGRef> pgs;
7720 _get_pgs(&pgs);
7721 for (auto& pg : pgs) {
7722 auto pool = pg->pg_id.pgid.pool();
7723 pool_set.emplace((int64_t)pool);
7724 if (!pg->is_primary()) {
7725 continue;
7726 }
20effc67 7727 pg->with_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
11fdf7f2 7728 m->pg_stat[pg->pg_id.pgid] = s;
f67539c2 7729 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
11fdf7f2
TL
7730 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7731 });
7732 }
7733 store_statfs_t st;
81eedcae 7734 bool per_pool_stats = false;
9f95a23c 7735 bool per_pool_omap_stats = false;
11fdf7f2 7736 for (auto p : pool_set) {
9f95a23c 7737 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
11fdf7f2
TL
7738 if (r == -ENOTSUP) {
7739 break;
7740 } else {
7741 assert(r >= 0);
7742 m->pool_stat[p] = st;
81eedcae 7743 per_pool_stats = true;
11fdf7f2
TL
7744 }
7745 }
7c673cae 7746
81eedcae
TL
7747 // indicate whether we are reporting per-pool stats
7748 m->osd_stat.num_osds = 1;
7749 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
9f95a23c 7750 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
81eedcae 7751
11fdf7f2
TL
7752 return m;
7753}
7c673cae 7754
11fdf7f2 7755vector<DaemonHealthMetric> OSD::get_health_metrics()
b32b8144 7756{
11fdf7f2
TL
7757 vector<DaemonHealthMetric> metrics;
7758 {
7759 utime_t oldest_secs;
7760 const utime_t now = ceph_clock_now();
7761 auto too_old = now;
7762 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7763 int slow = 0;
7764 TrackedOpRef oldest_op;
20effc67
TL
7765 OSDMapRef osdmap = get_osdmap();
7766 // map of slow op counts by slow op event type, for aggregated logging to
7767 // the cluster log.
7768 map<uint8_t, int> slow_op_types;
7769 // map of slow op counts by pool, for reporting the pool name with the highest
7770 // slow ops.
7771 map<uint64_t, int> slow_op_pools;
7772 bool log_aggregated_slow_op =
7773 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
11fdf7f2
TL
7774 auto count_slow_ops = [&](TrackedOp& op) {
7775 if (op.get_initiated() < too_old) {
9f95a23c
TL
7776 stringstream ss;
7777 ss << "slow request " << op.get_desc()
7778 << " initiated "
7779 << op.get_initiated()
7780 << " currently "
7781 << op.state_string();
7782 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
20effc67
TL
7783 if (log_aggregated_slow_op) {
7784 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7785 uint8_t op_type = req->state_flag();
7786 auto m = req->get_req<MOSDFastDispatchOp>();
7787 uint64_t poolid = m->get_spg().pgid.m_pool;
7788 slow_op_types[op_type]++;
7789 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7790 slow_op_pools[poolid]++;
7791 }
7792 }
7793 } else {
7794 clog->warn() << ss.str();
7795 }
11fdf7f2
TL
7796 slow++;
7797 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7798 oldest_op = &op;
7799 }
7800 return true;
7801 } else {
7802 return false;
7803 }
7804 };
7805 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7806 if (slow) {
7807 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7808 << oldest_op->get_desc() << dendl;
20effc67
TL
7809 if (log_aggregated_slow_op &&
7810 slow_op_types.size() > 0) {
7811 stringstream ss;
7812 ss << slow << " slow requests (by type [ ";
7813 for (const auto& [op_type, count] : slow_op_types) {
7814 ss << "'" << OpRequest::get_state_string(op_type)
7815 << "' : " << count
7816 << " ";
7817 }
7818 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7819 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7820 return p1.second < p2.second;
7821 });
7822 if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7823 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7824 ss << "] most affected pool [ '"
7825 << pool_name
7826 << "' : "
7827 << slow_pool_it->second
7828 << " ])";
7829 } else {
7830 ss << "])";
7831 }
7832 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7833 clog->warn() << ss.str();
7834 }
11fdf7f2
TL
7835 }
7836 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7837 } else {
7838 // no news is not good news.
7839 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7840 }
7841 }
7842 {
7843 std::lock_guard l(pending_creates_lock);
7844 auto n_primaries = pending_creates_from_mon;
7845 for (const auto& create : pending_creates_from_osd) {
7846 if (create.second) {
7847 n_primaries++;
7848 }
b32b8144 7849 }
11fdf7f2 7850 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
b32b8144 7851 }
b32b8144
FG
7852 return metrics;
7853}
7854
7c673cae
FG
7855// =====================================================
7856// MAP
7857
7858void OSD::wait_for_new_map(OpRequestRef op)
7859{
7860 // ask?
7861 if (waiting_for_osdmap.empty()) {
9f95a23c 7862 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7c673cae
FG
7863 }
7864
7865 logger->inc(l_osd_waiting_for_map);
7866 waiting_for_osdmap.push_back(op);
7867 op->mark_delayed("wait for new map");
7868}
7869
7870
7871/** update_map
7872 * assimilate new OSDMap(s). scan pgs, etc.
7873 */
7874
7875void OSD::note_down_osd(int peer)
7876{
9f95a23c
TL
7877 ceph_assert(ceph_mutex_is_locked(osd_lock));
7878 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7c673cae 7879
9f95a23c 7880 std::lock_guard l{heartbeat_lock};
7c673cae
FG
7881 failure_queue.erase(peer);
7882 failure_pending.erase(peer);
7883 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7884 if (p != heartbeat_peers.end()) {
9f95a23c 7885 p->second.clear_mark_down();
7c673cae
FG
7886 heartbeat_peers.erase(p);
7887 }
7c673cae
FG
7888}
7889
7890void OSD::note_up_osd(int peer)
7891{
7c673cae
FG
7892 heartbeat_set_peers_need_update();
7893}
7894
7895struct C_OnMapCommit : public Context {
7896 OSD *osd;
7897 epoch_t first, last;
7898 MOSDMap *msg;
7899 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7900 : osd(o), first(f), last(l), msg(m) {}
7901 void finish(int r) override {
7902 osd->_committed_osd_maps(first, last, msg);
7903 msg->put();
7904 }
7905};
7906
7c673cae
FG
7907void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7908{
11fdf7f2 7909 std::lock_guard l(osdmap_subscribe_lock);
181888fb 7910 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7911 return;
7912
11fdf7f2 7913 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
181888fb 7914
7c673cae
FG
7915 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7916 force_request) {
7917 monc->renew_subs();
7918 }
7919}
7920
7921void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7922{
7923 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7924 if (min <= superblock.oldest_map)
7925 return;
7926
7927 int num = 0;
7928 ObjectStore::Transaction t;
7929 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7930 dout(20) << " removing old osdmap epoch " << e << dendl;
7931 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7932 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7933 superblock.oldest_map = e + 1;
7934 num++;
7935 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7936 service.publish_superblock(superblock);
7937 write_superblock(t);
11fdf7f2
TL
7938 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7939 ceph_assert(tr == 0);
7c673cae
FG
7940 num = 0;
7941 if (!skip_maps) {
7942 // skip_maps leaves us with a range of old maps if we fail to remove all
7943 // of them before moving superblock.oldest_map forward to the first map
7944 // in the incoming MOSDMap msg. so we should continue removing them in
7945 // this case, even though we could end up doing a huge series of delete
7946 // transactions all at once.
7947 break;
7948 }
7949 }
7950 }
7951 if (num > 0) {
7952 service.publish_superblock(superblock);
7953 write_superblock(t);
11fdf7f2
TL
7954 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7955 ceph_assert(tr == 0);
7c673cae
FG
7956 }
7957 // we should not remove the cached maps
11fdf7f2 7958 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7c673cae
FG
7959}
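// Example of the batching above (numbers assumed): with 100 stale epochs to
// trim, osd_target_transaction_size at 30 and a smaller nreceived, the first
// 30 removals are committed and the loop breaks, leaving the remainder for
// later MOSDMap messages; only when skip_maps is set does the loop keep
// committing further 30-epoch batches within the same call.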
7960
7961void OSD::handle_osd_map(MOSDMap *m)
7962{
11fdf7f2
TL
7963 // wait for pgs to catch up
7964 {
7965 // we extend the map cache pins to accommodate pgs slow to consume maps
7966 // for some period, until we hit the max_lag_factor bound, at which point
7967 // we block here to stop ingesting more maps than they are able to keep
7968 // up with.
7969 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7970 m_osd_pg_epoch_max_lag_factor;
7971 ceph_assert(max_lag > 0);
7972 epoch_t osd_min = 0;
7973 for (auto shard : shards) {
7974 epoch_t min = shard->get_min_pg_epoch();
7975 if (osd_min == 0 || min < osd_min) {
7976 osd_min = min;
7977 }
7978 }
9f95a23c 7979 epoch_t osdmap_epoch = get_osdmap_epoch();
11fdf7f2 7980 if (osd_min > 0 &&
9f95a23c
TL
7981 osdmap_epoch > max_lag &&
7982 osdmap_epoch - max_lag > osd_min) {
7983 epoch_t need = osdmap_epoch - max_lag;
11fdf7f2
TL
7984 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7985 << " max_lag " << max_lag << ")" << dendl;
7986 for (auto shard : shards) {
7987 epoch_t min = shard->get_min_pg_epoch();
7988 if (need > min) {
7989 dout(10) << __func__ << " waiting for pgs to consume " << need
7990 << " (shard " << shard->shard_id << " min " << min
7991 << ", map cache is " << cct->_conf->osd_map_cache_size
7992 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7993 << ")" << dendl;
7994 unlock_guard unlock{osd_lock};
7995 shard->wait_min_pg_epoch(need);
7996 }
7997 }
7998 }
7999 }
8000
9f95a23c 8001 ceph_assert(ceph_mutex_is_locked(osd_lock));
11fdf7f2
TL
8002 map<epoch_t,OSDMapRef> added_maps;
8003 map<epoch_t,bufferlist> added_maps_bl;
7c673cae
FG
8004 if (m->fsid != monc->get_fsid()) {
8005 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
8006 << monc->get_fsid() << dendl;
8007 m->put();
8008 return;
8009 }
8010 if (is_initializing()) {
8011 dout(0) << "ignoring osdmap until we have initialized" << dendl;
8012 m->put();
8013 return;
8014 }
8015
9f95a23c
TL
8016 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
8017 if (session && !(session->entity_name.is_mon() ||
7c673cae
FG
8018 session->entity_name.is_osd())) {
8019 //not enough perms!
8020 dout(10) << "got osd map from Session " << session
8021 << " which we can't take maps from (not a mon or osd)" << dendl;
8022 m->put();
7c673cae
FG
8023 return;
8024 }
7c673cae
FG
8025
8026 // share with the objecter
8027 if (!is_preboot())
8028 service.objecter->handle_osd_map(m);
8029
8030 epoch_t first = m->get_first();
8031 epoch_t last = m->get_last();
8032 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
8033 << superblock.newest_map
8034 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
8035 << dendl;
8036
8037 logger->inc(l_osd_map);
8038 logger->inc(l_osd_mape, last - first + 1);
8039 if (first <= superblock.newest_map)
8040 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8041 if (service.max_oldest_map < m->oldest_map) {
8042 service.max_oldest_map = m->oldest_map;
11fdf7f2 8043 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7c673cae
FG
8044 }
8045
8046 // make sure there is something new, here, before we bother flushing
8047 // the queues and such
8048 if (last <= superblock.newest_map) {
8049 dout(10) << " no new maps here, dropping" << dendl;
8050 m->put();
8051 return;
8052 }
8053
8054 // missing some?
8055 bool skip_maps = false;
8056 if (first > superblock.newest_map + 1) {
8057 dout(10) << "handle_osd_map message skips epochs "
8058 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8059 if (m->oldest_map <= superblock.newest_map + 1) {
8060 osdmap_subscribe(superblock.newest_map + 1, false);
8061 m->put();
8062 return;
8063 }
8064 // always try to get the full range of maps--as many as we can. this
8065 // 1- is good to have
8066 // 2- is at present the only way to ensure that we get a *full* map as
8067 // the first map!
8068 if (m->oldest_map < first) {
8069 osdmap_subscribe(m->oldest_map - 1, true);
8070 m->put();
8071 return;
8072 }
8073 skip_maps = true;
8074 }
8075
8076 ObjectStore::Transaction t;
8077 uint64_t txn_size = 0;
8078
9f95a23c
TL
8079 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8080
7c673cae 8081 // store new maps: queue for disk and put in the osdmap cache
11fdf7f2 8082 epoch_t start = std::max(superblock.newest_map + 1, first);
7c673cae
FG
8083 for (epoch_t e = start; e <= last; e++) {
8084 if (txn_size >= t.get_num_bytes()) {
8085 derr << __func__ << " transaction size overflowed" << dendl;
11fdf7f2 8086 ceph_assert(txn_size < t.get_num_bytes());
7c673cae
FG
8087 }
8088 txn_size = t.get_num_bytes();
8089 map<epoch_t,bufferlist>::iterator p;
8090 p = m->maps.find(e);
8091 if (p != m->maps.end()) {
8092 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8093 OSDMap *o = new OSDMap;
8094 bufferlist& bl = p->second;
8095
8096 o->decode(bl);
8097
9f95a23c
TL
8098 purged_snaps[e] = o->get_new_purged_snaps();
8099
7c673cae
FG
8100 ghobject_t fulloid = get_osdmap_pobject_name(e);
8101 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
11fdf7f2
TL
8102 added_maps[e] = add_map(o);
8103 added_maps_bl[e] = bl;
7c673cae
FG
8104 got_full_map(e);
8105 continue;
8106 }
8107
8108 p = m->incremental_maps.find(e);
8109 if (p != m->incremental_maps.end()) {
8110 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8111 bufferlist& bl = p->second;
8112 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8113 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7c673cae
FG
8114
8115 OSDMap *o = new OSDMap;
8116 if (e > 1) {
8117 bufferlist obl;
8118 bool got = get_map_bl(e - 1, obl);
11fdf7f2
TL
8119 if (!got) {
8120 auto p = added_maps_bl.find(e - 1);
8121 ceph_assert(p != added_maps_bl.end());
8122 obl = p->second;
8123 }
7c673cae
FG
8124 o->decode(obl);
8125 }
8126
8127 OSDMap::Incremental inc;
11fdf7f2 8128 auto p = bl.cbegin();
7c673cae 8129 inc.decode(p);
494da23a 8130
7c673cae 8131 if (o->apply_incremental(inc) < 0) {
9f95a23c 8132 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
11fdf7f2 8133 ceph_abort_msg("bad fsid");
7c673cae
FG
8134 }
8135
8136 bufferlist fbl;
8137 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8138
8139 bool injected_failure = false;
8140 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8141 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8142 derr << __func__ << " injecting map crc failure" << dendl;
8143 injected_failure = true;
8144 }
8145
8146 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8147 dout(2) << "got incremental " << e
8148 << " but failed to encode full with correct crc; requesting"
8149 << dendl;
8150 clog->warn() << "failed to encode map e" << e << " with expected crc";
8151 dout(20) << "my encoded map was:\n";
8152 fbl.hexdump(*_dout);
8153 *_dout << dendl;
8154 delete o;
8155 request_full_map(e, last);
8156 last = e - 1;
f6b5b4d7
TL
8157
8158 // don't continue committing if we failed to enc the first inc map
8159 if (last < start) {
8160 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8161 m->put();
8162 return;
8163 }
7c673cae
FG
8164 break;
8165 }
8166 got_full_map(e);
9f95a23c 8167 purged_snaps[e] = o->get_new_purged_snaps();
7c673cae
FG
8168
8169 ghobject_t fulloid = get_osdmap_pobject_name(e);
8170 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
11fdf7f2
TL
8171 added_maps[e] = add_map(o);
8172 added_maps_bl[e] = fbl;
7c673cae
FG
8173 continue;
8174 }
8175
11fdf7f2 8176 ceph_abort_msg("MOSDMap lied about what maps it had?");
7c673cae
FG
8177 }
8178
8179 // even if this map isn't from a mon, we may have satisfied our subscription
8180 monc->sub_got("osdmap", last);
8181
8182 if (!m->maps.empty() && requested_full_first) {
8183 dout(10) << __func__ << " still missing full maps " << requested_full_first
8184 << ".." << requested_full_last << dendl;
8185 rerequest_full_maps();
8186 }
8187
7c673cae
FG
8188 if (superblock.oldest_map) {
8189 // make sure we at least keep pace with incoming maps
8190 trim_maps(m->oldest_map, last - first + 1, skip_maps);
11fdf7f2 8191 pg_num_history.prune(superblock.oldest_map);
7c673cae
FG
8192 }
8193
8194 if (!superblock.oldest_map || skip_maps)
8195 superblock.oldest_map = first;
8196 superblock.newest_map = last;
8197 superblock.current_epoch = last;
8198
8199 // note in the superblock that we were clean thru the prior epoch
8200 epoch_t boot_epoch = service.get_boot_epoch();
8201 if (boot_epoch && boot_epoch >= superblock.mounted) {
8202 superblock.mounted = boot_epoch;
8203 superblock.clean_thru = last;
8204 }
8205
11fdf7f2
TL
8206 // check for pg_num changes and deleted pools
8207 OSDMapRef lastmap;
8208 for (auto& i : added_maps) {
8209 if (!lastmap) {
8210 if (!(lastmap = service.try_get_map(i.first - 1))) {
8211 dout(10) << __func__ << " can't get previous map " << i.first - 1
8212 << " probably first start of this osd" << dendl;
8213 continue;
8214 }
8215 }
8216 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8217 for (auto& j : lastmap->get_pools()) {
8218 if (!i.second->have_pg_pool(j.first)) {
8219 pg_num_history.log_pool_delete(i.first, j.first);
8220 dout(10) << __func__ << " recording final pg_pool_t for pool "
8221 << j.first << dendl;
8222 // this information is needed by _make_pg() if we have to restart before
8223 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8224 ghobject_t obj = make_final_pool_info_oid(j.first);
8225 bufferlist bl;
8226 encode(j.second, bl, CEPH_FEATURES_ALL);
8227 string name = lastmap->get_pool_name(j.first);
8228 encode(name, bl);
8229 map<string,string> profile;
8230 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8231 profile = lastmap->get_erasure_code_profile(
8232 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8233 }
8234 encode(profile, bl);
8235 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
11fdf7f2
TL
8236 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8237 new_pg_num != j.second.get_pg_num()) {
8238 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8239 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8240 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8241 }
8242 }
8243 for (auto& j : i.second->get_pools()) {
8244 if (!lastmap->have_pg_pool(j.first)) {
8245 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8246 << j.second.get_pg_num() << dendl;
8247 pg_num_history.log_pg_num_change(i.first, j.first,
8248 j.second.get_pg_num());
8249 }
8250 }
8251 lastmap = i.second;
8252 }
8253 pg_num_history.epoch = last;
8254 {
8255 bufferlist bl;
8256 ::encode(pg_num_history, bl);
8257 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8258 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8259 }
8260
9f95a23c
TL
8261 // record new purged_snaps
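  // only record when the incoming epoch range [start, last] is contiguous
  // with what has already been persisted (purged_snaps_last == start - 1);
  // otherwise skip so the on-disk record never has a gap.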
8262 if (superblock.purged_snaps_last == start - 1) {
20effc67 8263 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
9f95a23c
TL
8264 make_purged_snaps_oid(), &t,
8265 purged_snaps);
8266 superblock.purged_snaps_last = last;
8267 } else {
8268 dout(10) << __func__ << " superblock purged_snaps_last is "
8269 << superblock.purged_snaps_last
8270 << ", not recording new purged_snaps" << dendl;
8271 }
8272
7c673cae
FG
8273 // superblock and commit
8274 write_superblock(t);
11fdf7f2 8275 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
7c673cae 8276 store->queue_transaction(
11fdf7f2
TL
8277 service.meta_ch,
8278 std::move(t));
7c673cae
FG
8279 service.publish_superblock(superblock);
8280}
8281
8282void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8283{
8284 dout(10) << __func__ << " " << first << ".." << last << dendl;
8285 if (is_stopping()) {
8286 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8287 return;
8288 }
11fdf7f2 8289 std::lock_guard l(osd_lock);
31f18b77
FG
8290 if (is_stopping()) {
8291 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8292 return;
8293 }
9f95a23c 8294 map_lock.lock();
7c673cae 8295
f6b5b4d7
TL
8296 ceph_assert(first <= last);
8297
7c673cae
FG
8298 bool do_shutdown = false;
8299 bool do_restart = false;
8300 bool network_error = false;
f6b5b4d7 8301 OSDMapRef osdmap = get_osdmap();
7c673cae
FG
8302
8303 // advance through the new maps
8304 for (epoch_t cur = first; cur <= last; cur++) {
8305 dout(10) << " advance to epoch " << cur
8306 << " (<= last " << last
8307 << " <= newest_map " << superblock.newest_map
8308 << ")" << dendl;
8309
8310 OSDMapRef newmap = get_map(cur);
11fdf7f2 8311 ceph_assert(newmap); // we just cached it above!
7c673cae 8312
f67539c2 8313 // start blocklisting messages sent to peers that go down.
7c673cae
FG
8314 service.pre_publish_map(newmap);
8315
8316 // kill connections to newly down osds
8317 bool waited_for_reservations = false;
8318 set<int> old;
9f95a23c 8319 osdmap = get_osdmap();
7c673cae
FG
8320 osdmap->get_all_osds(old);
8321 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8322 if (*p != whoami &&
8323 osdmap->is_up(*p) && // in old map
8324 newmap->is_down(*p)) { // but not the new one
8325 if (!waited_for_reservations) {
8326 service.await_reserved_maps();
8327 waited_for_reservations = true;
8328 }
8329 note_down_osd(*p);
8330 } else if (*p != whoami &&
8331 osdmap->is_down(*p) &&
8332 newmap->is_up(*p)) {
8333 note_up_osd(*p);
8334 }
8335 }
8336
81eedcae 8337 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
7c673cae
FG
8338 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8339 << dendl;
8340 if (is_booting()) {
8341 // this captures the case where we sent the boot message while
8342 // NOUP was being set on the mon and our boot request was
8343 // dropped, and then later it is cleared. it imperfectly
8344 // handles the case where our original boot message was not
8345 // dropped and we restart even though we might have booted, but
8346 // that is harmless (boot will just take slightly longer).
8347 do_restart = true;
8348 }
8349 }
8350
9f95a23c
TL
8351 osdmap = std::move(newmap);
8352 set_osdmap(osdmap);
7c673cae
FG
8353 epoch_t up_epoch;
8354 epoch_t boot_epoch;
8355 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8356 if (!up_epoch &&
8357 osdmap->is_up(whoami) &&
11fdf7f2 8358 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
7c673cae
FG
8359 up_epoch = osdmap->get_epoch();
8360 dout(10) << "up_epoch is " << up_epoch << dendl;
8361 if (!boot_epoch) {
8362 boot_epoch = osdmap->get_epoch();
8363 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8364 }
8365 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8366 }
8367 }
8368
7c673cae
FG
8369 epoch_t _bind_epoch = service.get_bind_epoch();
8370 if (osdmap->is_up(whoami) &&
11fdf7f2
TL
8371 osdmap->get_addrs(whoami).legacy_equals(
8372 client_messenger->get_myaddrs()) &&
7c673cae
FG
8373 _bind_epoch < osdmap->get_up_from(whoami)) {
8374
8375 if (is_booting()) {
8376 dout(1) << "state: booting -> active" << dendl;
8377 set_state(STATE_ACTIVE);
11fdf7f2 8378 do_restart = false;
7c673cae
FG
8379
8380 // set incarnation so that osd_reqid_t's we generate for our
8381 // objecter requests are unique across restarts.
8382 service.objecter->set_client_incarnation(osdmap->get_epoch());
11fdf7f2 8383 cancel_pending_failures();
7c673cae
FG
8384 }
8385 }
8386
8387 if (osdmap->get_epoch() > 0 &&
8388 is_active()) {
8389 if (!osdmap->exists(whoami)) {
9f95a23c 8390 derr << "map says i do not exist. shutting down." << dendl;
7c673cae
FG
8391 do_shutdown = true; // don't call shutdown() while we have
8392 // everything paused
9f95a23c
TL
8393 } else if (osdmap->is_stop(whoami)) {
8394 derr << "map says i am stopped by admin. shutting down." << dendl;
8395 do_shutdown = true;
7c673cae 8396 } else if (!osdmap->is_up(whoami) ||
11fdf7f2
TL
8397 !osdmap->get_addrs(whoami).legacy_equals(
8398 client_messenger->get_myaddrs()) ||
8399 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8400 cluster_messenger->get_myaddrs()) ||
8401 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8402 hb_back_server_messenger->get_myaddrs()) ||
8403 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8404 hb_front_server_messenger->get_myaddrs())) {
7c673cae
FG
8405 if (!osdmap->is_up(whoami)) {
8406 if (service.is_preparing_to_stop() || service.is_stopping()) {
8407 service.got_stop_ack();
8408 } else {
c07f9fc5
FG
8409 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8410 "but it is still running";
8411 clog->debug() << "map e" << osdmap->get_epoch()
8412 << " wrongly marked me down at e"
8413 << osdmap->get_down_at(whoami);
7c673cae 8414 }
9f95a23c
TL
8415 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8416 // note that this is best-effort...
8417 monc->send_mon_message(
8418 new MOSDMarkMeDead(
8419 monc->get_fsid(),
8420 whoami,
8421 osdmap->get_epoch()));
8422 }
11fdf7f2
TL
8423 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8424 client_messenger->get_myaddrs())) {
7c673cae 8425 clog->error() << "map e" << osdmap->get_epoch()
11fdf7f2
TL
8426 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8427 << " != my " << client_messenger->get_myaddrs() << ")";
8428 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8429 cluster_messenger->get_myaddrs())) {
7c673cae
FG
8430 clog->error() << "map e" << osdmap->get_epoch()
8431 << " had wrong cluster addr ("
11fdf7f2
TL
8432 << osdmap->get_cluster_addrs(whoami)
8433 << " != my " << cluster_messenger->get_myaddrs() << ")";
8434 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8435 hb_back_server_messenger->get_myaddrs())) {
7c673cae 8436 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8437 << " had wrong heartbeat back addr ("
11fdf7f2
TL
8438 << osdmap->get_hb_back_addrs(whoami)
8439 << " != my " << hb_back_server_messenger->get_myaddrs()
7c673cae 8440 << ")";
11fdf7f2
TL
8441 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8442 hb_front_server_messenger->get_myaddrs())) {
7c673cae 8443 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8444 << " had wrong heartbeat front addr ("
11fdf7f2
TL
8445 << osdmap->get_hb_front_addrs(whoami)
8446 << " != my " << hb_front_server_messenger->get_myaddrs()
7c673cae
FG
8447 << ")";
8448 }
8449
8450 if (!service.is_stopping()) {
8451 epoch_t up_epoch = 0;
8452 epoch_t bind_epoch = osdmap->get_epoch();
8453 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8454 do_restart = true;
8455
 8456 // add markdown log
8457 utime_t now = ceph_clock_now();
8458 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8459 osd_markdown_log.push_back(now);
7c673cae 8460 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
9f95a23c
TL
8461 derr << __func__ << " marked down "
8462 << osd_markdown_log.size()
8463 << " > osd_max_markdown_count "
8464 << cct->_conf->osd_max_markdown_count
8465 << " in last " << grace << " seconds, shutting down"
8466 << dendl;
7c673cae
FG
8467 do_restart = false;
8468 do_shutdown = true;
8469 }
8470
8471 start_waiting_for_healthy();
8472
8473 set<int> avoid_ports;
8474#if defined(__FreeBSD__)
8475 // prevent FreeBSD from grabbing the client_messenger port during
f67539c2 8476 // rebinding, in which case the cluster_messenger would also connect
7c673cae 8477 // to the same port
11fdf7f2 8478 client_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae 8479#endif
11fdf7f2 8480 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
7c673cae
FG
8481
8482 int r = cluster_messenger->rebind(avoid_ports);
8483 if (r != 0) {
8484 do_shutdown = true; // FIXME: do_restart?
8485 network_error = true;
9f95a23c
TL
8486 derr << __func__ << " marked down:"
8487 << " rebind cluster_messenger failed" << dendl;
7c673cae
FG
8488 }
8489
9f95a23c
TL
8490 hb_back_server_messenger->mark_down_all();
8491 hb_front_server_messenger->mark_down_all();
7c673cae
FG
8492 hb_front_client_messenger->mark_down_all();
8493 hb_back_client_messenger->mark_down_all();
8494
494da23a 8495 reset_heartbeat_peers(true);
7c673cae
FG
8496 }
8497 }
20effc67
TL
8498 } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
8499 derr << "map says i am stopped by admin. shutting down." << dendl;
8500 do_shutdown = true;
7c673cae
FG
8501 }
8502
9f95a23c 8503 map_lock.unlock();
7c673cae 8504
11fdf7f2 8505 check_osdmap_features();
7c673cae
FG
8506
8507 // yay!
8508 consume_map();
8509
8510 if (is_active() || is_waiting_for_healthy())
8511 maybe_update_heartbeat_peers();
8512
11fdf7f2 8513 if (is_active()) {
7c673cae
FG
8514 activate_map();
8515 }
8516
31f18b77 8517 if (do_shutdown) {
7c673cae 8518 if (network_error) {
11fdf7f2 8519 cancel_pending_failures();
7c673cae
FG
8520 }
8521 // trigger shutdown in a different thread
8522 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8523 queue_async_signal(SIGINT);
8524 }
31f18b77
FG
8525 else if (m->newest_map && m->newest_map > last) {
8526 dout(10) << " msg say newest map is " << m->newest_map
8527 << ", requesting more" << dendl;
8528 osdmap_subscribe(osdmap->get_epoch()+1, false);
8529 }
7c673cae
FG
8530 else if (is_preboot()) {
8531 if (m->get_source().is_mon())
8532 _preboot(m->oldest_map, m->newest_map);
8533 else
8534 start_boot();
8535 }
8536 else if (do_restart)
8537 start_boot();
8538
8539}
8540
11fdf7f2 8541void OSD::check_osdmap_features()
7c673cae
FG
8542{
8543 // adjust required feature bits?
8544
8545 // we have to be a bit careful here, because we are accessing the
8546 // Policy structures without taking any lock. in particular, only
8547 // modify integer values that can safely be read by a racing CPU.
 8548 // since we are only accessing existing Policy structures at their
8549 // current memory location, and setting or clearing bits in integer
8550 // fields, and we are the only writer, this is not a problem.
8551
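  // for each peer type (client, mon, osd) below, fold the feature bits the
  // osdmap requires into the messenger policy, touching only the bits
  // covered by the returned mask.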
9f95a23c 8552 const auto osdmap = get_osdmap();
7c673cae
FG
8553 {
8554 Messenger::Policy p = client_messenger->get_default_policy();
8555 uint64_t mask;
8556 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8557 if ((p.features_required & mask) != features) {
8558 dout(0) << "crush map has features " << features
8559 << ", adjusting msgr requires for clients" << dendl;
8560 p.features_required = (p.features_required & ~mask) | features;
8561 client_messenger->set_default_policy(p);
8562 }
8563 }
8564 {
8565 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8566 uint64_t mask;
8567 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8568 if ((p.features_required & mask) != features) {
8569 dout(0) << "crush map has features " << features
8570 << " was " << p.features_required
8571 << ", adjusting msgr requires for mons" << dendl;
8572 p.features_required = (p.features_required & ~mask) | features;
8573 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8574 }
8575 }
8576 {
8577 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8578 uint64_t mask;
8579 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8580
8581 if ((p.features_required & mask) != features) {
8582 dout(0) << "crush map has features " << features
8583 << ", adjusting msgr requires for osds" << dendl;
8584 p.features_required = (p.features_required & ~mask) | features;
8585 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8586 }
8587
11fdf7f2 8588 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7c673cae
FG
8589 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8590 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8591 ObjectStore::Transaction t;
8592 write_superblock(t);
11fdf7f2
TL
8593 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8594 ceph_assert(err == 0);
7c673cae
FG
8595 }
8596 }
11fdf7f2 8597
9f95a23c
TL
8598 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8599 hb_front_server_messenger->set_require_authorizer(false);
8600 hb_back_server_messenger->set_require_authorizer(false);
8601 } else {
8602 hb_front_server_messenger->set_require_authorizer(true);
8603 hb_back_server_messenger->set_require_authorizer(true);
11fdf7f2
TL
8604 }
8605
8606 if (osdmap->require_osd_release != last_require_osd_release) {
8607 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8608 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8609 store->write_meta("require_osd_release",
8610 stringify((int)osdmap->require_osd_release));
8611 last_require_osd_release = osdmap->require_osd_release;
8612 }
7c673cae
FG
8613}
8614
11fdf7f2
TL
8615struct C_FinishSplits : public Context {
8616 OSD *osd;
8617 set<PGRef> pgs;
8618 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8619 : osd(osd), pgs(in) {}
8620 void finish(int r) override {
8621 osd->_finish_splits(pgs);
8622 }
8623};
8624
8625void OSD::_finish_splits(set<PGRef>& pgs)
7c673cae 8626{
11fdf7f2
TL
8627 dout(10) << __func__ << " " << pgs << dendl;
8628 if (is_stopping())
8629 return;
11fdf7f2
TL
8630 for (set<PGRef>::iterator i = pgs.begin();
8631 i != pgs.end();
8632 ++i) {
8633 PG *pg = i->get();
7c673cae 8634
20effc67 8635 PeeringCtx rctx;
11fdf7f2
TL
8636 pg->lock();
8637 dout(10) << __func__ << " " << *pg << dendl;
8638 epoch_t e = pg->get_osdmap_epoch();
9f95a23c 8639 pg->handle_initialize(rctx);
11fdf7f2 8640 pg->queue_null(e, e);
9f95a23c 8641 dispatch_context(rctx, pg, service.get_osdmap());
11fdf7f2 8642 pg->unlock();
7c673cae 8643
11fdf7f2
TL
8644 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8645 shards[shard_index]->register_and_wake_split_child(pg);
7c673cae 8646 }
11fdf7f2
TL
8647};
8648
8649bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8650 unsigned need)
8651{
8652 std::lock_guard l(merge_lock);
8653 auto& p = merge_waiters[nextmap->get_epoch()][target];
8654 p[src->pg_id] = src;
8655 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8656 << " for " << target << ", have " << p.size() << "/" << need
8657 << dendl;
8658 return p.size() == need;
8659}
8660
8661bool OSD::advance_pg(
8662 epoch_t osd_epoch,
8663 PG *pg,
8664 ThreadPool::TPHandle &handle,
9f95a23c 8665 PeeringCtx &rctx)
11fdf7f2
TL
8666{
8667 if (osd_epoch <= pg->get_osdmap_epoch()) {
8668 return true;
8669 }
8670 ceph_assert(pg->is_locked());
8671 OSDMapRef lastmap = pg->get_osdmap();
11fdf7f2
TL
8672 set<PGRef> new_pgs; // any split children
8673 bool ret = true;
8674
8675 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8676 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
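  // advance the PG one epoch at a time up to osd_epoch; merges that are not
  // yet ready return false so the work is retried once all participants
  // have arrived, and splits are collected in new_pgs.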
8677 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8678 next_epoch <= osd_epoch;
7c673cae
FG
8679 ++next_epoch) {
8680 OSDMapRef nextmap = service.try_get_map(next_epoch);
8681 if (!nextmap) {
8682 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7c673cae
FG
8683 continue;
8684 }
8685
11fdf7f2
TL
8686 unsigned new_pg_num =
8687 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8688 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8689 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8690 // check for merge
8691 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8692 spg_t parent;
8693 if (pg->pg_id.is_merge_source(
8694 old_pg_num,
8695 new_pg_num,
8696 &parent)) {
8697 // we are merge source
8698 PGRef spg = pg; // carry a ref
8699 dout(1) << __func__ << " " << pg->pg_id
8700 << " is merge source, target is " << parent
8701 << dendl;
8702 pg->write_if_dirty(rctx);
9f95a23c
TL
8703 if (!new_pgs.empty()) {
8704 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8705 new_pgs));
8706 new_pgs.clear();
8707 }
8708 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2 8709 pg->ch->flush();
eafe8130
TL
8710 // release backoffs explicitly, since the on_shutdown path
8711 // aggressively tears down backoff state.
8712 if (pg->is_primary()) {
8713 pg->release_pg_backoffs();
8714 }
11fdf7f2
TL
8715 pg->on_shutdown();
8716 OSDShard *sdata = pg->osd_shard;
8717 {
8718 std::lock_guard l(sdata->shard_lock);
8719 if (pg->pg_slot) {
8720 sdata->_detach_pg(pg->pg_slot);
8721 // update pg count now since we might not get an osdmap
8722 // any time soon.
8723 if (pg->is_primary())
8724 logger->dec(l_osd_pg_primary);
9f95a23c
TL
8725 else if (pg->is_nonprimary())
8726 logger->dec(l_osd_pg_replica); // misnomer
11fdf7f2
TL
8727 else
8728 logger->dec(l_osd_pg_stray);
8729 }
8730 }
8731 pg->unlock();
8732
8733 set<spg_t> children;
8734 parent.is_split(new_pg_num, old_pg_num, &children);
8735 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8736 enqueue_peering_evt(
8737 parent,
8738 PGPeeringEventRef(
8739 std::make_shared<PGPeeringEvent>(
8740 nextmap->get_epoch(),
8741 nextmap->get_epoch(),
8742 NullEvt())));
8743 }
8744 ret = false;
8745 goto out;
8746 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8747 // we are merge target
8748 set<spg_t> children;
8749 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8750 dout(20) << __func__ << " " << pg->pg_id
8751 << " is merge target, sources are " << children
8752 << dendl;
8753 map<spg_t,PGRef> sources;
8754 {
8755 std::lock_guard l(merge_lock);
8756 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8757 unsigned need = children.size();
8758 dout(20) << __func__ << " have " << s.size() << "/"
8759 << need << dendl;
8760 if (s.size() == need) {
8761 sources.swap(s);
8762 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8763 if (merge_waiters[nextmap->get_epoch()].empty()) {
8764 merge_waiters.erase(nextmap->get_epoch());
8765 }
8766 }
8767 }
8768 if (!sources.empty()) {
8769 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8770 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8771 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8772 pg->merge_from(
8773 sources, rctx, split_bits,
8774 nextmap->get_pg_pool(
8775 pg->pg_id.pool())->last_pg_merge_meta);
8776 pg->pg_slot->waiting_for_merge_epoch = 0;
8777 } else {
8778 dout(20) << __func__ << " not ready to merge yet" << dendl;
8779 pg->write_if_dirty(rctx);
9f95a23c
TL
8780 if (!new_pgs.empty()) {
8781 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8782 new_pgs));
8783 new_pgs.clear();
8784 }
8785 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
11fdf7f2
TL
8786 pg->unlock();
8787 // kick source(s) to get them ready
8788 for (auto& i : children) {
8789 dout(20) << __func__ << " kicking source " << i << dendl;
8790 enqueue_peering_evt(
8791 i,
8792 PGPeeringEventRef(
8793 std::make_shared<PGPeeringEvent>(
8794 nextmap->get_epoch(),
8795 nextmap->get_epoch(),
8796 NullEvt())));
8797 }
8798 ret = false;
8799 goto out;
8800 }
8801 }
8802 }
8803 }
8804
7c673cae
FG
8805 vector<int> newup, newacting;
8806 int up_primary, acting_primary;
8807 nextmap->pg_to_up_acting_osds(
11fdf7f2 8808 pg->pg_id.pgid,
7c673cae
FG
8809 &newup, &up_primary,
8810 &newacting, &acting_primary);
8811 pg->handle_advance_map(
8812 nextmap, lastmap, newup, up_primary,
8813 newacting, acting_primary, rctx);
8814
494da23a
TL
8815 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8816 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8817 if (oldpool != lastmap->get_pools().end()
8818 && newpool != nextmap->get_pools().end()) {
8819 dout(20) << __func__
8820 << " new pool opts " << newpool->second.opts
8821 << " old pool opts " << oldpool->second.opts
8822 << dendl;
8823
8824 double old_min_interval = 0, new_min_interval = 0;
8825 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8826 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8827
8828 double old_max_interval = 0, new_max_interval = 0;
8829 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8830 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8831
 8832 // Assume that if an interval changes from set to unset or vice versa, the actual config
8833 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8834 // unnecessarily.
8835 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8836 pg->on_info_history_change();
8837 }
8838 }
8839
11fdf7f2
TL
8840 if (new_pg_num && old_pg_num != new_pg_num) {
8841 // check for split
8842 set<spg_t> children;
8843 if (pg->pg_id.is_split(
8844 old_pg_num,
8845 new_pg_num,
8846 &children)) {
8847 split_pgs(
8848 pg, children, &new_pgs, lastmap, nextmap,
8849 rctx);
8850 }
7c673cae
FG
8851 }
8852
8853 lastmap = nextmap;
11fdf7f2 8854 old_pg_num = new_pg_num;
7c673cae
FG
8855 handle.reset_tp_timeout();
8856 }
7c673cae 8857 pg->handle_activate_map(rctx);
11fdf7f2
TL
8858
8859 ret = true;
8860 out:
8861 if (!new_pgs.empty()) {
9f95a23c 8862 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
7c673cae 8863 }
11fdf7f2 8864 return ret;
7c673cae
FG
8865}
8866
8867void OSD::consume_map()
8868{
9f95a23c
TL
8869 ceph_assert(ceph_mutex_is_locked(osd_lock));
8870 auto osdmap = get_osdmap();
7c673cae
FG
8871 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8872
3efd9988
FG
8873 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8874 * speak the older sorting version any more. Be careful not to force
8875 * a shutdown if we are merely processing old maps, though.
8876 */
8877 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8878 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8879 ceph_abort();
8880 }
8881
11fdf7f2
TL
8882 service.pre_publish_map(osdmap);
8883 service.await_reserved_maps();
8884 service.publish_map(osdmap);
7c673cae 8885
11fdf7f2
TL
8886 // prime splits and merges
8887 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8888 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8889 for (auto& shard : shards) {
8890 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8891 }
8892 if (!newly_split.empty()) {
8893 for (auto& shard : shards) {
8894 shard->prime_splits(osdmap, &newly_split);
8895 }
8896 ceph_assert(newly_split.empty());
8897 }
7c673cae 8898
11fdf7f2
TL
8899 // prune sent_ready_to_merge
8900 service.prune_sent_ready_to_merge(osdmap);
7c673cae 8901
11fdf7f2
TL
8902 // FIXME, maybe: We could race against an incoming peering message
8903 // that instantiates a merge PG after identify_merges() below and
 8904 // never sets up its peer to complete the merge. An OSD restart
8905 // would clear it up. This is a hard race to resolve,
8906 // extraordinarily rare (we only merge PGs that are stable and
8907 // clean, so it'd have to be an imported PG to an OSD with a
8908 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
 8909 // replace all of this with seastar-based code soon anyway.
8910 if (!merge_pgs.empty()) {
8911 // mark the pgs we already have, or create new and empty merge
8912 // participants for those we are missing. do this all under the
8913 // shard lock so we don't have to worry about racing pg creates
8914 // via _process.
8915 for (auto& shard : shards) {
8916 shard->prime_merges(osdmap, &merge_pgs);
7c673cae 8917 }
11fdf7f2
TL
8918 ceph_assert(merge_pgs.empty());
8919 }
8920
8921 service.prune_pg_created();
8922
8923 unsigned pushes_to_free = 0;
8924 for (auto& shard : shards) {
8925 shard->consume_map(osdmap, &pushes_to_free);
8926 }
8927
8928 vector<spg_t> pgids;
8929 _get_pgids(&pgids);
8930
8931 // count (FIXME, probably during seastar rewrite)
8932 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8933 vector<PGRef> pgs;
8934 _get_pgs(&pgs);
8935 for (auto& pg : pgs) {
8936 // FIXME (probably during seastar rewrite): this is lockless and
8937 // racy, but we don't want to take pg lock here.
8938 if (pg->is_primary())
8939 num_pg_primary++;
9f95a23c
TL
8940 else if (pg->is_nonprimary())
8941 num_pg_replica++; // misnomer
11fdf7f2
TL
8942 else
8943 num_pg_stray++;
8944 }
3efd9988 8945
11fdf7f2
TL
8946 {
8947 // FIXME (as part of seastar rewrite): move to OSDShard
8948 std::lock_guard l(pending_creates_lock);
8949 for (auto pg = pending_creates_from_osd.begin();
8950 pg != pending_creates_from_osd.end();) {
9f95a23c 8951 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
11fdf7f2
TL
8952 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8953 << "discarding pending_create_from_osd" << dendl;
3efd9988
FG
8954 pg = pending_creates_from_osd.erase(pg);
8955 } else {
8956 ++pg;
8957 }
8958 }
7c673cae
FG
8959 }
8960
7c673cae
FG
8961 service.maybe_inject_dispatch_delay();
8962
8963 dispatch_sessions_waiting_on_map();
8964
8965 service.maybe_inject_dispatch_delay();
8966
11fdf7f2 8967 service.release_reserved_pushes(pushes_to_free);
7c673cae 8968
11fdf7f2
TL
8969 // queue null events to push maps down to individual PGs
8970 for (auto pgid : pgids) {
8971 enqueue_peering_evt(
8972 pgid,
8973 PGPeeringEventRef(
8974 std::make_shared<PGPeeringEvent>(
8975 osdmap->get_epoch(),
8976 osdmap->get_epoch(),
8977 NullEvt())));
7c673cae 8978 }
11fdf7f2 8979 logger->set(l_osd_pg, pgids.size());
7c673cae
FG
8980 logger->set(l_osd_pg_primary, num_pg_primary);
8981 logger->set(l_osd_pg_replica, num_pg_replica);
8982 logger->set(l_osd_pg_stray, num_pg_stray);
8983}
8984
8985void OSD::activate_map()
8986{
9f95a23c
TL
8987 ceph_assert(ceph_mutex_is_locked(osd_lock));
8988 auto osdmap = get_osdmap();
7c673cae
FG
8989
8990 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8991
7c673cae
FG
8992 // norecover?
8993 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8994 if (!service.recovery_is_paused()) {
8995 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8996 service.pause_recovery();
8997 }
8998 } else {
8999 if (service.recovery_is_paused()) {
9000 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
9001 service.unpause_recovery();
9002 }
9003 }
9004
9005 service.activate_map();
9006
9007 // process waiters
9008 take_waiters(waiting_for_osdmap);
9009}
9010
9011bool OSD::require_mon_peer(const Message *m)
9012{
9013 if (!m->get_connection()->peer_is_mon()) {
9014 dout(0) << "require_mon_peer received from non-mon "
9015 << m->get_connection()->get_peer_addr()
9016 << " " << *m << dendl;
9017 return false;
9018 }
9019 return true;
9020}
9021
9022bool OSD::require_mon_or_mgr_peer(const Message *m)
9023{
9024 if (!m->get_connection()->peer_is_mon() &&
9025 !m->get_connection()->peer_is_mgr()) {
9026 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
9027 << m->get_connection()->get_peer_addr()
9028 << " " << *m << dendl;
9029 return false;
9030 }
9031 return true;
9032}
9033
9034bool OSD::require_osd_peer(const Message *m)
9035{
9036 if (!m->get_connection()->peer_is_osd()) {
9037 dout(0) << "require_osd_peer received from non-osd "
9038 << m->get_connection()->get_peer_addr()
9039 << " " << *m << dendl;
9040 return false;
9041 }
9042 return true;
9043}
9044
9045bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9046{
9047 epoch_t up_epoch = service.get_up_epoch();
9048 if (epoch < up_epoch) {
9049 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9050 return false;
9051 }
9052
9053 if (!is_active()) {
9054 dout(7) << "still in boot state, dropping message " << *m << dendl;
9055 return false;
9056 }
9057
9058 return true;
9059}
9060
9f95a23c 9061bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
7c673cae
FG
9062 bool is_fast_dispatch)
9063{
9064 int from = m->get_source().num();
9065
9066 if (map->is_down(from) ||
11fdf7f2 9067 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
7c673cae
FG
9068 dout(5) << "from dead osd." << from << ", marking down, "
9069 << " msg was " << m->get_source_inst().addr
11fdf7f2
TL
9070 << " expected "
9071 << (map->is_up(from) ?
9072 map->get_cluster_addrs(from) : entity_addrvec_t())
7c673cae
FG
9073 << dendl;
9074 ConnectionRef con = m->get_connection();
9075 con->mark_down();
9f95a23c 9076 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
7c673cae 9077 if (!is_fast_dispatch)
9f95a23c 9078 s->session_dispatch_lock.lock();
7c673cae 9079 clear_session_waiting_on_map(s);
11fdf7f2
TL
9080 con->set_priv(nullptr); // break ref <-> session cycle, if any
9081 s->con.reset();
7c673cae 9082 if (!is_fast_dispatch)
9f95a23c 9083 s->session_dispatch_lock.unlock();
7c673cae
FG
9084 }
9085 return false;
9086 }
9087 return true;
9088}
9089
9090
9091/*
9092 * require that we have same (or newer) map, and that
9093 * the source is the pg primary.
9094 */
9095bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9096 bool is_fast_dispatch)
9097{
9098 const Message *m = op->get_req();
9f95a23c 9099 const auto osdmap = get_osdmap();
7c673cae
FG
9100 dout(15) << "require_same_or_newer_map " << epoch
9101 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9102
9f95a23c 9103 ceph_assert(ceph_mutex_is_locked(osd_lock));
7c673cae
FG
9104
9105 // do they have a newer map?
9106 if (epoch > osdmap->get_epoch()) {
9107 dout(7) << "waiting for newer map epoch " << epoch
9108 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9109 wait_for_new_map(op);
9110 return false;
9111 }
9112
9113 if (!require_self_aliveness(op->get_req(), epoch)) {
9114 return false;
9115 }
9116
9117 // ok, our map is same or newer.. do they still exist?
9118 if (m->get_connection()->get_messenger() == cluster_messenger &&
9119 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9120 return false;
9121 }
9122
9123 return true;
9124}
9125
9126
9127
9128
9129
9130// ----------------------------------------
9131// pg creation
9132
9133void OSD::split_pgs(
9134 PG *parent,
31f18b77 9135 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
9136 OSDMapRef curmap,
9137 OSDMapRef nextmap,
9f95a23c 9138 PeeringCtx &rctx)
7c673cae 9139{
11fdf7f2
TL
9140 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9141 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
7c673cae 9142
11fdf7f2
TL
9143 vector<object_stat_sum_t> updated_stats;
9144 parent->start_split_stats(childpgids, &updated_stats);
7c673cae
FG
9145
9146 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9147 for (set<spg_t>::const_iterator i = childpgids.begin();
9148 i != childpgids.end();
9149 ++i, ++stat_iter) {
11fdf7f2
TL
9150 ceph_assert(stat_iter != updated_stats.end());
9151 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
7c673cae
FG
9152 PG* child = _make_pg(nextmap, *i);
9153 child->lock(true);
9154 out_pgs->insert(child);
11fdf7f2 9155 child->ch = store->create_new_collection(child->coll);
7c673cae 9156
11fdf7f2
TL
9157 {
9158 uint32_t shard_index = i->hash_to_shard(shards.size());
9159 assert(NULL != shards[shard_index]);
9160 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9161 }
7c673cae 9162
11fdf7f2
TL
9163 unsigned split_bits = i->get_split_bits(pg_num);
9164 dout(10) << " pg_num is " << pg_num
9165 << ", m_seed " << i->ps()
9166 << ", split_bits is " << split_bits << dendl;
7c673cae
FG
9167 parent->split_colls(
9168 *i,
9169 split_bits,
9170 i->ps(),
11fdf7f2 9171 &child->get_pool().info,
9f95a23c 9172 rctx.transaction);
7c673cae
FG
9173 parent->split_into(
9174 i->pgid,
9175 child,
9176 split_bits);
7c673cae 9177
92f5a8d4
TL
9178 child->init_collection_pool_opts();
9179
9f95a23c 9180 child->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
9181 child->unlock();
9182 }
11fdf7f2 9183 ceph_assert(stat_iter != updated_stats.end());
9f95a23c 9184 parent->finish_split_stats(*stat_iter, rctx.transaction);
7c673cae
FG
9185}
9186
9187/*
9188 * holding osd_lock
9189 */
9190void OSD::handle_pg_create(OpRequestRef op)
9191{
9f95a23c
TL
9192 // NOTE: this can be removed in P release (mimic is the last version to
9193 // send MOSDPGCreate messages).
9194
9195 auto m = op->get_req<MOSDPGCreate>();
11fdf7f2 9196 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
7c673cae
FG
9197
9198 dout(10) << "handle_pg_create " << *m << dendl;
9199
9200 if (!require_mon_peer(op->get_req())) {
9201 return;
9202 }
9203
9204 if (!require_same_or_newer_map(op, m->epoch, false))
9205 return;
9206
9207 op->mark_started();
9208
9f95a23c 9209 const auto osdmap = get_osdmap();
7c673cae
FG
9210 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9211 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9212 p != m->mkpg.end();
9213 ++p, ++ci) {
11fdf7f2 9214 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
7c673cae
FG
9215 epoch_t created = p->second.created;
9216 if (p->second.split_bits) // Skip split pgs
9217 continue;
9218 pg_t on = p->first;
9219
7c673cae
FG
9220 if (!osdmap->have_pg_pool(on.pool())) {
9221 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9222 continue;
9223 }
9224
9225 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9226
9f95a23c
TL
9227 spg_t pgid;
9228 bool mapped = osdmap->get_primary_shard(on, &pgid);
9229 ceph_assert(mapped);
9230
7c673cae
FG
9231 // is it still ours?
9232 vector<int> up, acting;
9233 int up_primary = -1;
9234 int acting_primary = -1;
9235 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9f95a23c 9236 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
7c673cae
FG
9237
9238 if (acting_primary != whoami) {
9239 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9240 << "), my role=" << role << ", skipping" << dendl;
9241 continue;
9242 }
9243
7c673cae 9244
11fdf7f2 9245 PastIntervals pi;
7c673cae
FG
9246 pg_history_t history;
9247 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9248
11fdf7f2
TL
9249 // The mon won't resend unless the primary changed, so we ignore
9250 // same_interval_since. We'll pass this history with the current
9251 // epoch as the event.
7c673cae
FG
9252 if (history.same_primary_since > m->epoch) {
9253 dout(10) << __func__ << ": got obsolete pg create on pgid "
9254 << pgid << " from epoch " << m->epoch
9255 << ", primary changed in " << history.same_primary_since
9256 << dendl;
9257 continue;
9258 }
11fdf7f2
TL
9259 enqueue_peering_evt(
9260 pgid,
9261 PGPeeringEventRef(
9262 std::make_shared<PGPeeringEvent>(
9263 osdmap->get_epoch(),
9264 osdmap->get_epoch(),
9265 NullEvt(),
9266 true,
9267 new PGCreateInfo(
9268 pgid,
9269 osdmap->get_epoch(),
9270 history,
9271 pi,
9272 true)
9273 )));
7c673cae 9274 }
7c673cae 9275
3efd9988 9276 {
11fdf7f2 9277 std::lock_guard l(pending_creates_lock);
3efd9988
FG
9278 if (pending_creates_from_mon == 0) {
9279 last_pg_create_epoch = m->epoch;
9280 }
9281 }
11fdf7f2 9282
7c673cae
FG
9283 maybe_update_heartbeat_peers();
9284}
9285
9286
9287// ----------------------------------------
9288// peering and recovery
9289
9f95a23c 9290void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
7c673cae
FG
9291 ThreadPool::TPHandle *handle)
9292{
11fdf7f2
TL
9293 if (!service.get_osdmap()->is_up(whoami)) {
9294 dout(20) << __func__ << " not up in osdmap" << dendl;
9295 } else if (!is_active()) {
9296 dout(20) << __func__ << " not active" << dendl;
9297 } else {
9f95a23c
TL
9298 for (auto& [osd, ls] : ctx.message_map) {
9299 if (!curmap->is_up(osd)) {
9300 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9301 continue;
9302 }
9303 ConnectionRef con = service.get_con_osd_cluster(
9304 osd, curmap->get_epoch());
9305 if (!con) {
9306 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9307 << dendl;
9308 continue;
9309 }
9310 service.maybe_share_map(con.get(), curmap);
9311 for (auto m : ls) {
9312 con->send_message2(m);
9313 }
9314 ls.clear();
9315 }
7c673cae 9316 }
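  // any mutations the peering machinery accumulated (plus registered
  // on_applied/on_commit contexts) are queued against this PG's collection.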
9f95a23c 9317 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
7c673cae 9318 int tr = store->queue_transaction(
11fdf7f2 9319 pg->ch,
9f95a23c 9320 std::move(ctx.transaction), TrackedOpRef(),
7c673cae 9321 handle);
11fdf7f2 9322 ceph_assert(tr == 0);
7c673cae 9323 }
7c673cae
FG
9324}
9325
11fdf7f2 9326void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
7c673cae 9327{
11fdf7f2
TL
9328 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9329 if (!require_mon_peer(m)) {
9330 m->put();
7c673cae 9331 return;
7c673cae 9332 }
11fdf7f2
TL
9333 for (auto& p : m->pgs) {
9334 spg_t pgid = p.first;
9335 epoch_t created = p.second.first;
9336 utime_t created_stamp = p.second.second;
9f95a23c
TL
9337 auto q = m->pg_extra.find(pgid);
9338 if (q == m->pg_extra.end()) {
9339 dout(20) << __func__ << " " << pgid << " e" << created
9340 << "@" << created_stamp
9341 << " (no history or past_intervals)" << dendl;
9342 // pre-octopus ... no pg history. this can be removed in Q release.
9343 enqueue_peering_evt(
9344 pgid,
9345 PGPeeringEventRef(
9346 std::make_shared<PGPeeringEvent>(
9347 m->epoch,
9348 m->epoch,
9349 NullEvt(),
9350 true,
9351 new PGCreateInfo(
9352 pgid,
9353 created,
9354 pg_history_t(created, created_stamp),
9355 PastIntervals(),
9356 true)
9357 )));
9358 } else {
9359 dout(20) << __func__ << " " << pgid << " e" << created
9360 << "@" << created_stamp
9361 << " history " << q->second.first
9362 << " pi " << q->second.second << dendl;
9363 if (!q->second.second.empty() &&
9364 m->epoch < q->second.second.get_bounds().second) {
9365 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9366 << " and unmatched past_intervals " << q->second.second
9367 << " (history " << q->second.first << ")";
9368 } else {
9369 enqueue_peering_evt(
9370 pgid,
9371 PGPeeringEventRef(
9372 std::make_shared<PGPeeringEvent>(
9373 m->epoch,
9374 m->epoch,
9375 NullEvt(),
9376 true,
9377 new PGCreateInfo(
9378 pgid,
9379 m->epoch,
9380 q->second.first,
9381 q->second.second,
9382 true)
9383 )));
9384 }
9385 }
11fdf7f2 9386 }
7c673cae 9387
11fdf7f2
TL
9388 {
9389 std::lock_guard l(pending_creates_lock);
9390 if (pending_creates_from_mon == 0) {
9391 last_pg_create_epoch = m->epoch;
9392 }
7c673cae
FG
9393 }
9394
11fdf7f2 9395 m->put();
7c673cae
FG
9396}
9397
11fdf7f2 9398void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
7c673cae 9399{
11fdf7f2
TL
9400 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9401 if (!require_osd_peer(m)) {
9402 m->put();
7c673cae
FG
9403 return;
9404 }
11fdf7f2
TL
9405 int from = m->get_source().num();
9406 for (auto& p : m->get_pg_list()) {
9f95a23c 9407 spg_t pgid(p.info.pgid.pgid, p.to);
11fdf7f2
TL
9408 enqueue_peering_evt(
9409 pgid,
9410 PGPeeringEventRef(
9411 std::make_shared<PGPeeringEvent>(
9f95a23c
TL
9412 p.epoch_sent,
9413 p.query_epoch,
11fdf7f2 9414 MNotifyRec(
9f95a23c
TL
9415 pgid, pg_shard_t(from, p.from),
9416 p,
9417 m->get_connection()->get_features()),
11fdf7f2
TL
9418 true,
9419 new PGCreateInfo(
9420 pgid,
9f95a23c
TL
9421 p.query_epoch,
9422 p.info.history,
9423 p.past_intervals,
11fdf7f2
TL
9424 false)
9425 )));
7c673cae 9426 }
11fdf7f2 9427 m->put();
7c673cae
FG
9428}
9429
11fdf7f2 9430void OSD::handle_fast_pg_info(MOSDPGInfo* m)
7c673cae 9431{
11fdf7f2
TL
9432 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9433 if (!require_osd_peer(m)) {
9434 m->put();
7c673cae
FG
9435 return;
9436 }
11fdf7f2
TL
9437 int from = m->get_source().num();
9438 for (auto& p : m->pg_list) {
9439 enqueue_peering_evt(
9f95a23c 9440 spg_t(p.info.pgid.pgid, p.to),
11fdf7f2 9441 PGPeeringEventRef(
20effc67
TL
9442 std::make_shared<PGPeeringEvent>(
9443 p.epoch_sent, p.query_epoch,
9444 MInfoRec(
9445 pg_shard_t(from, p.from),
9446 p.info,
9447 p.epoch_sent)))
11fdf7f2 9448 );
7c673cae 9449 }
11fdf7f2 9450 m->put();
7c673cae
FG
9451}
9452
11fdf7f2 9453void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
7c673cae 9454{
11fdf7f2
TL
9455 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9456 if (!require_osd_peer(m)) {
9457 m->put();
7c673cae
FG
9458 return;
9459 }
11fdf7f2
TL
9460 for (auto& pgid : m->pg_list) {
9461 enqueue_peering_evt(
9462 pgid,
9463 PGPeeringEventRef(
9464 std::make_shared<PGPeeringEvent>(
9465 m->get_epoch(), m->get_epoch(),
9f95a23c 9466 PeeringState::DeleteStart())));
7c673cae 9467 }
11fdf7f2 9468 m->put();
7c673cae
FG
9469}
9470
11fdf7f2 9471void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
c07f9fc5 9472{
11fdf7f2
TL
9473 dout(10) << __func__ << " " << *m << dendl;
9474 if (!require_mon_or_mgr_peer(m)) {
9475 m->put();
9476 return;
9477 }
9478 epoch_t epoch = get_osdmap_epoch();
9479 for (auto pgid : m->forced_pgs) {
9480 if (m->options & OFR_BACKFILL) {
9481 if (m->options & OFR_CANCEL) {
9482 enqueue_peering_evt(
9483 pgid,
9484 PGPeeringEventRef(
9485 std::make_shared<PGPeeringEvent>(
9486 epoch, epoch,
9f95a23c 9487 PeeringState::UnsetForceBackfill())));
11fdf7f2
TL
9488 } else {
9489 enqueue_peering_evt(
9490 pgid,
9491 PGPeeringEventRef(
9492 std::make_shared<PGPeeringEvent>(
9493 epoch, epoch,
9f95a23c 9494 PeeringState::SetForceBackfill())));
11fdf7f2
TL
9495 }
9496 } else if (m->options & OFR_RECOVERY) {
9497 if (m->options & OFR_CANCEL) {
9498 enqueue_peering_evt(
9499 pgid,
9500 PGPeeringEventRef(
9501 std::make_shared<PGPeeringEvent>(
9502 epoch, epoch,
9f95a23c 9503 PeeringState::UnsetForceRecovery())));
11fdf7f2
TL
9504 } else {
9505 enqueue_peering_evt(
9506 pgid,
9507 PGPeeringEventRef(
9508 std::make_shared<PGPeeringEvent>(
9509 epoch, epoch,
9f95a23c 9510 PeeringState::SetForceRecovery())));
c07f9fc5
FG
9511 }
9512 }
9513 }
11fdf7f2 9514 m->put();
c07f9fc5 9515}
7c673cae 9516
11fdf7f2 9517void OSD::handle_pg_query_nopg(const MQuery& q)
7c673cae 9518{
11fdf7f2
TL
9519 spg_t pgid = q.pgid;
9520 dout(10) << __func__ << " " << pgid << dendl;
7c673cae 9521
11fdf7f2
TL
9522 OSDMapRef osdmap = get_osdmap();
9523 if (!osdmap->have_pg_pool(pgid.pool()))
7c673cae
FG
9524 return;
9525
11fdf7f2
TL
9526 dout(10) << " pg " << pgid << " dne" << dendl;
9527 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9528 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9529 if (con) {
9530 Message *m;
9531 if (q.query.type == pg_query_t::LOG ||
9532 q.query.type == pg_query_t::FULLLOG) {
9533 m = new MOSDPGLog(
9534 q.query.from, q.query.to,
9535 osdmap->get_epoch(), empty,
9536 q.query.epoch_sent);
7c673cae 9537 } else {
20effc67
TL
9538 pg_notify_t notify{q.query.from, q.query.to,
9539 q.query.epoch_sent,
9540 osdmap->get_epoch(),
9541 empty,
9542 PastIntervals()};
9543 m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
9544 std::move(notify));
7c673cae 9545 }
9f95a23c 9546 service.maybe_share_map(con.get(), osdmap);
11fdf7f2 9547 con->send_message(m);
7c673cae
FG
9548 }
9549}
9550
9f95a23c
TL
9551void OSDService::queue_check_readable(spg_t spgid,
9552 epoch_t lpr,
9553 ceph::signedspan delay)
9554{
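  // a zero delay queues the CheckReadable peering event immediately;
  // otherwise arm a one-shot timer that re-enters this function with the
  // default (zero) delay once it fires.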
9555 if (delay == ceph::signedspan::zero()) {
9556 osd->enqueue_peering_evt(
9557 spgid,
9558 PGPeeringEventRef(
9559 std::make_shared<PGPeeringEvent>(
9560 lpr, lpr,
9561 PeeringState::CheckReadable())));
9562 } else {
9563 mono_timer.add_event(
9564 delay,
9565 [this, spgid, lpr]() {
9566 queue_check_readable(spgid, lpr);
9567 });
9568 }
9569}
9570
7c673cae 9571
7c673cae
FG
9572// =========================================================
9573// RECOVERY
9574
9575void OSDService::_maybe_queue_recovery() {
9f95a23c 9576 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
7c673cae
FG
9577 uint64_t available_pushes;
9578 while (!awaiting_throttle.empty() &&
9579 _recover_now(&available_pushes)) {
11fdf7f2 9580 uint64_t to_start = std::min(
7c673cae
FG
9581 available_pushes,
9582 cct->_conf->osd_recovery_max_single_start);
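  // start at most osd_recovery_max_single_start ops from this queued PG,
  // bounded by the pushes still available under the recovery_max_active
  // budget computed in _recover_now().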
9583 _queue_for_recovery(awaiting_throttle.front(), to_start);
9584 awaiting_throttle.pop_front();
11fdf7f2
TL
9585 dout(10) << __func__ << " starting " << to_start
9586 << ", recovery_ops_reserved " << recovery_ops_reserved
9587 << " -> " << (recovery_ops_reserved + to_start) << dendl;
7c673cae
FG
9588 recovery_ops_reserved += to_start;
9589 }
9590}
9591
9592bool OSDService::_recover_now(uint64_t *available_pushes)
9593{
9594 if (available_pushes)
9595 *available_pushes = 0;
9596
9597 if (ceph_clock_now() < defer_recovery_until) {
9598 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9599 return false;
9600 }
9601
9602 if (recovery_paused) {
9603 dout(15) << __func__ << " paused" << dendl;
9604 return false;
9605 }
9606
9f95a23c 9607 uint64_t max = osd->get_recovery_max_active();
7c673cae
FG
9608 if (max <= recovery_ops_active + recovery_ops_reserved) {
9609 dout(15) << __func__ << " active " << recovery_ops_active
9610 << " + reserved " << recovery_ops_reserved
9611 << " >= max " << max << dendl;
9612 return false;
9613 }
9614
9615 if (available_pushes)
9616 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9617
9618 return true;
9619}
9620
9f95a23c
TL
9621unsigned OSDService::get_target_pg_log_entries() const
9622{
9623 auto num_pgs = osd->get_num_pgs();
9624 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9625 if (num_pgs > 0 && target > 0) {
9626 // target an even spread of our budgeted log entries across all
9627 // PGs. note that while we only get to control the entry count
9628 // for primary PGs, we'll normally be responsible for a mix of
9629 // primary and replica PGs (for the same pool(s) even), so this
9630 // will work out.
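  // illustrative example (hypothetical numbers): a target of 300000 entries
  // per OSD spread across 100 PGs budgets 3000 entries per PG, then clamped
  // to [osd_min_pg_log_entries, osd_max_pg_log_entries].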
9631 return std::max<unsigned>(
9632 std::min<unsigned>(target / num_pgs,
9633 cct->_conf->osd_max_pg_log_entries),
9634 cct->_conf->osd_min_pg_log_entries);
9635 } else {
9636 // fall back to a per-pg value.
9637 return cct->_conf->osd_min_pg_log_entries;
9638 }
9639}
9640
7c673cae
FG
9641void OSD::do_recovery(
9642 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9643 ThreadPool::TPHandle &handle)
9644{
9645 uint64_t started = 0;
31f18b77
FG
9646
9647 /*
9648 * When the value of osd_recovery_sleep is set greater than zero, recovery
9649 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9650 * recovery event's schedule time. This is done by adding a
9651 * recovery_requeue_callback event, which re-queues the recovery op using
9652 * queue_recovery_after_sleep.
9653 */
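  // illustrative example: with osd_recovery_sleep = 0.1 (a hypothetical
  // setting), each requeued recovery op is scheduled roughly 100ms after the
  // previous recovery event's schedule time instead of running immediately.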
c07f9fc5 9654 float recovery_sleep = get_osd_recovery_sleep();
b32b8144 9655 {
11fdf7f2 9656 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9657 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9658 PGRef pgref(pg);
9f95a23c 9659 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
b32b8144
FG
9660 dout(20) << "do_recovery wake up at "
9661 << ceph_clock_now()
9662 << ", re-queuing recovery" << dendl;
11fdf7f2 9663 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9664 service.recovery_needs_sleep = false;
9665 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9666 });
9667
 9668 // This is true for the first recovery op and when the previous recovery op
 9669 // was scheduled in the past. In that case the next recovery op is scheduled
 9670 // relative to now, after the sleep completes.
f67539c2 9671
9f95a23c
TL
9672 if (auto now = ceph::real_clock::now();
9673 service.recovery_schedule_time < now) {
9674 service.recovery_schedule_time = now;
b32b8144 9675 }
9f95a23c 9676 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
11fdf7f2 9677 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9f95a23c 9678 recovery_requeue_callback);
b32b8144
FG
9679 dout(20) << "Recovery event scheduled at "
9680 << service.recovery_schedule_time << dendl;
9681 return;
9682 }
7c673cae
FG
9683 }
9684
9685 {
b32b8144 9686 {
11fdf7f2 9687 std::lock_guard l(service.sleep_lock);
b32b8144
FG
9688 service.recovery_needs_sleep = true;
9689 }
9690
7c673cae
FG
9691 if (pg->pg_has_reset_since(queued)) {
9692 goto out;
9693 }
9694
7c673cae
FG
9695 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9696#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2 9697 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
7c673cae
FG
9698#endif
9699
11fdf7f2 9700 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
f67539c2 9701 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
7c673cae
FG
9702 << " on " << *pg << dendl;
9703
11fdf7f2 9704 if (do_unfound) {
20effc67 9705 PeeringCtx rctx;
11fdf7f2 9706 rctx.handle = &handle;
9f95a23c 9707 pg->find_unfound(queued, rctx);
11fdf7f2 9708 dispatch_context(rctx, pg, pg->get_osdmap());
7c673cae 9709 }
7c673cae
FG
9710 }
9711
9712 out:
11fdf7f2 9713 ceph_assert(started <= reserved_pushes);
7c673cae
FG
9714 service.release_reserved_pushes(reserved_pushes);
9715}
9716
9717void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9718{
11fdf7f2 9719 std::lock_guard l(recovery_lock);
7c673cae
FG
9720 dout(10) << "start_recovery_op " << *pg << " " << soid
9721 << " (" << recovery_ops_active << "/"
9f95a23c 9722 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9723 << dendl;
9724 recovery_ops_active++;
9725
9726#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9727 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9728 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9729 recovery_oids[pg->pg_id].insert(soid);
7c673cae
FG
9730#endif
9731}
9732
9733void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9734{
11fdf7f2 9735 std::lock_guard l(recovery_lock);
7c673cae
FG
9736 dout(10) << "finish_recovery_op " << *pg << " " << soid
9737 << " dequeue=" << dequeue
9f95a23c
TL
9738 << " (" << recovery_ops_active << "/"
9739 << osd->get_recovery_max_active() << " rops)"
7c673cae
FG
9740 << dendl;
9741
9742 // adjust count
11fdf7f2 9743 ceph_assert(recovery_ops_active > 0);
7c673cae
FG
9744 recovery_ops_active--;
9745
9746#ifdef DEBUG_RECOVERY_OIDS
11fdf7f2
TL
9747 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9748 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9749 recovery_oids[pg->pg_id].erase(soid);
7c673cae
FG
9750#endif
9751
9752 _maybe_queue_recovery();
9753}
9754
9755bool OSDService::is_recovery_active()
9756{
eafe8130
TL
9757 if (cct->_conf->osd_debug_pretend_recovery_active) {
9758 return true;
9759 }
b5b8bbf5 9760 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9761}
9762
11fdf7f2
TL
9763void OSDService::release_reserved_pushes(uint64_t pushes)
9764{
9765 std::lock_guard l(recovery_lock);
9766 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9767 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9768 << dendl;
9769 ceph_assert(recovery_ops_reserved >= pushes);
9770 recovery_ops_reserved -= pushes;
9771 _maybe_queue_recovery();
9772}
9773
7c673cae
FG
9774// =========================================================
9775// OPS
9776
9777bool OSD::op_is_discardable(const MOSDOp *op)
9778{
9779 // drop client request if they are not connected and can't get the
9780 // reply anyway.
9781 if (!op->get_connection()->is_connected()) {
9782 return true;
9783 }
9784 return false;
9785}
9786
11fdf7f2 9787void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
7c673cae 9788{
11fdf7f2
TL
9789 const utime_t stamp = op->get_req()->get_recv_stamp();
9790 const utime_t latency = ceph_clock_now() - stamp;
9791 const unsigned priority = op->get_req()->get_priority();
9792 const int cost = op->get_req()->get_cost();
9793 const uint64_t owner = op->get_req()->get_source().num();
f67539c2 9794 const int type = op->get_req()->get_type();
11fdf7f2
TL
9795
9796 dout(15) << "enqueue_op " << op << " prio " << priority
f67539c2 9797 << " type " << type
11fdf7f2 9798 << " cost " << cost
7c673cae
FG
9799 << " latency " << latency
9800 << " epoch " << epoch
9801 << " " << *(op->get_req()) << dendl;
9802 op->osd_trace.event("enqueue op");
11fdf7f2
TL
9803 op->osd_trace.keyval("priority", priority);
9804 op->osd_trace.keyval("cost", cost);
20effc67
TL
9805
9806 auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
9807 enqueue_span->AddEvent(__func__, {
9808 {"priority", priority},
9809 {"cost", cost},
9810 {"epoch", epoch},
9811 {"owner", owner},
9812 {"type", type}
9813 });
9814
7c673cae 9815 op->mark_queued_for_pg();
224ce89b 9816 logger->tinc(l_osd_op_before_queue_op_lat, latency);
f67539c2
TL
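  // recovery pushes and push replies are queued as PGRecoveryMsg items so
  // the op scheduler can account for them separately from client PGOpItem
  // work.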
9817 if (type == MSG_OSD_PG_PUSH ||
9818 type == MSG_OSD_PG_PUSH_REPLY) {
9819 op_shardedwq.queue(
9820 OpSchedulerItem(
9821 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9822 cost, priority, stamp, owner, epoch));
9823 } else {
9824 op_shardedwq.queue(
9825 OpSchedulerItem(
9826 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9827 cost, priority, stamp, owner, epoch));
9828 }
7c673cae
FG
9829}
9830
11fdf7f2
TL
9831void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9832{
9833 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9834 op_shardedwq.queue(
9f95a23c
TL
9835 OpSchedulerItem(
9836 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
11fdf7f2
TL
9837 10,
9838 cct->_conf->osd_peering_op_priority,
9839 utime_t(),
9840 0,
9841 evt->get_epoch_sent()));
9842}
7c673cae
FG
9843
9844/*
9845 * NOTE: dequeue called in worker thread, with pg lock
9846 */
9847void OSD::dequeue_op(
9848 PGRef pg, OpRequestRef op,
9849 ThreadPool::TPHandle &handle)
9850{
9f95a23c
TL
9851 const Message *m = op->get_req();
9852
11fdf7f2 9853 FUNCTRACE(cct);
9f95a23c 9854 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
7c673cae
FG
9855
9856 utime_t now = ceph_clock_now();
9857 op->set_dequeued_time(now);
9f95a23c
TL
9858
9859 utime_t latency = now - m->get_recv_stamp();
9860 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9861 << " cost " << m->get_cost()
7c673cae 9862 << " latency " << latency
9f95a23c 9863 << " " << *m
7c673cae
FG
9864 << " pg " << *pg << dendl;
9865
224ce89b
WB
9866 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9867
9f95a23c
TL
9868 service.maybe_share_map(m->get_connection().get(),
9869 pg->get_osdmap(),
9870 op->sent_epoch);
7c673cae 9871
11fdf7f2 9872 if (pg->is_deleting())
7c673cae
FG
9873 return;
9874
9875 op->mark_reached_pg();
9876 op->osd_trace.event("dequeue_op");
9877
9878 pg->do_request(op, handle);
9879
9880 // finish
9881 dout(10) << "dequeue_op " << op << " finish" << dendl;
9f95a23c 9882 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
7c673cae
FG
9883}
9884
9885
11fdf7f2
TL
9886void OSD::dequeue_peering_evt(
9887 OSDShard *sdata,
9888 PG *pg,
9889 PGPeeringEventRef evt,
9890 ThreadPool::TPHandle& handle)
7c673cae 9891{
11fdf7f2 9892 auto curmap = sdata->get_osdmap();
9f95a23c
TL
9893 bool need_up_thru = false;
9894 epoch_t same_interval_since = 0;
11fdf7f2
TL
9895 if (!pg) {
9896 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9897 handle_pg_query_nopg(*q);
7c673cae 9898 } else {
11fdf7f2
TL
9899 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9900 ceph_abort();
9901 }
20effc67
TL
9902 } else if (PeeringCtx rctx;
9903 advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9f95a23c 9904 pg->do_peering_event(evt, rctx);
11fdf7f2 9905 if (pg->is_deleted()) {
11fdf7f2
TL
9906 pg->unlock();
9907 return;
7c673cae 9908 }
9f95a23c 9909 dispatch_context(rctx, pg, curmap, &handle);
11fdf7f2
TL
9910 need_up_thru = pg->get_need_up_thru();
9911 same_interval_since = pg->get_same_interval_since();
7c673cae
FG
9912 pg->unlock();
9913 }
11fdf7f2
TL
9914
9915 if (need_up_thru) {
7c673cae 9916 queue_want_up_thru(same_interval_since);
11fdf7f2 9917 }
7c673cae
FG
9918
9919 service.send_pg_temp();
9920}
9921
11fdf7f2
TL
9922void OSD::dequeue_delete(
9923 OSDShard *sdata,
9924 PG *pg,
9925 epoch_t e,
9926 ThreadPool::TPHandle& handle)
9927{
9928 dequeue_peering_evt(
9929 sdata,
9930 pg,
9931 PGPeeringEventRef(
9932 std::make_shared<PGPeeringEvent>(
9933 e, e,
9f95a23c 9934 PeeringState::DeleteSome())),
11fdf7f2
TL
9935 handle);
9936}
9937
9938
9939
7c673cae
FG
9940// --------------------------------
9941
9942const char** OSD::get_tracked_conf_keys() const
9943{
9944 static const char* KEYS[] = {
9945 "osd_max_backfills",
9946 "osd_min_recovery_priority",
224ce89b
WB
9947 "osd_max_trimming_pgs",
9948 "osd_op_complaint_time",
9949 "osd_op_log_threshold",
9950 "osd_op_history_size",
9951 "osd_op_history_duration",
9952 "osd_op_history_slow_op_size",
9953 "osd_op_history_slow_op_threshold",
7c673cae
FG
9954 "osd_enable_op_tracker",
9955 "osd_map_cache_size",
11fdf7f2 9956 "osd_pg_epoch_max_lag_factor",
7c673cae 9957 "osd_pg_epoch_persisted_max_stale",
f67539c2
TL
9958 "osd_recovery_sleep",
9959 "osd_recovery_sleep_hdd",
9960 "osd_recovery_sleep_ssd",
9961 "osd_recovery_sleep_hybrid",
b3b6e05e
TL
9962 "osd_delete_sleep",
9963 "osd_delete_sleep_hdd",
9964 "osd_delete_sleep_ssd",
9965 "osd_delete_sleep_hybrid",
9966 "osd_snap_trim_sleep",
9967 "osd_snap_trim_sleep_hdd",
9968 "osd_snap_trim_sleep_ssd",
20effc67 9969 "osd_snap_trim_sleep_hybrid",
b3b6e05e 9970 "osd_scrub_sleep",
f67539c2
TL
9971 "osd_recovery_max_active",
9972 "osd_recovery_max_active_hdd",
9973 "osd_recovery_max_active_ssd",
7c673cae
FG
9974 // clog & admin clog
9975 "clog_to_monitors",
9976 "clog_to_syslog",
9977 "clog_to_syslog_facility",
9978 "clog_to_syslog_level",
9979 "osd_objectstore_fuse",
9980 "clog_to_graylog",
9981 "clog_to_graylog_host",
9982 "clog_to_graylog_port",
9983 "host",
9984 "fsid",
9985 "osd_recovery_delay_start",
9986 "osd_client_message_size_cap",
9987 "osd_client_message_cap",
31f18b77
FG
9988 "osd_heartbeat_min_size",
9989 "osd_heartbeat_interval",
9f95a23c 9990 "osd_object_clean_region_max_num_intervals",
494da23a
TL
9991 "osd_scrub_min_interval",
9992 "osd_scrub_max_interval",
7c673cae
FG
9993 NULL
9994 };
9995 return KEYS;
9996}
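// Illustrative sketch (assumed example, not upstream code): if an admin runs
//   ceph config set osd.0 osd_max_backfills 3
// the monitor pushes the new value, this observer is notified, and
// handle_conf_change() below runs with changed == {"osd_max_backfills"},
// taking either the plain reserver branch or the mClock override branch.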
9997
11fdf7f2 9998void OSD::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
9999 const std::set <std::string> &changed)
10000{
9f95a23c 10001 std::lock_guard l{osd_lock};
f67539c2
TL
10002
10003 if (changed.count("osd_max_backfills") ||
39ae355f
TL
10004 changed.count("osd_recovery_max_active") ||
10005 changed.count("osd_recovery_max_active_hdd") ||
10006 changed.count("osd_recovery_max_active_ssd")) {
10007 if (!maybe_override_options_for_qos(&changed) &&
10008 changed.count("osd_max_backfills")) {
10009 // Scheduler is not "mclock". Fallback to earlier behavior
10010 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
10011 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
10012 }
10013 }
10014 if (changed.count("osd_delete_sleep") ||
f67539c2
TL
10015 changed.count("osd_delete_sleep_hdd") ||
10016 changed.count("osd_delete_sleep_ssd") ||
10017 changed.count("osd_delete_sleep_hybrid") ||
10018 changed.count("osd_snap_trim_sleep") ||
10019 changed.count("osd_snap_trim_sleep_hdd") ||
10020 changed.count("osd_snap_trim_sleep_ssd") ||
10021 changed.count("osd_snap_trim_sleep_hybrid") ||
10022 changed.count("osd_scrub_sleep") ||
10023 changed.count("osd_recovery_sleep") ||
10024 changed.count("osd_recovery_sleep_hdd") ||
10025 changed.count("osd_recovery_sleep_ssd") ||
39ae355f
TL
10026 changed.count("osd_recovery_sleep_hybrid")) {
10027 maybe_override_sleep_options_for_qos();
7c673cae
FG
10028 }
10029 if (changed.count("osd_min_recovery_priority")) {
10030 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10031 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10032 }
10033 if (changed.count("osd_max_trimming_pgs")) {
10034 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
10035 }
10036 if (changed.count("osd_op_complaint_time") ||
10037 changed.count("osd_op_log_threshold")) {
10038 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
10039 cct->_conf->osd_op_log_threshold);
10040 }
10041 if (changed.count("osd_op_history_size") ||
10042 changed.count("osd_op_history_duration")) {
10043 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
10044 cct->_conf->osd_op_history_duration);
10045 }
10046 if (changed.count("osd_op_history_slow_op_size") ||
10047 changed.count("osd_op_history_slow_op_threshold")) {
10048 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
10049 cct->_conf->osd_op_history_slow_op_threshold);
10050 }
10051 if (changed.count("osd_enable_op_tracker")) {
10052 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10053 }
7c673cae
FG
10054 if (changed.count("osd_map_cache_size")) {
10055 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10056 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10057 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10058 }
10059 if (changed.count("clog_to_monitors") ||
10060 changed.count("clog_to_syslog") ||
10061 changed.count("clog_to_syslog_level") ||
10062 changed.count("clog_to_syslog_facility") ||
10063 changed.count("clog_to_graylog") ||
10064 changed.count("clog_to_graylog_host") ||
10065 changed.count("clog_to_graylog_port") ||
10066 changed.count("host") ||
10067 changed.count("fsid")) {
10068 update_log_config();
10069 }
11fdf7f2
TL
10070 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10071 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10072 "osd_pg_epoch_max_lag_factor");
10073 }
7c673cae
FG
10074
10075#ifdef HAVE_LIBFUSE
10076 if (changed.count("osd_objectstore_fuse")) {
10077 if (store) {
10078 enable_disable_fuse(false);
10079 }
10080 }
10081#endif
10082
10083 if (changed.count("osd_recovery_delay_start")) {
10084 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10085 service.kick_recovery_queue();
10086 }
10087
10088 if (changed.count("osd_client_message_cap")) {
10089 uint64_t newval = cct->_conf->osd_client_message_cap;
10090 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
20effc67 10091 if (pol.throttler_messages) {
7c673cae
FG
10092 pol.throttler_messages->reset_max(newval);
10093 }
10094 }
10095 if (changed.count("osd_client_message_size_cap")) {
10096 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10097 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
20effc67 10098 if (pol.throttler_bytes) {
7c673cae
FG
10099 pol.throttler_bytes->reset_max(newval);
10100 }
10101 }
9f95a23c
TL
10102 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10103 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10104 }
7c673cae 10105
494da23a
TL
10106 if (changed.count("osd_scrub_min_interval") ||
10107 changed.count("osd_scrub_max_interval")) {
10108 resched_all_scrubs();
10109 dout(0) << __func__ << ": scrub interval change" << dendl;
10110 }
7c673cae 10111 check_config();
f67539c2
TL
10112 if (changed.count("osd_asio_thread_count")) {
10113 service.poolctx.stop();
10114 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10115 }
7c673cae
FG
10116}
10117
a4b75251
TL
10118void OSD::maybe_override_max_osd_capacity_for_qos()
10119{
10120 // If the enabled scheduler is mclock, override the default
10121 // osd capacity with the value obtained from running the
10122 // osd bench test. This is later used to set up mclock.
10123 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
20effc67
TL
10124 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
10125 (!unsupported_objstore_for_qos())) {
a4b75251
TL
10126 std::string max_capacity_iops_config;
10127 bool force_run_benchmark =
10128 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
10129
10130 if (store_is_rotational) {
10131 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
10132 } else {
10133 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
10134 }
10135
39ae355f
TL
10136 double default_iops = 0.0;
10137 double cur_iops = 0.0;
a4b75251 10138 if (!force_run_benchmark) {
a4b75251 10139 // Get the current osd iops capacity
39ae355f 10140 cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
a4b75251
TL
10141
10142 // Get the default max iops capacity
10143 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
10144 if (!val.has_value()) {
10145 derr << __func__ << " Unable to determine default value of "
10146 << max_capacity_iops_config << dendl;
10147 // Cannot determine default iops. Force a run of the OSD benchmark.
10148 force_run_benchmark = true;
10149 } else {
10150 // Default iops
10151 default_iops = std::stod(val.value());
10152 }
10153
10154 // Determine if we really need to run the osd benchmark
10155 if (!force_run_benchmark && (default_iops != cur_iops)) {
10156 dout(1) << __func__ << std::fixed << std::setprecision(2)
10157 << " default_iops: " << default_iops
10158 << " cur_iops: " << cur_iops
10159 << ". Skip OSD benchmark test." << dendl;
10160 return;
10161 }
10162 }
10163
10164 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
10165 int64_t count = 12288000; // Count of bytes to write
10166 int64_t bsize = 4096; // Block size
10167 int64_t osize = 4194304; // Object size
10168 int64_t onum = 100; // Count of objects to write
10169 double elapsed = 0.0; // Time taken to complete the test
10170 double iops = 0.0;
10171 stringstream ss;
10172 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
10173 if (ret != 0) {
10174 derr << __func__
10175 << " osd bench err: " << ret
10176 << " osd bench errstr: " << ss.str()
10177 << dendl;
10178 return;
10179 }
10180
10181 double rate = count / elapsed;
10182 iops = rate / bsize;
10183 dout(1) << __func__
10184 << " osd bench result -"
10185 << std::fixed << std::setprecision(3)
10186 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
10187 << " iops: " << iops
10188 << " elapsed_sec: " << elapsed
10189 << dendl;
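// Worked example (assumed elapsed time; the other numbers are the constants
// above): with count = 12288000 B, bsize = 4096 B and elapsed = 3.0 s,
//   rate = count / elapsed = 12288000 / 3.0 = 4096000 B/s (~3.91 MiB/s)
//   iops = rate / bsize    = 4096000 / 4096  = 1000
// so an OSD finishing the bench in 3 s would report roughly 1000 IOPS here.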
10190
39ae355f
TL
10191 // Get the threshold IOPS set for the underlying hdd/ssd.
10192 double threshold_iops = 0.0;
10193 if (store_is_rotational) {
10194 threshold_iops = cct->_conf.get_val<double>(
10195 "osd_mclock_iops_capacity_threshold_hdd");
10196 } else {
10197 threshold_iops = cct->_conf.get_val<double>(
10198 "osd_mclock_iops_capacity_threshold_ssd");
10199 }
10200
10201 // Persist the measured iops value to the MON store, or raise a cluster
10202 // warning if it exceeds the configured threshold. If the threshold is
10203 // exceeded, the existing IOPS capacity is left unchanged.
10204 if (iops > threshold_iops) {
10205 clog->warn() << "OSD bench result of " << std::to_string(iops)
10206 << " IOPS exceeded the threshold limit of "
10207 << std::to_string(threshold_iops) << " IOPS for osd."
10208 << std::to_string(whoami) << ". IOPS capacity is unchanged"
10209 << " at " << std::to_string(cur_iops) << " IOPS. The"
10210 << " recommendation is to establish the osd's IOPS capacity"
10211 << " using other benchmark tools (e.g. Fio) and then"
10212 << " override osd_mclock_max_capacity_iops_[hdd|ssd].";
10213 } else {
10214 mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
a4b75251 10215 }
39ae355f
TL
10216 }
10217}
a4b75251 10218
39ae355f
TL
10219bool OSD::maybe_override_options_for_qos(const std::set<std::string> *changed)
10220{
10221 // Override options only if the enabled scheduler is mclock and the
10222 // underlying objectstore is supported by mclock
10223 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10224 !unsupported_objstore_for_qos()) {
10225 static const std::map<std::string, uint64_t> recovery_qos_defaults {
10226 {"osd_recovery_max_active", 0},
10227 {"osd_recovery_max_active_hdd", 10},
10228 {"osd_recovery_max_active_ssd", 20},
10229 {"osd_max_backfills", 10},
10230 };
10231
10232 // Check if we were called because of a configuration change
10233 if (changed != nullptr) {
10234 if (cct->_conf.get_val<bool>("osd_mclock_override_recovery_settings")) {
10235 if (changed->count("osd_max_backfills")) {
10236 dout(1) << __func__ << " Set local and remote max backfills to "
10237 << cct->_conf->osd_max_backfills << dendl;
10238 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
10239 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
10240 }
10241 } else {
10242 // Recovery options change was attempted without setting
10243 // the 'osd_mclock_override_recovery_settings' option.
10244 // Find the key to remove from the configuration db.
10245 std::string key;
10246 if (changed->count("osd_max_backfills")) {
10247 key = "osd_max_backfills";
10248 } else if (changed->count("osd_recovery_max_active")) {
10249 key = "osd_recovery_max_active";
10250 } else if (changed->count("osd_recovery_max_active_hdd")) {
10251 key = "osd_recovery_max_active_hdd";
10252 } else if (changed->count("osd_recovery_max_active_ssd")) {
10253 key = "osd_recovery_max_active_ssd";
10254 } else {
10255 // No key that we are interested in. Return.
10256 return true;
10257 }
10258
10259 // Remove the current entry from the configuration if
10260 // different from its default value.
10261 auto val = recovery_qos_defaults.find(key);
10262 if (val != recovery_qos_defaults.end() &&
10263 cct->_conf.get_val<uint64_t>(key) != val->second) {
10264 static const std::vector<std::string> osds = {
10265 "osd",
10266 "osd." + std::to_string(whoami)
10267 };
10268
10269 for (auto osd : osds) {
10270 std::string cmd =
10271 "{"
10272 "\"prefix\": \"config rm\", "
10273 "\"who\": \"" + osd + "\", "
10274 "\"name\": \"" + key + "\""
10275 "}";
10276 vector<std::string> vcmd{cmd};
10277
10278 dout(1) << __func__ << " Removing Key: " << key
10279 << " for " << osd << " from Mon db" << dendl;
10280 monc->start_mon_command(vcmd, {}, nullptr, nullptr, nullptr);
10281 }
10282
10283 // Raise a cluster warning indicating that the changes did not
10284 // take effect and indicate the reason why.
10285 clog->warn() << "Change to " << key << " on osd."
10286 << std::to_string(whoami) << " did not take effect."
10287 << " Enable osd_mclock_override_recovery_settings before"
10288 << " setting this option.";
10289 }
10290 }
10291 } else { // if (changed != nullptr) (osd boot-up)
10292 // Override the default recovery max active and max backfills to
10293 // higher values based on the type of backing device (hdd/ssd).
10294 // This section is executed only during osd boot-up.
10295 for (auto opt : recovery_qos_defaults) {
10296 cct->_conf.set_val_default(opt.first, std::to_string(opt.second));
10297 if (opt.first == "osd_max_backfills") {
10298 service.local_reserver.set_max(opt.second);
10299 service.remote_reserver.set_max(opt.second);
10300 }
10301 dout(1) << __func__ << " Set default value for " << opt.first
10302 << " to " << opt.second << dendl;
10303 }
a4b75251 10304 }
39ae355f 10305 return true;
a4b75251 10306 }
39ae355f 10307 return false;
a4b75251
TL
10308}
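// Illustrative example (assumes whoami == 0): if osd_max_backfills is set to
// something other than its mClock default while
// osd_mclock_override_recovery_settings is false, the loop above sends
//   {"prefix": "config rm", "who": "osd", "name": "osd_max_backfills"}
//   {"prefix": "config rm", "who": "osd.0", "name": "osd_max_backfills"}
// so the attempted override is removed again, and the cluster warning explains
// why it did not take effect.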
10309
39ae355f 10310void OSD::maybe_override_sleep_options_for_qos()
b3b6e05e 10311{
39ae355f
TL
10312 // Override options only if the enabled scheduler is mclock and the
10313 // underlying objectstore is supported by mclock
20effc67
TL
10314 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10315 !unsupported_objstore_for_qos()) {
b3b6e05e 10316
39ae355f 10317 // Override the various sleep settings
b3b6e05e
TL
10318 // Disable recovery sleep
10319 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10320 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10321 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10322 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10323
10324 // Disable delete sleep
10325 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10326 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10327 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10328 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10329
10330 // Disable snap trim sleep
10331 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10332 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10333 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10334 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10335
10336 // Disable scrub sleep
10337 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
b3b6e05e 10338 }
b3b6e05e
TL
10339}
10340
39ae355f
TL
10341/**
10342 * A context for receiving status from a background mon command to set
10343 * a config option and optionally apply the changes on each op shard.
10344 */
10345class MonCmdSetConfigOnFinish : public Context {
10346 OSD *osd;
10347 CephContext *cct;
10348 std::string key;
10349 std::string val;
10350 bool update_shard;
10351public:
10352 explicit MonCmdSetConfigOnFinish(
10353 OSD *o,
10354 CephContext *cct,
10355 const std::string &k,
10356 const std::string &v,
10357 const bool s)
10358 : osd(o), cct(cct), key(k), val(v), update_shard(s) {}
10359 void finish(int r) override {
10360 if (r != 0) {
10361 // Fallback to setting the config within the in-memory "values" map.
10362 cct->_conf.set_val_default(key, val);
10363 }
10364
10365 // If requested, apply this option on the
10366 // active scheduler of each op shard.
10367 if (update_shard) {
10368 for (auto& shard : osd->shards) {
10369 shard->update_scheduler_config();
10370 }
10371 }
10372 }
10373};
10374
10375void OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
a4b75251
TL
10376{
10377 std::string cmd =
10378 "{"
10379 "\"prefix\": \"config set\", "
10380 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10381 "\"name\": \"" + key + "\", "
10382 "\"value\": \"" + val + "\""
10383 "}";
a4b75251 10384 vector<std::string> vcmd{cmd};
a4b75251 10385
39ae355f
TL
10386 // List of config options to be distributed across each op shard.
10387 // Currently limited to a couple of mClock options.
10388 static const std::vector<std::string> shard_option =
10389 { "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd" };
10390 const bool update_shard = std::find(shard_option.begin(),
10391 shard_option.end(),
10392 key) != shard_option.end();
10393
10394 auto on_finish = new MonCmdSetConfigOnFinish(this, cct, key,
10395 val, update_shard);
10396 dout(10) << __func__ << " Set " << key << " = " << val << dendl;
10397 monc->start_mon_command(vcmd, {}, nullptr, nullptr, on_finish);
a4b75251
TL
10398}
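// Illustrative example (assumed values: whoami == 0, an SSD-backed OSD whose
// bench measured 21500 IOPS): the command built above would be
//   {"prefix": "config set", "who": "osd.0",
//    "name": "osd_mclock_max_capacity_iops_ssd", "value": "21500.000000"}
// and, since that key is listed in shard_option, each shard's scheduler is
// asked to reload its configuration once the mon command completes.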
10399
20effc67
TL
10400bool OSD::unsupported_objstore_for_qos()
10401{
10402 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10403 return std::find(unsupported_objstores.begin(),
10404 unsupported_objstores.end(),
10405 store->get_type()) != unsupported_objstores.end();
10406}
10407
7c673cae
FG
10408void OSD::update_log_config()
10409{
20effc67
TL
10410 auto parsed_options = clog->parse_client_options(cct);
10411 derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
7c673cae
FG
10412}
10413
10414void OSD::check_config()
10415{
10416 // some sanity checks
7c673cae
FG
10417 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10418 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10419 << " is not > osd_pg_epoch_persisted_max_stale ("
10420 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10421 }
9f95a23c 10422 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
f67539c2 10423 clog->warn() << "osd_object_clean_region_max_num_intervals ("
9f95a23c
TL
10424 << cct->_conf->osd_object_clean_region_max_num_intervals
10425 << ") is < 0";
10426 }
7c673cae
FG
10427}
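// Illustrative example (assumed values): with osd_map_cache_size = 50 and
// osd_pg_epoch_persisted_max_stale = 48, the first check above fires because
// 50 <= 48 + 2, and a clog warning is emitted.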
10428
7c673cae
FG
10429// --------------------------------
10430
10431void OSD::get_latest_osdmap()
10432{
10433 dout(10) << __func__ << " -- start" << dendl;
10434
f67539c2
TL
10435 boost::system::error_code ec;
10436 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
7c673cae
FG
10437
10438 dout(10) << __func__ << " -- finish" << dendl;
10439}
10440
10441// --------------------------------
10442
9f95a23c
TL
10443void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10444 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10445 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
11fdf7f2
TL
10446 dout(10) << "setting " << queries.size() << " queries" << dendl;
10447
10448 std::list<OSDPerfMetricQuery> supported_queries;
10449 for (auto &it : queries) {
10450 auto &query = it.first;
10451 if (!query.key_descriptor.empty()) {
10452 supported_queries.push_back(query);
10453 }
10454 }
10455 if (supported_queries.size() < queries.size()) {
10456 dout(1) << queries.size() - supported_queries.size()
10457 << " unsupported queries" << dendl;
10458 }
11fdf7f2 10459 {
9f95a23c 10460 std::lock_guard locker{m_perf_queries_lock};
11fdf7f2
TL
10461 m_perf_queries = supported_queries;
10462 m_perf_limits = queries;
10463 }
11fdf7f2
TL
10464 std::vector<PGRef> pgs;
10465 _get_pgs(&pgs);
10466 for (auto& pg : pgs) {
9f95a23c 10467 std::scoped_lock l{*pg};
eafe8130 10468 pg->set_dynamic_perf_stats_queries(supported_queries);
7c673cae 10469 }
7c673cae
FG
10470}
10471
9f95a23c
TL
10472MetricPayload OSD::get_perf_reports() {
10473 OSDMetricPayload payload;
10474 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10475
11fdf7f2
TL
10476 std::vector<PGRef> pgs;
10477 _get_pgs(&pgs);
10478 DynamicPerfStats dps;
10479 for (auto& pg : pgs) {
eafe8130
TL
10480 // m_perf_queries can be modified only in set_perf_queries by mgr client
10481 // request, and it is protected by the mgr client's lock, which is held
10482 // when set_perf_queries/get_perf_reports are called, so we do not need to
10483 // hold m_perf_queries_lock here.
10484 DynamicPerfStats pg_dps(m_perf_queries);
10485 pg->lock();
10486 pg->get_dynamic_perf_stats(&pg_dps);
10487 pg->unlock();
10488 dps.merge(pg_dps);
11fdf7f2 10489 }
9f95a23c
TL
10490 dps.add_to_reports(m_perf_limits, &reports);
10491 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10492
10493 return payload;
11fdf7f2 10494}
224ce89b 10495
7c673cae
FG
10496// =============================================================
10497
10498#undef dout_context
11fdf7f2 10499#define dout_context cct
7c673cae 10500#undef dout_prefix
11fdf7f2 10501#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
7c673cae 10502
11fdf7f2 10503void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
7c673cae 10504{
11fdf7f2
TL
10505 dout(10) << pg->pg_id << " " << pg << dendl;
10506 slot->pg = pg;
10507 pg->osd_shard = this;
10508 pg->pg_slot = slot;
10509 osd->inc_num_pgs();
10510
10511 slot->epoch = pg->get_osdmap_epoch();
10512 pg_slots_by_epoch.insert(*slot);
10513}
10514
10515void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10516{
10517 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10518 slot->pg->osd_shard = nullptr;
10519 slot->pg->pg_slot = nullptr;
10520 slot->pg = nullptr;
10521 osd->dec_num_pgs();
10522
10523 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10524 slot->epoch = 0;
10525 if (waiting_for_min_pg_epoch) {
10526 min_pg_epoch_cond.notify_all();
10527 }
10528}
10529
10530void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10531{
10532 std::lock_guard l(shard_lock);
10533 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10534 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10535 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10536 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10537 slot->epoch = e;
10538 pg_slots_by_epoch.insert(*slot);
10539 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10540 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10541 if (waiting_for_min_pg_epoch) {
10542 min_pg_epoch_cond.notify_all();
10543 }
10544}
10545
10546epoch_t OSDShard::get_min_pg_epoch()
10547{
10548 std::lock_guard l(shard_lock);
10549 auto p = pg_slots_by_epoch.begin();
10550 if (p == pg_slots_by_epoch.end()) {
10551 return 0;
10552 }
10553 return p->epoch;
10554}
10555
10556void OSDShard::wait_min_pg_epoch(epoch_t need)
10557{
10558 std::unique_lock l{shard_lock};
10559 ++waiting_for_min_pg_epoch;
10560 min_pg_epoch_cond.wait(l, [need, this] {
10561 if (pg_slots_by_epoch.empty()) {
10562 return true;
10563 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10564 return true;
10565 } else {
10566 dout(10) << need << " waiting on "
10567 << pg_slots_by_epoch.begin()->epoch << dendl;
10568 return false;
10569 }
10570 });
10571 --waiting_for_min_pg_epoch;
10572}
10573
10574epoch_t OSDShard::get_max_waiting_epoch()
10575{
10576 std::lock_guard l(shard_lock);
10577 epoch_t r = 0;
10578 for (auto& i : pg_slots) {
10579 if (!i.second->waiting_peering.empty()) {
10580 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10581 }
10582 }
10583 return r;
10584}
10585
10586void OSDShard::consume_map(
9f95a23c 10587 const OSDMapRef& new_osdmap,
11fdf7f2
TL
10588 unsigned *pushes_to_free)
10589{
10590 std::lock_guard l(shard_lock);
10591 OSDMapRef old_osdmap;
7c673cae 10592 {
11fdf7f2
TL
10593 std::lock_guard l(osdmap_lock);
10594 old_osdmap = std::move(shard_osdmap);
10595 shard_osdmap = new_osdmap;
10596 }
10597 dout(10) << new_osdmap->get_epoch()
10598 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10599 << dendl;
20effc67 10600 int queued = 0;
11fdf7f2
TL
10601
10602 // check slots
10603 auto p = pg_slots.begin();
10604 while (p != pg_slots.end()) {
10605 OSDShardPGSlot *slot = p->second.get();
10606 const spg_t& pgid = p->first;
10607 dout(20) << __func__ << " " << pgid << dendl;
10608 if (!slot->waiting_for_split.empty()) {
10609 dout(20) << __func__ << " " << pgid
10610 << " waiting for split " << slot->waiting_for_split << dendl;
10611 ++p;
10612 continue;
10613 }
10614 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10615 dout(20) << __func__ << " " << pgid
10616 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10617 << dendl;
10618 ++p;
10619 continue;
10620 }
10621 if (!slot->waiting_peering.empty()) {
10622 epoch_t first = slot->waiting_peering.begin()->first;
10623 if (first <= new_osdmap->get_epoch()) {
10624 dout(20) << __func__ << " " << pgid
10625 << " pending_peering first epoch " << first
10626 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
20effc67 10627 queued += _wake_pg_slot(pgid, slot);
11fdf7f2
TL
10628 }
10629 ++p;
10630 continue;
10631 }
10632 if (!slot->waiting.empty()) {
10633 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10634 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10635 << dendl;
10636 ++p;
10637 continue;
7c673cae 10638 }
11fdf7f2
TL
10639 while (!slot->waiting.empty() &&
10640 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10641 auto& qi = slot->waiting.front();
10642 dout(20) << __func__ << " " << pgid
10643 << " waiting item " << qi
10644 << " epoch " << qi.get_map_epoch()
10645 << " <= " << new_osdmap->get_epoch()
10646 << ", "
10647 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10648 "misdirected")
10649 << ", dropping" << dendl;
10650 *pushes_to_free += qi.get_reserved_pushes();
10651 slot->waiting.pop_front();
10652 }
10653 }
10654 if (slot->waiting.empty() &&
10655 slot->num_running == 0 &&
10656 slot->waiting_for_split.empty() &&
10657 !slot->pg) {
10658 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10659 p = pg_slots.erase(p);
10660 continue;
7c673cae 10661 }
11fdf7f2
TL
10662
10663 ++p;
7c673cae 10664 }
7c673cae 10665 if (queued) {
11fdf7f2 10666 std::lock_guard l{sdata_wait_lock};
20effc67
TL
10667 if (queued == 1)
10668 sdata_cond.notify_one();
10669 else
10670 sdata_cond.notify_all();
7c673cae
FG
10671 }
10672}
10673
20effc67 10674int OSDShard::_wake_pg_slot(
11fdf7f2
TL
10675 spg_t pgid,
10676 OSDShardPGSlot *slot)
10677{
20effc67 10678 int count = 0;
11fdf7f2
TL
10679 dout(20) << __func__ << " " << pgid
10680 << " to_process " << slot->to_process
10681 << " waiting " << slot->waiting
10682 << " waiting_peering " << slot->waiting_peering << dendl;
10683 for (auto i = slot->to_process.rbegin();
10684 i != slot->to_process.rend();
10685 ++i) {
9f95a23c 10686 scheduler->enqueue_front(std::move(*i));
20effc67 10687 count++;
11fdf7f2
TL
10688 }
10689 slot->to_process.clear();
10690 for (auto i = slot->waiting.rbegin();
10691 i != slot->waiting.rend();
10692 ++i) {
9f95a23c 10693 scheduler->enqueue_front(std::move(*i));
20effc67 10694 count++;
11fdf7f2
TL
10695 }
10696 slot->waiting.clear();
10697 for (auto i = slot->waiting_peering.rbegin();
10698 i != slot->waiting_peering.rend();
10699 ++i) {
10700 // this is overkill; we requeue everything, even if some of these
10701 // items are waiting for maps we don't have yet. FIXME, maybe,
10702 // someday, if we decide this inefficiency matters
10703 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
9f95a23c 10704 scheduler->enqueue_front(std::move(*j));
20effc67 10705 count++;
11fdf7f2
TL
10706 }
10707 }
10708 slot->waiting_peering.clear();
10709 ++slot->requeue_seq;
20effc67 10710 return count;
11fdf7f2
TL
10711}
10712
10713void OSDShard::identify_splits_and_merges(
10714 const OSDMapRef& as_of_osdmap,
10715 set<pair<spg_t,epoch_t>> *split_pgs,
10716 set<pair<spg_t,epoch_t>> *merge_pgs)
10717{
10718 std::lock_guard l(shard_lock);
10719 if (shard_osdmap) {
10720 for (auto& i : pg_slots) {
10721 const spg_t& pgid = i.first;
10722 auto *slot = i.second.get();
10723 if (slot->pg) {
10724 osd->service.identify_splits_and_merges(
10725 shard_osdmap, as_of_osdmap, pgid,
10726 split_pgs, merge_pgs);
10727 } else if (!slot->waiting_for_split.empty()) {
10728 osd->service.identify_splits_and_merges(
10729 shard_osdmap, as_of_osdmap, pgid,
10730 split_pgs, nullptr);
10731 } else {
10732 dout(20) << __func__ << " slot " << pgid
9f95a23c 10733 << " has no pg and waiting_for_split " << dendl;
7c673cae 10734 }
11fdf7f2
TL
10735 }
10736 }
10737}
10738
10739void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10740 set<pair<spg_t,epoch_t>> *pgids)
10741{
10742 std::lock_guard l(shard_lock);
10743 _prime_splits(pgids);
10744 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10745 set<pair<spg_t,epoch_t>> newer_children;
10746 for (auto i : *pgids) {
10747 osd->service.identify_splits_and_merges(
10748 as_of_osdmap, shard_osdmap, i.first,
10749 &newer_children, nullptr);
10750 }
10751 newer_children.insert(pgids->begin(), pgids->end());
10752 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10753 << shard_osdmap->get_epoch() << ", new children " << newer_children
10754 << dendl;
10755 _prime_splits(&newer_children);
10756 // note: we don't care what is left over here for other shards.
10757 // if this shard is ahead of us and one isn't, e.g., one thread is
10758 // calling into prime_splits via _process (due to a newly created
10759 // pg) and this shard has a newer map due to a racing consume_map,
10760 // then any grandchildren left here will be identified (or were
10761 // identified) when the slower shard's osdmap is advanced.
10762 // _prime_splits() will tolerate the case where the pgid is
10763 // already primed.
10764 }
10765}
10766
10767void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10768{
10769 dout(10) << *pgids << dendl;
10770 auto p = pgids->begin();
10771 while (p != pgids->end()) {
10772 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10773 if (shard_index == shard_id) {
10774 auto r = pg_slots.emplace(p->first, nullptr);
10775 if (r.second) {
10776 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10777 r.first->second = make_unique<OSDShardPGSlot>();
10778 r.first->second->waiting_for_split.insert(p->second);
7c673cae 10779 } else {
11fdf7f2
TL
10780 auto q = r.first;
10781 ceph_assert(q != pg_slots.end());
10782 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10783 << dendl;
10784 q->second->waiting_for_split.insert(p->second);
7c673cae 10785 }
11fdf7f2
TL
10786 p = pgids->erase(p);
10787 } else {
10788 ++p;
7c673cae
FG
10789 }
10790 }
11fdf7f2
TL
10791}
10792
10793void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10794 set<pair<spg_t,epoch_t>> *merge_pgs)
10795{
10796 std::lock_guard l(shard_lock);
10797 dout(20) << __func__ << " checking shard " << shard_id
10798 << " for remaining merge pgs " << *merge_pgs << dendl;
10799 auto p = merge_pgs->begin();
10800 while (p != merge_pgs->end()) {
10801 spg_t pgid = p->first;
10802 epoch_t epoch = p->second;
10803 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10804 if (shard_index != shard_id) {
10805 ++p;
10806 continue;
10807 }
10808 OSDShardPGSlot *slot;
10809 auto r = pg_slots.emplace(pgid, nullptr);
10810 if (r.second) {
10811 r.first->second = make_unique<OSDShardPGSlot>();
10812 }
10813 slot = r.first->second.get();
10814 if (slot->pg) {
10815 // already have pg
10816 dout(20) << __func__ << " have merge participant pg " << pgid
10817 << " " << slot->pg << dendl;
10818 } else if (!slot->waiting_for_split.empty() &&
10819 *slot->waiting_for_split.begin() < epoch) {
10820 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10821 << " " << slot->waiting_for_split << dendl;
10822 } else {
10823 dout(20) << __func__ << " creating empty merge participant " << pgid
10824 << " for merge in " << epoch << dendl;
10825 // leave history zeroed; PG::merge_from() will fill it in.
10826 pg_history_t history;
10827 PGCreateInfo cinfo(pgid, epoch - 1,
10828 history, PastIntervals(), false);
10829 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10830 _attach_pg(r.first->second.get(), pg.get());
10831 _wake_pg_slot(pgid, slot);
10832 pg->unlock();
10833 }
10834 // mark slot for merge
10835 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10836 slot->waiting_for_merge_epoch = epoch;
10837 p = merge_pgs->erase(p);
7c673cae
FG
10838 }
10839}
10840
11fdf7f2 10841void OSDShard::register_and_wake_split_child(PG *pg)
7c673cae 10842{
20effc67 10843 dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
11fdf7f2
TL
10844 epoch_t epoch;
10845 {
10846 std::lock_guard l(shard_lock);
20effc67 10847 dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
11fdf7f2
TL
10848 auto p = pg_slots.find(pg->pg_id);
10849 ceph_assert(p != pg_slots.end());
10850 auto *slot = p->second.get();
20effc67
TL
10851 dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
10852 << slot->waiting_for_split << dendl;
11fdf7f2
TL
10853 ceph_assert(!slot->pg);
10854 ceph_assert(!slot->waiting_for_split.empty());
10855 _attach_pg(slot, pg);
10856
10857 epoch = pg->get_osdmap_epoch();
10858 ceph_assert(slot->waiting_for_split.count(epoch));
10859 slot->waiting_for_split.erase(epoch);
10860 if (slot->waiting_for_split.empty()) {
10861 _wake_pg_slot(pg->pg_id, slot);
10862 } else {
10863 dout(10) << __func__ << " still waiting for split on "
10864 << slot->waiting_for_split << dendl;
10865 }
7c673cae 10866 }
11fdf7f2
TL
10867
10868 // kick child to ensure it pulls up to the latest osdmap
10869 osd->enqueue_peering_evt(
10870 pg->pg_id,
10871 PGPeeringEventRef(
10872 std::make_shared<PGPeeringEvent>(
10873 epoch,
10874 epoch,
10875 NullEvt())));
10876
10877 std::lock_guard l{sdata_wait_lock};
10878 sdata_cond.notify_one();
7c673cae
FG
10879}
10880
11fdf7f2 10881void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
7c673cae 10882{
11fdf7f2
TL
10883 std::lock_guard l(shard_lock);
10884 vector<spg_t> to_delete;
10885 for (auto& i : pg_slots) {
10886 if (i.first != parent &&
10887 i.first.get_ancestor(old_pg_num) == parent) {
10888 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10889 << dendl;
10890 _wake_pg_slot(i.first, i.second.get());
10891 to_delete.push_back(i.first);
10892 }
10893 }
10894 for (auto pgid : to_delete) {
10895 pg_slots.erase(pgid);
10896 }
10897}
10898
a4b75251
TL
10899void OSDShard::update_scheduler_config()
10900{
10901 std::lock_guard l(shard_lock);
10902 scheduler->update_configuration();
10903}
10904
20effc67
TL
10905std::string OSDShard::get_scheduler_type()
10906{
10907 std::ostringstream scheduler_type;
10908 scheduler_type << *scheduler;
10909 return scheduler_type.str();
10910}
10911
9f95a23c
TL
10912OSDShard::OSDShard(
10913 int id,
10914 CephContext *cct,
10915 OSD *osd)
10916 : shard_id(id),
10917 cct(cct),
10918 osd(osd),
10919 shard_name(string("OSDShard.") + stringify(id)),
10920 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10921 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10922 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10923 shard_lock_name(shard_name + "::shard_lock"),
10924 shard_lock{make_mutex(shard_lock_name)},
f67539c2 10925 scheduler(ceph::osd::scheduler::make_scheduler(
39ae355f
TL
10926 cct, osd->whoami, osd->num_shards, id, osd->store->is_rotational(),
10927 osd->store->get_type(), osd->monc)),
9f95a23c
TL
10928 context_queue(sdata_wait_lock, sdata_cond)
10929{
10930 dout(0) << "using op scheduler " << *scheduler << dendl;
10931}
10932
11fdf7f2
TL
10933
10934// =============================================================
10935
10936#undef dout_context
10937#define dout_context osd->cct
10938#undef dout_prefix
10939#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10940
10941void OSD::ShardedOpWQ::_add_slot_waiter(
10942 spg_t pgid,
10943 OSDShardPGSlot *slot,
9f95a23c 10944 OpSchedulerItem&& qi)
11fdf7f2
TL
10945{
10946 if (qi.is_peering()) {
10947 dout(20) << __func__ << " " << pgid
10948 << " peering, item epoch is "
10949 << qi.get_map_epoch()
10950 << ", will wait on " << qi << dendl;
10951 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10952 } else {
10953 dout(20) << __func__ << " " << pgid
10954 << " item epoch is "
10955 << qi.get_map_epoch()
10956 << ", will wait on " << qi << dendl;
10957 slot->waiting.push_back(std::move(qi));
7c673cae
FG
10958 }
10959}
10960
10961#undef dout_prefix
10962#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10963
10964void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10965{
11fdf7f2
TL
10966 uint32_t shard_index = thread_index % osd->num_shards;
10967 auto& sdata = osd->shards[shard_index];
10968 ceph_assert(sdata);
10969
10970 // If every thread of a shard ran oncommit callbacks, they could
10971 // complete out of order. So, per shard, only the thread with the
10972 // smallest thread_index (thread_index < num_shards) runs the
10973 // oncommit callbacks.
10974 bool is_smallest_thread_index = thread_index < osd->num_shards;
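// Illustrative example (assumed values): with num_shards == 5 and 10 worker
// threads, thread_index % num_shards maps threads 0..9 onto shards 0..4, but
// only threads 0..4 satisfy thread_index < num_shards, so each shard has
// exactly one thread draining its context_queue for oncommit callbacks.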
7c673cae
FG
10975
10976 // peek at spg_t
11fdf7f2 10977 sdata->shard_lock.lock();
9f95a23c 10978 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10979 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10980 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10981 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10982 // we raced with a context_queue addition, don't wait
10983 wait_lock.unlock();
10984 } else if (!sdata->stop_waiting) {
10985 dout(20) << __func__ << " empty q, waiting" << dendl;
10986 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10987 sdata->shard_lock.unlock();
10988 sdata->sdata_cond.wait(wait_lock);
10989 wait_lock.unlock();
10990 sdata->shard_lock.lock();
9f95a23c 10991 if (sdata->scheduler->empty() &&
11fdf7f2
TL
10992 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10993 sdata->shard_lock.unlock();
10994 return;
10995 }
e306af50 10996 // found a work item; reapply default wq timeouts
11fdf7f2 10997 osd->cct->get_heartbeat_map()->reset_timeout(hb,
e306af50 10998 timeout_interval, suicide_interval);
11fdf7f2
TL
10999 } else {
11000 dout(20) << __func__ << " need return immediately" << dendl;
11001 wait_lock.unlock();
11002 sdata->shard_lock.unlock();
7c673cae
FG
11003 return;
11004 }
11005 }
11fdf7f2
TL
11006
11007 list<Context *> oncommits;
9f95a23c
TL
11008 if (is_smallest_thread_index) {
11009 sdata->context_queue.move_to(oncommits);
7c673cae 11010 }
11fdf7f2 11011
f67539c2
TL
11012 WorkItem work_item;
11013 while (!std::get_if<OpSchedulerItem>(&work_item)) {
11014 if (sdata->scheduler->empty()) {
11015 if (osd->is_stopping()) {
11016 sdata->shard_lock.unlock();
11017 for (auto c : oncommits) {
11018 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
11019 delete c;
11020 }
11021 return; // OSD shutdown, discard.
11022 }
11023 sdata->shard_lock.unlock();
11024 handle_oncommits(oncommits);
11025 return;
11026 }
11027
11028 work_item = sdata->scheduler->dequeue();
11fdf7f2
TL
11029 if (osd->is_stopping()) {
11030 sdata->shard_lock.unlock();
11031 for (auto c : oncommits) {
f67539c2
TL
11032 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
11033 delete c;
11fdf7f2
TL
11034 }
11035 return; // OSD shutdown, discard.
7c673cae 11036 }
7c673cae 11037
f67539c2
TL
11038 // If the work item is scheduled in the future, wait until
11039 // the time returned in the dequeue response before retrying.
11040 if (auto when_ready = std::get_if<double>(&work_item)) {
11041 if (is_smallest_thread_index) {
11042 sdata->shard_lock.unlock();
11043 handle_oncommits(oncommits);
2a845540 11044 sdata->shard_lock.lock();
f67539c2
TL
11045 }
11046 std::unique_lock wait_lock{sdata->sdata_wait_lock};
11047 auto future_time = ceph::real_clock::from_double(*when_ready);
11048 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
a4b75251
TL
11049 // Disable heartbeat timeout until we find a non-future work item to process.
11050 osd->cct->get_heartbeat_map()->clear_timeout(hb);
f67539c2
TL
11051 sdata->shard_lock.unlock();
11052 ++sdata->waiting_threads;
11053 sdata->sdata_cond.wait_until(wait_lock, future_time);
11054 --sdata->waiting_threads;
11055 wait_lock.unlock();
11056 sdata->shard_lock.lock();
a4b75251
TL
11057 // Reapply default wq timeouts
11058 osd->cct->get_heartbeat_map()->reset_timeout(hb,
11059 timeout_interval, suicide_interval);
2a845540
TL
11060 // Populate the oncommits list if there were any additions
11061 // to the context_queue while we were waiting
11062 if (is_smallest_thread_index) {
11063 sdata->context_queue.move_to(oncommits);
11064 }
f67539c2
TL
11065 }
11066 } // while
11067
11068 // Access the stored item
11069 auto item = std::move(std::get<OpSchedulerItem>(work_item));
11fdf7f2
TL
11070 if (osd->is_stopping()) {
11071 sdata->shard_lock.unlock();
11072 for (auto c : oncommits) {
11073 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
11074 delete c;
11075 }
11076 return; // OSD shutdown, discard.
11077 }
7c673cae 11078
11fdf7f2
TL
11079 const auto token = item.get_ordering_token();
11080 auto r = sdata->pg_slots.emplace(token, nullptr);
11081 if (r.second) {
11082 r.first->second = make_unique<OSDShardPGSlot>();
7c673cae 11083 }
11fdf7f2
TL
11084 OSDShardPGSlot *slot = r.first->second.get();
11085 dout(20) << __func__ << " " << token
11086 << (r.second ? " (new)" : "")
11087 << " to_process " << slot->to_process
11088 << " waiting " << slot->waiting
11089 << " waiting_peering " << slot->waiting_peering
11090 << dendl;
11091 slot->to_process.push_back(std::move(item));
11092 dout(20) << __func__ << " " << slot->to_process.back()
11093 << " queued" << dendl;
7c673cae 11094
11fdf7f2
TL
11095 retry_pg:
11096 PGRef pg = slot->pg;
7c673cae 11097
11fdf7f2
TL
11098 // lock pg (if we have it)
11099 if (pg) {
11100 // note the requeue seq now...
11101 uint64_t requeue_seq = slot->requeue_seq;
11102 ++slot->num_running;
7c673cae 11103
11fdf7f2
TL
11104 sdata->shard_lock.unlock();
11105 osd->service.maybe_inject_dispatch_delay();
11106 pg->lock();
11107 osd->service.maybe_inject_dispatch_delay();
11108 sdata->shard_lock.lock();
7c673cae 11109
11fdf7f2
TL
11110 auto q = sdata->pg_slots.find(token);
11111 if (q == sdata->pg_slots.end()) {
11112 // this can happen if we race with pg removal.
11113 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
11114 pg->unlock();
11115 sdata->shard_lock.unlock();
11116 handle_oncommits(oncommits);
11117 return;
11118 }
11119 slot = q->second.get();
11120 --slot->num_running;
7c673cae 11121
11fdf7f2
TL
11122 if (slot->to_process.empty()) {
11123 // raced with _wake_pg_slot or consume_map
11124 dout(20) << __func__ << " " << token
11125 << " nothing queued" << dendl;
7c673cae 11126 pg->unlock();
11fdf7f2
TL
11127 sdata->shard_lock.unlock();
11128 handle_oncommits(oncommits);
11129 return;
7c673cae 11130 }
11fdf7f2
TL
11131 if (requeue_seq != slot->requeue_seq) {
11132 dout(20) << __func__ << " " << token
11133 << " requeue_seq " << slot->requeue_seq << " > our "
11134 << requeue_seq << ", we raced with _wake_pg_slot"
11135 << dendl;
7c673cae 11136 pg->unlock();
11fdf7f2
TL
11137 sdata->shard_lock.unlock();
11138 handle_oncommits(oncommits);
11139 return;
7c673cae 11140 }
11fdf7f2
TL
11141 if (slot->pg != pg) {
11142 // this can happen if we race with pg removal.
11143 dout(20) << __func__ << " slot " << token << " no longer attached to "
11144 << pg << dendl;
7c673cae 11145 pg->unlock();
11fdf7f2 11146 goto retry_pg;
7c673cae 11147 }
7c673cae
FG
11148 }
11149
11fdf7f2
TL
11150 dout(20) << __func__ << " " << token
11151 << " to_process " << slot->to_process
11152 << " waiting " << slot->waiting
11153 << " waiting_peering " << slot->waiting_peering << dendl;
11154
11155 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
11156 suicide_interval);
11157
7c673cae 11158 // take next item
11fdf7f2
TL
11159 auto qi = std::move(slot->to_process.front());
11160 slot->to_process.pop_front();
11161 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
11162 set<pair<spg_t,epoch_t>> new_children;
11163 OSDMapRef osdmap;
7c673cae 11164
11fdf7f2 11165 while (!pg) {
7c673cae 11166 // should this pg shard exist on this osd in this (or a later) epoch?
11fdf7f2
TL
11167 osdmap = sdata->shard_osdmap;
11168 const PGCreateInfo *create_info = qi.creates_pg();
11169 if (!slot->waiting_for_split.empty()) {
11170 dout(20) << __func__ << " " << token
11171 << " splitting " << slot->waiting_for_split << dendl;
11172 _add_slot_waiter(token, slot, std::move(qi));
11173 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
11174 dout(20) << __func__ << " " << token
11175 << " map " << qi.get_map_epoch() << " > "
11176 << osdmap->get_epoch() << dendl;
11177 _add_slot_waiter(token, slot, std::move(qi));
11178 } else if (qi.is_peering()) {
11179 if (!qi.peering_requires_pg()) {
11180 // for pg-less events, we run them under the ordering lock, since
11181 // we don't have the pg lock to keep them ordered.
11182 qi.run(osd, sdata, pg, tp_handle);
11183 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11184 if (create_info) {
11185 if (create_info->by_mon &&
11186 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
11187 dout(20) << __func__ << " " << token
11188 << " no pg, no longer primary, ignoring mon create on "
11189 << qi << dendl;
11190 } else {
11191 dout(20) << __func__ << " " << token
11192 << " no pg, should create on " << qi << dendl;
11193 pg = osd->handle_pg_create_info(osdmap, create_info);
11194 if (pg) {
11195 // we created the pg! drop out and continue "normally"!
11196 sdata->_attach_pg(slot, pg.get());
11197 sdata->_wake_pg_slot(token, slot);
11198
11199 // identify split children between create epoch and shard epoch.
11200 osd->service.identify_splits_and_merges(
11201 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
11202 sdata->_prime_splits(&new_children);
11203 // distribute remaining split children to other shards below!
11204 break;
11205 }
11206 dout(20) << __func__ << " ignored create on " << qi << dendl;
11207 }
11208 } else {
11209 dout(20) << __func__ << " " << token
11210 << " no pg, peering, !create, discarding " << qi << dendl;
11211 }
11212 } else {
11213 dout(20) << __func__ << " " << token
11214 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11215 << ", discarding " << qi
11216 << dendl;
11217 }
11218 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11219 dout(20) << __func__ << " " << token
11220 << " no pg, should exist e" << osdmap->get_epoch()
11221 << ", will wait on " << qi << dendl;
11222 _add_slot_waiter(token, slot, std::move(qi));
7c673cae 11223 } else {
11fdf7f2
TL
11224 dout(20) << __func__ << " " << token
11225 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11226 << ", dropping " << qi << dendl;
7c673cae 11227 // share map with client?
9f95a23c
TL
11228 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11229 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11230 sdata->shard_osdmap,
11231 (*_op)->sent_epoch);
7c673cae 11232 }
11fdf7f2 11233 unsigned pushes_to_free = qi.get_reserved_pushes();
7c673cae 11234 if (pushes_to_free > 0) {
11fdf7f2 11235 sdata->shard_lock.unlock();
7c673cae 11236 osd->service.release_reserved_pushes(pushes_to_free);
11fdf7f2 11237 handle_oncommits(oncommits);
7c673cae
FG
11238 return;
11239 }
11240 }
11fdf7f2
TL
11241 sdata->shard_lock.unlock();
11242 handle_oncommits(oncommits);
7c673cae
FG
11243 return;
11244 }
11fdf7f2
TL
11245 if (qi.is_peering()) {
11246 OSDMapRef osdmap = sdata->shard_osdmap;
11247 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11248 _add_slot_waiter(token, slot, std::move(qi));
11249 sdata->shard_lock.unlock();
11250 pg->unlock();
11251 handle_oncommits(oncommits);
11252 return;
11253 }
11254 }
11255 sdata->shard_lock.unlock();
7c673cae 11256
11fdf7f2
TL
11257 if (!new_children.empty()) {
11258 for (auto shard : osd->shards) {
11259 shard->prime_splits(osdmap, &new_children);
11260 }
11261 ceph_assert(new_children.empty());
11262 }
7c673cae
FG
11263
11264 // osd_opwq_process marks the point at which an operation has been dequeued
11265 // and will begin to be handled by a worker thread.
11266 {
11267#ifdef WITH_LTTNG
11268 osd_reqid_t reqid;
9f95a23c 11269 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11270 reqid = (*_op)->get_reqid();
11271 }
11272#endif
11273 tracepoint(osd, opwq_process_start, reqid.name._type,
11274 reqid.name._num, reqid.tid, reqid.inc);
11275 }
11276
11277 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11278 Formatter *f = Formatter::create("json");
11279 f->open_object_section("q");
11280 dump(f);
11281 f->close_section();
11282 f->flush(*_dout);
11283 delete f;
11284 *_dout << dendl;
11285
11fdf7f2 11286 qi.run(osd, sdata, pg, tp_handle);
7c673cae
FG
11287
11288 {
11289#ifdef WITH_LTTNG
11290 osd_reqid_t reqid;
9f95a23c 11291 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
7c673cae
FG
11292 reqid = (*_op)->get_reqid();
11293 }
11294#endif
11295 tracepoint(osd, opwq_process_finish, reqid.name._type,
11296 reqid.name._num, reqid.tid, reqid.inc);
11297 }
11298
11fdf7f2 11299 handle_oncommits(oncommits);
7c673cae
FG
11300}
11301
9f95a23c 11302void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
1d09f67e
TL
11303 if (unlikely(m_fast_shutdown) ) {
11304 // stop enqueueing when we are in the middle of a fast shutdown
11305 return;
11306 }
11307
7c673cae 11308 uint32_t shard_index =
11fdf7f2 11309 item.get_ordering_token().hash_to_shard(osd->shards.size());
7c673cae 11310
11fdf7f2 11311 OSDShard* sdata = osd->shards[shard_index];
7c673cae 11312 assert (NULL != sdata);
20effc67
TL
11313 if (sdata->get_scheduler_type() == "mClockScheduler") {
11314 item.maybe_set_is_qos_item();
11315 }
11316
11317 dout(20) << __func__ << " " << item << dendl;
7c673cae 11318
9f95a23c
TL
11319 bool empty = true;
11320 {
11321 std::lock_guard l{sdata->shard_lock};
11322 empty = sdata->scheduler->empty();
11323 sdata->scheduler->enqueue(std::move(item));
11324 }
7c673cae 11325
f67539c2 11326 {
9f95a23c 11327 std::lock_guard l{sdata->sdata_wait_lock};
f67539c2
TL
11328 if (empty) {
11329 sdata->sdata_cond.notify_all();
11330 } else if (sdata->waiting_threads) {
11331 sdata->sdata_cond.notify_one();
11332 }
9f95a23c 11333 }
7c673cae
FG
11334}
11335
9f95a23c 11336void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
7c673cae 11337{
1d09f67e
TL
11338 if (unlikely(m_fast_shutdown) ) {
11339 // stop enqueueing when we are in the middle of a fast shutdown
11340 return;
11341 }
11342
11fdf7f2
TL
11343 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11344 auto& sdata = osd->shards[shard_index];
11345 ceph_assert(sdata);
11346 sdata->shard_lock.lock();
11347 auto p = sdata->pg_slots.find(item.get_ordering_token());
11348 if (p != sdata->pg_slots.end() &&
11349 !p->second->to_process.empty()) {
7c673cae 11350 // we may be racing with _process, which has dequeued a new item
9f95a23c 11351 // from scheduler, put it on to_process, and is now busy taking the
7c673cae
FG
11352 // pg lock. ensure this old requeued item is ordered before any
11353 // such newer item in to_process.
11fdf7f2
TL
11354 p->second->to_process.push_front(std::move(item));
11355 item = std::move(p->second->to_process.back());
11356 p->second->to_process.pop_back();
11357 dout(20) << __func__
11358 << " " << p->second->to_process.front()
11359 << " shuffled w/ " << item << dendl;
7c673cae 11360 } else {
11fdf7f2 11361 dout(20) << __func__ << " " << item << dendl;
7c673cae 11362 }
9f95a23c 11363 sdata->scheduler->enqueue_front(std::move(item));
11fdf7f2
TL
11364 sdata->shard_lock.unlock();
11365 std::lock_guard l{sdata->sdata_wait_lock};
11366 sdata->sdata_cond.notify_one();
7c673cae
FG
11367}
11368
1d09f67e
TL
11369void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11370{
11371 uint32_t shard_index = 0;
11372 m_fast_shutdown = true;
11373
11374 for (; shard_index < osd->num_shards; shard_index++) {
11375 auto& sdata = osd->shards[shard_index];
11376 ceph_assert(sdata);
11377 sdata->shard_lock.lock();
11378 int work_count = 0;
11379 while(! sdata->scheduler->empty() ) {
11380 auto work_item = sdata->scheduler->dequeue();
11381 work_count++;
11382 }
11383 sdata->shard_lock.unlock();
11384 }
11385}
11386
f67539c2 11387namespace ceph::osd_cmds {
7c673cae 11388
2a845540
TL
11389int heap(CephContext& cct,
11390 const cmdmap_t& cmdmap,
11391 std::ostream& outos,
11392 std::ostream& erros)
7c673cae
FG
11393{
11394 if (!ceph_using_tcmalloc()) {
2a845540 11395 erros << "could not issue heap profiler command -- not using tcmalloc!";
7c673cae
FG
11396 return -EOPNOTSUPP;
11397 }
f67539c2 11398
7c673cae 11399 string cmd;
9f95a23c 11400 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
2a845540 11401 erros << "unable to get value for command \"" << cmd << "\"";
7c673cae 11402 return -EINVAL;
11fdf7f2 11403 }
f67539c2 11404
7c673cae
FG
11405 std::vector<std::string> cmd_vec;
11406 get_str_vec(cmd, cmd_vec);
11fdf7f2
TL
11407
11408 string val;
9f95a23c 11409 if (cmd_getval(cmdmap, "value", val)) {
11fdf7f2
TL
11410 cmd_vec.push_back(val);
11411 }
f67539c2 11412
2a845540 11413 ceph_heap_profiler_handle_command(cmd_vec, outos);
f67539c2 11414
7c673cae
FG
11415 return 0;
11416}
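// Illustrative usage (assumed invocation): this handler typically backs an
// admin command along the lines of
//   ceph tell osd.0 heap stats
// where the subcommand arrives in "heapcmd" and an optional extra argument in
// "value", as parsed above; it returns -EOPNOTSUPP unless tcmalloc is in use.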
f67539c2
TL
11417
11418} // namespace ceph::osd_cmds