1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
27 #include <boost/range/adaptor/reversed.hpp>
29 #ifdef HAVE_SYS_PARAM_H
30 #include <sys/param.h>
33 #ifdef HAVE_SYS_MOUNT_H
34 #include <sys/mount.h>
38 #include "osd/scrubber/scrub_machine.h"
39 #include "osd/scrubber/pg_scrubber.h"
41 #include "include/types.h"
42 #include "include/compat.h"
43 #include "include/random.h"
44 #include "include/scope_guard.h"
49 #include "osdc/Objecter.h"
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
61 #include "os/ObjectStore.h"
63 #include "os/FuseStore.h"
66 #include "PrimaryLogPG.h"
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
71 #include "mon/MonClient.h"
73 #include "messages/MLog.h"
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery2.h"
96 #include "messages/MOSDPGLog.h"
97 #include "messages/MOSDPGRemove.h"
98 #include "messages/MOSDPGInfo.h"
99 #include "messages/MOSDPGInfo2.h"
100 #include "messages/MOSDPGCreate2.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
112 #include "messages/MOSDPeeringOp.h"
114 #include "messages/MOSDAlive.h"
116 #include "messages/MOSDScrub2.h"
118 #include "messages/MCommand.h"
119 #include "messages/MCommandReply.h"
121 #include "messages/MPGStats.h"
123 #include "messages/MMonGetPurgedSnaps.h"
124 #include "messages/MMonGetPurgedSnapsReply.h"
126 #include "common/perf_counters.h"
127 #include "common/Timer.h"
128 #include "common/LogClient.h"
129 #include "common/AsyncReserver.h"
130 #include "common/HeartbeatMap.h"
131 #include "common/admin_socket.h"
132 #include "common/ceph_context.h"
134 #include "global/signal_handler.h"
135 #include "global/pidfile.h"
137 #include "include/color.h"
138 #include "perfglue/cpu_profiler.h"
139 #include "perfglue/heap_profiler.h"
141 #include "osd/ClassHandler.h"
142 #include "osd/OpRequest.h"
144 #include "auth/AuthAuthorizeHandler.h"
145 #include "auth/RotatingKeyRing.h"
147 #include "objclass/objclass.h"
149 #include "common/cmdparse.h"
150 #include "include/str_list.h"
151 #include "include/util.h"
153 #include "include/ceph_assert.h"
154 #include "common/config.h"
155 #include "common/EventTrace.h"
157 #include "json_spirit/json_spirit_reader.h"
158 #include "json_spirit/json_spirit_writer.h"
161 #define TRACEPOINT_DEFINE
162 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
163 #include "tracing/osd.h"
164 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
165 #undef TRACEPOINT_DEFINE
167 #define tracepoint(...)
170 #include "osd_tracer.h"
173 #define dout_context cct
174 #define dout_subsys ceph_subsys_osd
176 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
180 using std::lock_guard
;
181 using std::make_pair
;
182 using std::make_tuple
;
183 using std::make_unique
;
186 using std::ostringstream
;
190 using std::stringstream
;
191 using std::to_string
;
192 using std::unique_ptr
;
195 using ceph::bufferlist
;
196 using ceph::bufferptr
;
199 using ceph::fixed_u_to_string
;
200 using ceph::Formatter
;
201 using ceph::heartbeat_handle_d
;
202 using ceph::make_mutex
;
204 using namespace ceph::osd::scheduler
;
205 using TOPNSPC::common::cmd_getval
;
206 using TOPNSPC::common::cmd_getval_or
;
// Debug-log prefix used by the dout_prefix macro above: every log line from
// this file is prepended with "osd.<id> <epoch> ".
static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout
    << "osd." << whoami
    << " " << epoch
    << " ";
}
213 //Initial features in new superblock.
214 //Features here are also automatically upgraded
215 CompatSet
OSD::get_osd_initial_compat_set() {
216 CompatSet::FeatureSet ceph_osd_feature_compat
;
217 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
218 CompatSet::FeatureSet ceph_osd_feature_incompat
;
219 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
220 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
221 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
222 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
223 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
224 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
225 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
226 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
227 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
228 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
229 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
230 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
231 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
232 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
233 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
234 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
235 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
236 ceph_osd_feature_incompat
);
239 //Features are added here that this OSD supports.
240 CompatSet
OSD::get_osd_compat_set() {
241 CompatSet compat
= get_osd_initial_compat_set();
242 //Any features here can be set in code, but not in initial superblock
243 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
247 OSDService::OSDService(OSD
*osd
, ceph::async::io_context_pool
& poolctx
) :
250 whoami(osd
->whoami
), store(osd
->store
.get()),
251 log_client(osd
->log_client
), clog(osd
->clog
),
252 pg_recovery_stats(osd
->pg_recovery_stats
),
253 cluster_messenger(osd
->cluster_messenger
),
254 client_messenger(osd
->client_messenger
),
256 recoverystate_perf(osd
->recoverystate_perf
),
258 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
259 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
260 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
261 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
262 m_scrub_queue
{cct
, *this},
263 agent_valid_iterator(false),
265 flush_mode_high_count(0),
268 agent_stop_flag(false),
269 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
270 last_recalibrate(ceph_clock_now()),
271 promote_max_objects(0),
272 promote_max_bytes(0),
274 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
275 osd
->objecter_messenger
,
276 osd
->monc
, poolctx
)),
277 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
278 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
280 recovery_request_timer(cct
, recovery_request_lock
, false),
281 sleep_timer(cct
, sleep_lock
, false),
282 reserver_finisher(cct
),
283 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
284 cct
->_conf
->osd_min_recovery_priority
),
285 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
286 cct
->_conf
->osd_min_recovery_priority
),
287 snap_reserver(cct
, &reserver_finisher
,
288 cct
->_conf
->osd_max_trimming_pgs
),
289 recovery_ops_active(0),
290 recovery_ops_reserved(0),
291 recovery_paused(false),
292 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
293 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
294 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
296 cur_ratio(0), physical_ratio(0),
297 boot_epoch(0), up_epoch(0), bind_epoch(0)
301 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
303 str
<< "objecter-finisher-" << i
;
304 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
305 objecter_finishers
.push_back(std::move(fin
));
310 void OSDService::add_pgid(spg_t pgid
, PG
*pg
) {
311 std::lock_guard
l(pgid_lock
);
312 if (!pgid_tracker
.count(pgid
)) {
315 pgid_tracker
[pgid
]++;
317 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
319 std::lock_guard
l(pgid_lock
);
320 ceph_assert(pgid_tracker
.count(pgid
));
321 ceph_assert(pgid_tracker
[pgid
] > 0);
322 pgid_tracker
[pgid
]--;
323 if (pgid_tracker
[pgid
] == 0) {
324 pgid_tracker
.erase(pgid
);
325 live_pgs
.erase(pgid
);
328 void OSDService::dump_live_pgids()
330 std::lock_guard
l(pgid_lock
);
331 derr
<< "live pgids:" << dendl
;
332 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
333 i
!= pgid_tracker
.cend();
335 derr
<< "\t" << *i
<< dendl
;
336 live_pgs
[i
->first
]->dump_live_ids();
// Monotonic "now" expressed as a span since this OSD process started
// (mono_clock minus the recorded startup_time).
ceph::signedspan OSDService::get_mnow() const
{
  return ceph::mono_clock::now() - osd->startup_time;
}
347 void OSDService::identify_splits_and_merges(
351 set
<pair
<spg_t
,epoch_t
>> *split_children
,
352 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
354 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
355 << " to e" << new_map
->get_epoch() << dendl
;
356 if (!old_map
->have_pg_pool(pgid
.pool())) {
357 dout(20) << __func__
<< " " << pgid
<< " pool " << pgid
.pool()
358 << " does not exist in old map" << dendl
;
361 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
362 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
363 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
364 dout(20) << __func__
<< " " << pgid
<< " pool " << pgid
.pool()
365 << " has no history" << dendl
;
368 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
369 << " to e" << new_map
->get_epoch()
370 << " pg_nums " << p
->second
<< dendl
;
372 queue
.push_back(pgid
);
374 while (!queue
.empty()) {
375 auto cur
= queue
.front();
378 unsigned pgnum
= old_pgnum
;
379 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
380 q
!= p
->second
.end() &&
381 q
->first
<= new_map
->get_epoch();
383 if (pgnum
< q
->second
) {
385 if (cur
.ps() < pgnum
) {
387 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
388 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
389 << " pg_num " << pgnum
<< " -> " << q
->second
390 << " children " << children
<< dendl
;
391 for (auto i
: children
) {
392 split_children
->insert(make_pair(i
, q
->first
));
397 } else if (cur
.ps() < q
->second
) {
398 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
399 << " pg_num " << pgnum
<< " -> " << q
->second
400 << " is a child" << dendl
;
401 // normally we'd capture this from the parent, but it's
402 // possible the parent doesn't exist yet (it will be
403 // fabricated to allow an intervening merge). note this PG
404 // as a split child here to be sure we catch it.
405 split_children
->insert(make_pair(cur
, q
->first
));
407 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
408 << " pg_num " << pgnum
<< " -> " << q
->second
409 << " is post-split, skipping" << dendl
;
411 } else if (merge_pgs
) {
413 if (cur
.ps() >= q
->second
) {
414 if (cur
.ps() < pgnum
) {
416 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
418 parent
.is_split(q
->second
, pgnum
, &children
);
419 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
420 << " pg_num " << pgnum
<< " -> " << q
->second
421 << " is merge source, target " << parent
422 << ", source(s) " << children
<< dendl
;
423 merge_pgs
->insert(make_pair(parent
, q
->first
));
424 if (!did
.count(parent
)) {
425 // queue (and re-scan) parent in case it might not exist yet
426 // and there are some future splits pending on it
427 queue
.push_back(parent
);
429 for (auto c
: children
) {
430 merge_pgs
->insert(make_pair(c
, q
->first
));
436 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
437 << " pg_num " << pgnum
<< " -> " << q
->second
438 << " is beyond old pgnum, skipping" << dendl
;
442 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
443 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
444 << " pg_num " << pgnum
<< " -> " << q
->second
445 << " is merge target, source " << children
<< dendl
;
446 for (auto c
: children
) {
447 merge_pgs
->insert(make_pair(c
, q
->first
));
451 merge_pgs
->insert(make_pair(cur
, q
->first
));
// Thin forwarder: ask the owning OSD to recompute its heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
465 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
467 std::lock_guard
l(hb_stamp_lock
);
468 if (peer
>= hb_stamps
.size()) {
469 hb_stamps
.resize(peer
+ 1);
471 if (!hb_stamps
[peer
]) {
472 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
474 return hb_stamps
[peer
];
477 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
479 osd
->enqueue_peering_evt(
482 std::make_shared
<PGPeeringEvent
>(
// First phase of shutdown: stop the timers so no new deferred work can fire
// while the OSD tears down. Each timer's lock is held only for the duration
// of its shutdown() call.
// NOTE(review): the inner scope braces were lost in extraction and have been
// restored per the per-timer lock_guard pattern — confirm against upstream.
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }
  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }
  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
// Drain any queued reservation callbacks, then stop the finisher thread.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
511 void OSDService::shutdown()
513 mono_timer
.suspend();
516 std::lock_guard
l(watch_lock
);
517 watch_timer
.shutdown();
520 objecter
->shutdown();
521 for (auto& f
: objecter_finishers
) {
526 publish_map(OSDMapRef());
527 next_osdmap
= OSDMapRef();
530 void OSDService::init()
532 reserver_finisher
.start();
533 for (auto& f
: objecter_finishers
) {
536 objecter
->set_client_incarnation(0);
538 // deprioritize objecter in daemonperf output
539 objecter
->get_logger()->set_prio_adjust(-3);
545 agent_thread
.create("osd_srv_agent");
547 if (cct
->_conf
->osd_recovery_delay_start
)
548 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
// Late initialization: start the objecter once the initial osdmap is known.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
556 void OSDService::activate_map()
558 // wake/unwake the tiering agent
559 std::lock_guard l
{agent_lock
};
561 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
563 agent_cond
.notify_all();
566 OSDMapRef
OSDService::get_nextmap_reserved() {
567 std::lock_guard
l(pre_publish_lock
);
569 epoch_t e
= next_osdmap
->get_epoch();
571 std::map
<epoch_t
, unsigned>::iterator i
=
572 map_reservations
.insert(std::make_pair(e
, 0)).first
;
574 dout(20) << __func__
<< " map_reservations: " << map_reservations
<< dendl
;
578 /// releases reservation on map
579 void OSDService::release_map(OSDMapRef osdmap
) {
580 std::lock_guard
l(pre_publish_lock
);
581 dout(20) << __func__
<< " epoch: " << osdmap
->get_epoch() << dendl
;
582 std::map
<epoch_t
, unsigned>::iterator i
=
583 map_reservations
.find(osdmap
->get_epoch());
584 ceph_assert(i
!= map_reservations
.end());
585 ceph_assert(i
->second
> 0);
586 if (--(i
->second
) == 0) {
587 map_reservations
.erase(i
);
589 if (pre_publish_waiter
) {
590 dout(20) << __func__
<< " notify all." << dendl
;
591 pre_publish_cond
.notify_all();
595 /// blocks until there are no reserved maps prior to next_osdmap
596 void OSDService::await_reserved_maps() {
597 std::unique_lock l
{pre_publish_lock
};
598 dout(20) << __func__
<< " epoch:" << next_osdmap
->get_epoch() << dendl
;
600 ceph_assert(next_osdmap
);
601 pre_publish_waiter
++;
602 pre_publish_cond
.wait(l
, [this] {
603 auto i
= map_reservations
.cbegin();
604 return (i
== map_reservations
.cend() ||
605 i
->first
>= next_osdmap
->get_epoch());
607 pre_publish_waiter
--;
608 dout(20) << __func__
<< " done " << pre_publish_waiter
<< dendl
;
// Request osdmaps starting at epoch e via the OSD's monitor subscription
// helper.
// NOTE(review): the second argument looks like a force/refresh flag — confirm
// against OSD::osdmap_subscribe's declaration.
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
617 class AgentTimeoutCB
: public Context
{
620 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
621 void finish(int) override
{
622 pg
->agent_choose_mode_restart();
626 void OSDService::agent_entry()
628 dout(10) << __func__
<< " start" << dendl
;
629 std::unique_lock agent_locker
{agent_lock
};
631 while (!agent_stop_flag
) {
632 if (agent_queue
.empty()) {
633 dout(20) << __func__
<< " empty queue" << dendl
;
634 agent_cond
.wait(agent_locker
);
637 uint64_t level
= agent_queue
.rbegin()->first
;
638 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
640 << " tiers " << agent_queue
.size()
641 << ", top is " << level
642 << " with pgs " << top
.size()
643 << ", ops " << agent_ops
<< "/"
644 << cct
->_conf
->osd_agent_max_ops
645 << (agent_active
? " active" : " NOT ACTIVE")
647 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
648 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
649 int agent_flush_quota
= max
;
650 if (!flush_mode_high_count
)
651 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
652 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
653 agent_cond
.wait(agent_locker
);
657 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
658 agent_queue_pos
= top
.begin();
659 agent_valid_iterator
= true;
661 PGRef pg
= *agent_queue_pos
;
662 dout(10) << "high_count " << flush_mode_high_count
663 << " agent_ops " << agent_ops
664 << " flush_quota " << agent_flush_quota
<< dendl
;
665 agent_locker
.unlock();
666 if (!pg
->agent_work(max
, agent_flush_quota
)) {
667 dout(10) << __func__
<< " " << pg
->pg_id
668 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
669 << " seconds" << dendl
;
671 logger
->inc(l_osd_tier_delay
);
672 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
673 std::lock_guard timer_locker
{agent_timer_lock
};
674 Context
*cb
= new AgentTimeoutCB(pg
);
675 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
679 dout(10) << __func__
<< " finish" << dendl
;
682 void OSDService::agent_stop()
685 std::lock_guard
l(agent_lock
);
687 // By this time all ops should be cancelled
688 ceph_assert(agent_ops
== 0);
689 // By this time all PGs are shutdown and dequeued
690 if (!agent_queue
.empty()) {
691 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
692 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
693 ceph_abort_msg("agent queue not empty");
696 agent_stop_flag
= true;
697 agent_cond
.notify_all();
702 // -------------------------------------
704 void OSDService::promote_throttle_recalibrate()
706 utime_t now
= ceph_clock_now();
707 double dur
= now
- last_recalibrate
;
708 last_recalibrate
= now
;
709 unsigned prob
= promote_probability_millis
;
711 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
712 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
714 unsigned min_prob
= 1;
716 uint64_t attempts
, obj
, bytes
;
717 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
718 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
719 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
720 << target_obj_sec
<< " obj/sec or "
721 << byte_u_t(target_bytes_sec
) << "/sec"
724 // calculate what the probability *should* be, given the targets
726 if (attempts
&& dur
> 0) {
727 uint64_t avg_size
= 1;
729 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
730 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
731 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
733 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
734 << avg_size
<< dendl
;
735 if (target_obj_sec
&& target_bytes_sec
)
736 new_prob
= std::min(po
, pb
);
737 else if (target_obj_sec
)
739 else if (target_bytes_sec
)
746 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
748 // correct for persistent skew between target rate and actual rate, adjust
751 if (attempts
&& obj
) {
752 actual
= obj
* 1000 / attempts
;
753 ratio
= (double)actual
/ (double)prob
;
754 new_prob
= (double)new_prob
/ ratio
;
756 new_prob
= std::max(new_prob
, min_prob
);
757 new_prob
= std::min(new_prob
, 1000u);
760 prob
= (prob
+ new_prob
) / 2;
761 prob
= std::max(prob
, min_prob
);
762 prob
= std::min(prob
, 1000u);
763 dout(10) << __func__
<< " actual " << actual
764 << ", actual/prob ratio " << ratio
765 << ", adjusted new_prob " << new_prob
766 << ", prob " << promote_probability_millis
<< " -> " << prob
768 promote_probability_millis
= prob
;
770 // set hard limits for this interval to mitigate stampedes
771 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
772 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
775 // -------------------------------------
777 float OSDService::get_failsafe_full_ratio()
779 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
780 if (full_ratio
> 1.0) full_ratio
/= 100.0;
784 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
786 // The OSDMap ratios take precendence. So if the failsafe is .95 and
787 // the admin sets the cluster full to .96, the failsafe moves up to .96
788 // too. (Not that having failsafe == full is ideal, but it's better than
789 // dropping writes before the clusters appears full.)
790 OSDMapRef osdmap
= get_osdmap();
791 if (!osdmap
|| osdmap
->get_epoch() == 0) {
794 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
795 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
796 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
797 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
799 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
800 // use the failsafe for nearfull and full; the mon isn't using the
801 // flags anyway because we're mid-upgrade.
802 full_ratio
= failsafe_ratio
;
803 backfillfull_ratio
= failsafe_ratio
;
804 nearfull_ratio
= failsafe_ratio
;
805 } else if (full_ratio
<= 0 ||
806 backfillfull_ratio
<= 0 ||
807 nearfull_ratio
<= 0) {
808 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
809 // use failsafe flag. ick. the monitor did something wrong or the user
810 // did something stupid.
811 full_ratio
= failsafe_ratio
;
812 backfillfull_ratio
= failsafe_ratio
;
813 nearfull_ratio
= failsafe_ratio
;
816 if (injectfull_state
> NONE
&& injectfull
) {
817 inject
= "(Injected)";
818 return injectfull_state
;
819 } else if (pratio
> failsafe_ratio
) {
821 } else if (ratio
> full_ratio
) {
823 } else if (ratio
> backfillfull_ratio
) {
825 } else if (pratio
> nearfull_ratio
) {
831 void OSDService::check_full_status(float ratio
, float pratio
)
833 std::lock_guard
l(full_status_lock
);
836 physical_ratio
= pratio
;
840 new_state
= recalc_full_state(ratio
, pratio
, inject
);
842 dout(20) << __func__
<< " cur ratio " << ratio
843 << ", physical ratio " << pratio
844 << ", new state " << get_full_state_name(new_state
)
849 if (cur_state
!= new_state
) {
850 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
851 << " -> " << get_full_state_name(new_state
) << dendl
;
852 if (new_state
== FAILSAFE
) {
853 clog
->error() << "full status failsafe engaged, dropping updates, now "
854 << (int)roundf(ratio
* 100) << "% full";
855 } else if (cur_state
== FAILSAFE
) {
856 clog
->error() << "full status failsafe disengaged, no longer dropping "
857 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
859 cur_state
= new_state
;
863 bool OSDService::need_fullness_update()
865 OSDMapRef osdmap
= get_osdmap();
867 if (osdmap
->exists(whoami
)) {
868 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
870 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
872 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
879 else if (is_backfillfull())
881 else if (is_nearfull())
886 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
888 if (injectfull
&& injectfull_state
>= type
) {
889 // injectfull is either a count of the number of times to return failsafe full
890 // or if -1 then always return full
893 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
894 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
901 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
903 std::lock_guard
l(full_status_lock
);
905 if (_check_inject_full(dpp
, type
))
908 if (cur_state
>= type
)
909 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
910 << " physical " << physical_ratio
<< dendl
;
912 return cur_state
>= type
;
915 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
917 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
919 std::lock_guard
l(full_status_lock
);
920 if (_check_inject_full(dpp
, type
)) {
926 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
929 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
931 if (tentative_state
>= type
)
932 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
934 return tentative_state
>= type
;
// True when usage is at or beyond the failsafe threshold (or injected).
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
// True when usage is at or beyond the cluster "full" threshold (or injected).
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
// Would this OSD be backfill-full if `adjust_used` additional bytes were
// consumed on top of `stats`? Delegates to _tentative_full.
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
// True when usage is at or beyond the backfill-full threshold (or injected).
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
// True when usage is at or beyond the nearfull threshold (or injected).
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
// Cached-state accessor: exactly at FAILSAFE (guarded by full_status_lock).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
// Cached-state accessor: FULL or worse (guarded by full_status_lock).
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
// Cached-state accessor: BACKFILLFULL or worse (guarded by full_status_lock).
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
// Cached-state accessor: NEARFULL or worse (guarded by full_status_lock).
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
986 void OSDService::set_injectfull(s_names type
, int64_t count
)
988 std::lock_guard
l(full_status_lock
);
989 injectfull_state
= type
;
993 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
994 osd_alert_list_t
& alerts
)
996 uint64_t bytes
= stbuf
.total
;
997 uint64_t avail
= stbuf
.available
;
998 uint64_t used
= stbuf
.get_used_raw();
1000 // For testing fake statfs values so it doesn't matter if all
1001 // OSDs are using the same partition.
1002 if (cct
->_conf
->fake_statfs_for_testing
) {
1003 uint64_t total_num_bytes
= 0;
1005 osd
->_get_pgs(&pgs
);
1006 for (auto p
: pgs
) {
1007 total_num_bytes
+= p
->get_stats_num_bytes();
1009 bytes
= cct
->_conf
->fake_statfs_for_testing
;
1010 if (total_num_bytes
< bytes
)
1011 avail
= bytes
- total_num_bytes
;
1014 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
1015 << " adjust available " << avail
1017 used
= bytes
- avail
;
1020 logger
->set(l_osd_stat_bytes
, bytes
);
1021 logger
->set(l_osd_stat_bytes_used
, used
);
1022 logger
->set(l_osd_stat_bytes_avail
, avail
);
1024 std::lock_guard
l(stat_lock
);
1025 osd_stat
.statfs
= stbuf
;
1026 osd_stat
.os_alerts
.clear();
1027 osd_stat
.os_alerts
[whoami
].swap(alerts
);
1028 if (cct
->_conf
->fake_statfs_for_testing
) {
1029 osd_stat
.statfs
.total
= bytes
;
1030 osd_stat
.statfs
.available
= avail
;
1031 // For testing don't want used to go negative, so clear reserved
1032 osd_stat
.statfs
.internally_reserved
= 0;
1036 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
1039 utime_t now
= ceph_clock_now();
1040 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
1041 std::lock_guard
l(stat_lock
);
1042 osd_stat
.hb_peers
.swap(hb_peers
);
1043 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
1044 osd_stat
.num_pgs
= num_pgs
;
1045 // Clean entries that aren't updated
1046 // This is called often enough that we can just remove 1 at a time
1047 for (auto i
: osd_stat
.hb_pingtime
) {
1048 if (i
.second
.last_update
== 0)
1050 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
1051 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
1052 << " last_update " << i
.second
.last_update
<< dendl
;
1053 osd_stat
.hb_pingtime
.erase(i
.first
);
// Bump the cumulative count of shards this OSD has repaired (reported via
// osd_stat, guarded by stat_lock).
void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
}
1067 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
1068 uint64_t adjust_used
)
1071 ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1074 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1075 if (new_stat
.statfs
.available
> adjust_used
)
1076 new_stat
.statfs
.available
-= adjust_used
;
1078 new_stat
.statfs
.available
= 0;
1079 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1082 // Check all pgs and adjust kb_used to include all pending backfill data
1083 int backfill_adjusted
= 0;
1085 osd
->_get_pgs(&pgs
);
1086 for (auto p
: pgs
) {
1087 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1089 if (backfill_adjusted
) {
1090 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1092 return ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1095 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1097 dout(20) << __func__
<< " " << m
->get_type_name() << " to osd." << peer
1098 << " from_epoch " << from_epoch
<< dendl
;
1099 OSDMapRef next_map
= get_nextmap_reserved();
1100 // service map is always newer/newest
1101 ceph_assert(from_epoch
<= next_map
->get_epoch());
1103 if (next_map
->is_down(peer
) ||
1104 next_map
->get_info(peer
).up_from
> from_epoch
) {
1106 release_map(next_map
);
1109 ConnectionRef peer_con
;
1110 if (peer
== whoami
) {
1111 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1113 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1114 next_map
->get_cluster_addrs(peer
), false, true);
1116 maybe_share_map(peer_con
.get(), next_map
);
1117 peer_con
->send_message(m
);
1118 release_map(next_map
);
1121 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1123 dout(20) << __func__
<< " from_epoch " << from_epoch
<< dendl
;
1124 OSDMapRef next_map
= get_nextmap_reserved();
1125 // service map is always newer/newest
1126 ceph_assert(from_epoch
<= next_map
->get_epoch());
1128 for (auto& iter
: messages
) {
1129 if (next_map
->is_down(iter
.first
) ||
1130 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1134 ConnectionRef peer_con
;
1135 if (iter
.first
== whoami
) {
1136 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1138 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1139 next_map
->get_cluster_addrs(iter
.first
), false, true);
1141 maybe_share_map(peer_con
.get(), next_map
);
1142 peer_con
->send_message(iter
.second
);
1144 release_map(next_map
);
1146 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1148 dout(20) << __func__
<< " to osd." << peer
1149 << " from_epoch " << from_epoch
<< dendl
;
1150 OSDMapRef next_map
= get_nextmap_reserved();
1151 // service map is always newer/newest
1152 ceph_assert(from_epoch
<= next_map
->get_epoch());
1154 if (next_map
->is_down(peer
) ||
1155 next_map
->get_info(peer
).up_from
> from_epoch
) {
1156 release_map(next_map
);
1160 if (peer
== whoami
) {
1161 con
= osd
->cluster_messenger
->get_loopback_connection();
1163 con
= osd
->cluster_messenger
->connect_to_osd(
1164 next_map
->get_cluster_addrs(peer
), false, true);
1166 release_map(next_map
);
// Return the (back, front) heartbeat connections to osd.<peer>, or a pair
// of null refs if the peer is down or restarted since from_epoch.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  dout(20) << __func__ << " to osd." << peer
	   << " from_epoch " << from_epoch << dendl;
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    // NOTE(review): early return of the empty pair reconstructed from
    // garbled source — confirm against version control.
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1192 entity_name_t
OSDService::get_cluster_msgr_name() const
1194 return cluster_messenger
->get_myname();
// Record that we want the monitor to set pg_temp for pgid to 'want'.
// The request is only (re)queued if it differs from what is already
// pending, or if 'forced' is set.
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
				    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  // NOTE(review): the 'forced' parameter and the '|| forced' condition are
  // reconstructed from the garbled source (which stores {want, forced}) —
  // confirm against the declaration in the header.
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
// Drop any wanted or pending pg_temp request for pgid.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
// Move all wanted pg_temp entries to the pending set (they have been sent
// to the monitor). Caller must hold pg_temp_lock.
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes directly when the stdlib supports C++17 map::merge
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
// Re-arm all previously sent (pending) pg_temp requests so they will be
// sent to the monitor again, merging them with anything newly wanted.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // NOTE(review): the _sent_pg_temp() call here is reconstructed from the
  // garbled source — confirm against version control.
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
// Pretty-print a pg_temp request: the acting set, annotated when forced.
std::ostream& operator<<(std::ostream& out,
			 const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    // NOTE(review): annotation text reconstructed — confirm exact string
    // against version control.
    out << " (forced)";
  }
  return out;
}
1251 void OSDService::send_pg_temp()
1253 std::lock_guard
l(pg_temp_lock
);
1254 if (pg_temp_wanted
.empty())
1256 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1257 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1258 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1259 auto& m
= ms
[pg_temp
.forced
];
1261 m
= new MOSDPGTemp(osdmap
->get_epoch());
1262 m
->forced
= pg_temp
.forced
;
1264 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1268 monc
->send_mon_message(m
);
// Remember that pgid was created and notify the monitor (luminous+ only,
// since older monitors do not understand MOSDPGCreated).
void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}
// Re-send MOSDPGCreated for every pg we still remember as created
// (e.g. after a monitor reconnect); luminous+ monitors only.
void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}
// Forget created-pg records whose pool is gone or no longer has the
// CREATING flag set — the monitor no longer needs the notification.
void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      // NOTE(review): else/++i branch reconstructed from garbled source —
      // confirm against version control.
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1316 // --------------------------------------
// Atomically read the boot/up/bind epochs. Any output pointer may be
// null, in which case that epoch is not reported.
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
				 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  // NOTE(review): the null guards are reconstructed from garbled source
  // (one dropped line before each assignment) — confirm against VCS.
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
// Atomically update the boot/up/bind epochs. Any input pointer may be
// null (that epoch is left unchanged). Epochs may only move forward,
// except that 0 is always accepted (reset).
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
			    const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  // NOTE(review): the null guards are reconstructed from garbled source —
  // confirm against version control.
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
// Begin an orderly shutdown. If we are up in the osdmap, tell the monitor
// we are going down (and dead), then wait — bounded by
// osd_mon_shutdown_timeout — for the ack to move us to STOPPING.
// Returns false if a stop was already in progress.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    // NOTE(review): the MOSDMarkMeDown construction is reconstructed from
    // garbled source (several argument lines dropped) — confirm against VCS.
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true,  // request ack
	true   // mark as down and dead
	));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    // wait (with timeout) for got_stop_ack() to flip us to STOPPING
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
// Monitor acknowledged our mark-me-down: transition PREPARING_TO_STOP ->
// STOPPING and wake the waiter in prepare_to_stop(). Ignored otherwise.
void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}
1390 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1391 OSDSuperblock
& sblock
)
1393 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1394 osdmap
->get_encoding_features());
1395 m
->cluster_osdmap_trim_lower_bound
= sblock
.cluster_osdmap_trim_lower_bound
;
1396 m
->newest_map
= sblock
.newest_map
;
1398 int max
= cct
->_conf
->osd_map_message_max
;
1399 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1401 if (since
< m
->cluster_osdmap_trim_lower_bound
) {
1402 // we don't have the next map the target wants, so start with a
1405 dout(10) << __func__
<< " cluster osdmap lower bound "
1406 << sblock
.cluster_osdmap_trim_lower_bound
1407 << " > since " << since
<< ", starting with full map"
1409 since
= m
->cluster_osdmap_trim_lower_bound
;
1410 if (!get_map_bl(since
, bl
)) {
1411 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1415 max_bytes
-= bl
.length();
1416 m
->maps
[since
] = std::move(bl
);
1418 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1420 if (get_inc_map_bl(e
, bl
)) {
1421 m
->incremental_maps
[e
] = bl
;
1423 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1424 if (!get_map_bl(e
, bl
)) {
1425 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1431 max_bytes
-= bl
.length();
1432 if (max
<= 0 || max_bytes
<= 0) {
1439 if (!m
->maps
.empty() ||
1440 !m
->incremental_maps
.empty()) {
1441 // send what we have so far
1446 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1447 m
->incremental_maps
[m
->newest_map
] = std::move(bl
);
1449 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1450 if (!get_map_bl(m
->newest_map
, bl
)) {
1451 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1455 m
->maps
[m
->newest_map
] = std::move(bl
);
1460 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1462 con
->send_message(m
);
1465 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1466 const OSDMapRef
& osdmap
)
1468 epoch_t to
= osdmap
->get_epoch();
1469 dout(10) << "send_incremental_map " << since
<< " -> " << to
1470 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1474 OSDSuperblock
sblock(get_superblock());
1475 if (since
< sblock
.oldest_map
) {
1476 // just send latest full map
1477 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1478 osdmap
->get_encoding_features());
1479 m
->cluster_osdmap_trim_lower_bound
= sblock
.cluster_osdmap_trim_lower_bound
;
1480 m
->newest_map
= sblock
.newest_map
;
1481 get_map_bl(to
, m
->maps
[to
]);
1486 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1487 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1488 << ", only sending most recent" << dendl
;
1489 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1492 m
= build_incremental_map_msg(since
, to
, sblock
);
1497 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1499 bool found
= map_bl_cache
.lookup(e
, &bl
);
1501 logger
->inc(l_osd_map_bl_cache_hit
);
1504 logger
->inc(l_osd_map_bl_cache_miss
);
1505 found
= store
->read(meta_ch
,
1506 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1507 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1514 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1516 std::lock_guard
l(map_cache_lock
);
1517 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1519 logger
->inc(l_osd_map_bl_cache_hit
);
1522 logger
->inc(l_osd_map_bl_cache_miss
);
1523 found
= store
->read(meta_ch
,
1524 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1525 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1527 _add_map_inc_bl(e
, bl
);
1532 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1534 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1535 // cache a contiguous buffer
1536 if (bl
.get_num_buffers() > 1) {
1539 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1540 map_bl_cache
.add(e
, bl
);
1543 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1545 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1546 // cache a contiguous buffer
1547 if (bl
.get_num_buffers() > 1) {
1550 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1551 map_bl_inc_cache
.add(e
, bl
);
1554 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1556 epoch_t e
= o
->get_epoch();
1558 if (cct
->_conf
->osd_map_dedup
) {
1559 // Dedup against an existing map at a nearby epoch
1560 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1562 OSDMap::dedup(for_dedup
.get(), o
);
1566 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1573 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1575 std::lock_guard
l(map_cache_lock
);
1576 OSDMapRef retval
= map_cache
.lookup(epoch
);
1578 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1579 logger
->inc(l_osd_map_cache_hit
);
1583 logger
->inc(l_osd_map_cache_miss
);
1584 epoch_t lb
= map_cache
.cached_key_lower_bound();
1586 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1587 logger
->inc(l_osd_map_cache_miss_low
);
1588 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1592 OSDMap
*map
= new OSDMap
;
1594 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1596 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1597 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1603 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1605 return _add_map(map
);
1611 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1613 reply_op_error(op
, err
, eversion_t(), 0, {});
1616 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1618 vector
<pg_log_op_return_item_t
> op_returns
)
1620 auto m
= op
->get_req
<MOSDOp
>();
1621 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1623 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1625 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1626 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1627 reply
->set_reply_versions(v
, uv
);
1628 reply
->set_op_returns(op_returns
);
1629 m
->get_connection()->send_message(reply
);
1632 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1634 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1638 auto m
= op
->get_req
<MOSDOp
>();
1639 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1641 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1643 if (pg
->is_ec_pg()) {
1645 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1646 * can get this result:
1647 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1648 * [CRUSH_ITEM_NONE, 2, 3]/3
1649 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1651 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1653 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1656 * We can't compute the op target based on the sending map epoch due to
1657 * splitting. The simplest thing is to detect such cases here and drop
1658 * them without an error (the client will resend anyway).
1660 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1661 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1663 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1664 << m
->get_map_epoch() << ", dropping" << dendl
;
1667 pg_t _pgid
= m
->get_raw_pg();
1669 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1670 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1671 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1672 pgid
.shard
!= pg
->pg_id
.shard
) {
1673 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1674 << m
->get_map_epoch() << ", dropping" << dendl
;
1679 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1680 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1681 << " pg " << m
->get_raw_pg()
1682 << " to osd." << whoami
1683 << " not " << pg
->get_acting()
1684 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
// Push a work item onto the back of the sharded op queue.
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
// Push a work item onto the front of the sharded op queue (requeue path).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1697 void OSDService::queue_recovery_context(
1699 GenContext
<ThreadPool::TPHandle
&> *c
,
1703 epoch_t e
= get_osdmap_epoch();
1705 uint64_t cost_for_queue
= [this, cost
] {
1706 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
1709 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
1710 * require very large costs for several messages in order to do any
1711 * meaningful amount of throttling. This branch should be removed after
1714 return cct
->_conf
->osd_recovery_cost
;
1720 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1721 new PGRecoveryContext(pg
->get_pgid(), c
, e
, priority
)),
1723 cct
->_conf
->osd_recovery_priority
,
1729 void OSDService::queue_for_snap_trim(PG
*pg
)
1731 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1734 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1735 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1736 cct
->_conf
->osd_snap_trim_cost
,
1737 cct
->_conf
->osd_snap_trim_priority
,
1740 pg
->get_osdmap_epoch()));
1743 template <class MSG_TYPE
>
1744 void OSDService::queue_scrub_event_msg(PG
* pg
,
1745 Scrub::scrub_prio_t with_priority
,
1746 unsigned int qu_priority
,
1747 Scrub::act_token_t act_token
)
1749 const auto epoch
= pg
->get_osdmap_epoch();
1750 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
, act_token
);
1751 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
1752 << ". Epoch: " << epoch
<< " token: " << act_token
<< dendl
;
1753 enqueue_back(OpSchedulerItem(
1754 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), get_scrub_cost(),
1755 pg
->scrub_requeue_priority(with_priority
, qu_priority
), ceph_clock_now(), 0, epoch
));
1758 template <class MSG_TYPE
>
1759 void OSDService::queue_scrub_event_msg(PG
* pg
,
1760 Scrub::scrub_prio_t with_priority
)
1762 const auto epoch
= pg
->get_osdmap_epoch();
1763 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1764 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1765 enqueue_back(OpSchedulerItem(
1766 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), get_scrub_cost(),
1767 pg
->scrub_requeue_priority(with_priority
), ceph_clock_now(), 0, epoch
));
// Queue cost for a scrub event. Under the mclock scheduler the cost
// scales with the maximum shallow-scrub chunk size; otherwise the flat
// legacy osd_scrub_cost is used.
int64_t OSDService::get_scrub_cost()
{
  int64_t cost_for_queue = cct->_conf->osd_scrub_cost;
  if (cct->_conf->osd_op_queue == "mclock_scheduler") {
    cost_for_queue = cct->_conf->osd_scrub_event_cost *
		     cct->_conf->osd_shallow_scrub_chunk_max;
  }
  return cost_for_queue;
}
1781 void OSDService::queue_for_scrub(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1783 queue_scrub_event_msg
<PGScrub
>(pg
, with_priority
);
1786 void OSDService::queue_scrub_after_repair(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1788 queue_scrub_event_msg
<PGScrubAfterRepair
>(pg
, with_priority
);
1791 void OSDService::queue_for_rep_scrub(PG
* pg
,
1792 Scrub::scrub_prio_t with_priority
,
1793 unsigned int qu_priority
,
1794 Scrub::act_token_t act_token
)
1796 queue_scrub_event_msg
<PGRepScrub
>(pg
, with_priority
, qu_priority
, act_token
);
1799 void OSDService::queue_for_rep_scrub_resched(PG
* pg
,
1800 Scrub::scrub_prio_t with_priority
,
1801 unsigned int qu_priority
,
1802 Scrub::act_token_t act_token
)
1804 // Resulting scrub event: 'SchedReplica'
1805 queue_scrub_event_msg
<PGRepScrubResched
>(pg
, with_priority
, qu_priority
,
1809 void OSDService::queue_for_scrub_granted(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1811 // Resulting scrub event: 'RemotesReserved'
1812 queue_scrub_event_msg
<PGScrubResourcesOK
>(pg
, with_priority
);
1815 void OSDService::queue_for_scrub_denied(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1817 // Resulting scrub event: 'ReservationFailure'
1818 queue_scrub_event_msg
<PGScrubDenied
>(pg
, with_priority
);
1821 void OSDService::queue_for_scrub_resched(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1823 // Resulting scrub event: 'InternalSchedScrub'
1824 queue_scrub_event_msg
<PGScrubResched
>(pg
, with_priority
);
1827 void OSDService::queue_scrub_pushes_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1829 // Resulting scrub event: 'ActivePushesUpd'
1830 queue_scrub_event_msg
<PGScrubPushesUpdate
>(pg
, with_priority
);
1833 void OSDService::queue_scrub_chunk_free(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1835 // Resulting scrub event: 'SelectedChunkFree'
1836 queue_scrub_event_msg
<PGScrubChunkIsFree
>(pg
, with_priority
);
1839 void OSDService::queue_scrub_chunk_busy(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1841 // Resulting scrub event: 'ChunkIsBusy'
1842 queue_scrub_event_msg
<PGScrubChunkIsBusy
>(pg
, with_priority
);
1845 void OSDService::queue_scrub_applied_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1847 queue_scrub_event_msg
<PGScrubAppliedUpdate
>(pg
, with_priority
);
1850 void OSDService::queue_scrub_unblocking(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1852 // Resulting scrub event: 'Unblocked'
1853 queue_scrub_event_msg
<PGScrubUnblocked
>(pg
, with_priority
);
1856 void OSDService::queue_scrub_digest_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1858 // Resulting scrub event: 'DigestUpdate'
1859 queue_scrub_event_msg
<PGScrubDigestUpdate
>(pg
, with_priority
);
1862 void OSDService::queue_scrub_got_local_map(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1864 // Resulting scrub event: 'IntLocalMapDone'
1865 queue_scrub_event_msg
<PGScrubGotLocalMap
>(pg
, with_priority
);
1868 void OSDService::queue_scrub_got_repl_maps(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1870 // Resulting scrub event: 'GotReplicas'
1871 queue_scrub_event_msg
<PGScrubGotReplMaps
>(pg
, with_priority
);
1874 void OSDService::queue_scrub_replica_pushes(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1876 // Resulting scrub event: 'ReplicaPushesUpd'
1877 queue_scrub_event_msg
<PGScrubReplicaPushes
>(pg
, with_priority
);
1880 void OSDService::queue_scrub_is_finished(PG
*pg
)
1882 // Resulting scrub event: 'ScrubFinished'
1883 queue_scrub_event_msg
<PGScrubScrubFinished
>(pg
, Scrub::scrub_prio_t::high_priority
);
1886 void OSDService::queue_scrub_next_chunk(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1888 // Resulting scrub event: 'NextChunk'
1889 queue_scrub_event_msg
<PGScrubGetNextChunk
>(pg
, with_priority
);
1892 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1894 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1897 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1898 new PGDelete(pgid
, e
)),
1899 cct
->_conf
->osd_pg_delete_cost
,
1900 cct
->_conf
->osd_pg_delete_priority
,
1906 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1908 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1913 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1915 std::lock_guard
l(merge_lock
);
1916 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1917 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1918 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1919 _send_ready_to_merge();
1922 void OSDService::set_ready_to_merge_target(PG
*pg
,
1924 epoch_t last_epoch_started
,
1925 epoch_t last_epoch_clean
)
1927 std::lock_guard
l(merge_lock
);
1928 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1929 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1932 last_epoch_clean
)));
1933 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1934 _send_ready_to_merge();
1937 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1939 std::lock_guard
l(merge_lock
);
1940 dout(10) << __func__
<< " " << source
<< dendl
;
1941 not_ready_to_merge_source
.insert(source
);
1942 assert(ready_to_merge_source
.count(source
) == 0);
1943 _send_ready_to_merge();
1946 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1948 std::lock_guard
l(merge_lock
);
1949 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1950 not_ready_to_merge_target
[target
] = source
;
1951 assert(ready_to_merge_target
.count(target
) == 0);
1952 _send_ready_to_merge();
// Public entry point: take merge_lock and flush any pending
// ready-to-merge notifications to the monitor.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1961 void OSDService::_send_ready_to_merge()
1963 dout(20) << __func__
1964 << " ready_to_merge_source " << ready_to_merge_source
1965 << " not_ready_to_merge_source " << not_ready_to_merge_source
1966 << " ready_to_merge_target " << ready_to_merge_target
1967 << " not_ready_to_merge_target " << not_ready_to_merge_target
1968 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1970 for (auto src
: not_ready_to_merge_source
) {
1971 if (sent_ready_to_merge_source
.count(src
) == 0) {
1972 monc
->send_mon_message(new MOSDPGReadyToMerge(
1976 osdmap
->get_epoch()));
1977 sent_ready_to_merge_source
.insert(src
);
1980 for (auto p
: not_ready_to_merge_target
) {
1981 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1982 monc
->send_mon_message(new MOSDPGReadyToMerge(
1986 osdmap
->get_epoch()));
1987 sent_ready_to_merge_source
.insert(p
.second
);
1990 for (auto src
: ready_to_merge_source
) {
1991 if (not_ready_to_merge_source
.count(src
.first
) ||
1992 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1995 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1996 if (p
!= ready_to_merge_target
.end() &&
1997 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1998 monc
->send_mon_message(new MOSDPGReadyToMerge(
1999 src
.first
, // source pgid
2000 src
.second
, // src version
2001 std::get
<0>(p
->second
), // target version
2002 std::get
<1>(p
->second
), // PG's last_epoch_started
2003 std::get
<2>(p
->second
), // PG's last_epoch_clean
2005 osdmap
->get_epoch()));
2006 sent_ready_to_merge_source
.insert(src
.first
);
// Remove pg from every merge-tracking set (ready / not-ready, source /
// target, and already-sent), e.g. when the PG goes away or re-peers.
void OSDService::clear_ready_to_merge(PG *pg)
{
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << pg->pg_id << dendl;
  ready_to_merge_source.erase(pg->pg_id.pgid);
  ready_to_merge_target.erase(pg->pg_id.pgid);
  not_ready_to_merge_source.erase(pg->pg_id.pgid);
  not_ready_to_merge_target.erase(pg->pg_id.pgid);
  sent_ready_to_merge_source.erase(pg->pg_id.pgid);
}
// Forget which ready-to-merge notifications were already sent, so they
// will be re-sent (e.g. after a monitor reconnect).
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
// Drop sent-ready-to-merge records for pgs that no longer exist in the
// given osdmap (their merge completed or the pool changed).
void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
{
  std::lock_guard l(merge_lock);
  auto i = sent_ready_to_merge_source.begin();
  while (i != sent_ready_to_merge_source.end()) {
    if (!osdmap->pg_exists(*i)) {
      dout(10) << __func__ << " " << *i << dendl;
      i = sent_ready_to_merge_source.erase(i);
    } else {
      // NOTE(review): else/++i branch reconstructed from garbled source —
      // confirm against version control.
      dout(20) << __func__ << " exist " << *i << dendl;
      ++i;
    }
  }
}
2045 void OSDService::_queue_for_recovery(
2046 pg_awaiting_throttle_t p
,
2047 uint64_t reserved_pushes
)
2049 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
2051 uint64_t cost_for_queue
= [this, &reserved_pushes
, &p
] {
2052 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
2053 return p
.cost_per_object
* reserved_pushes
;
2055 /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
2056 * require very large costs for several messages in order to do any
2057 * meaningful amount of throttling. This branch should be removed after
2060 return cct
->_conf
->osd_recovery_cost
;
2066 unique_ptr
<OpSchedulerItem::OpQueueable
>(
2073 cct
->_conf
->osd_recovery_priority
,
2079 // ====================================================================
2083 #define dout_prefix *_dout
2085 // Commands shared between OSD's console and admin console:
2086 namespace ceph::osd_cmds
{
2088 int heap(CephContext
& cct
,
2089 const cmdmap_t
& cmdmap
,
2090 std::ostream
& outos
,
2091 std::ostream
& erros
);
2093 } // namespace ceph::osd_cmds
2095 int OSD::mkfs(CephContext
*cct
,
2096 std::unique_ptr
<ObjectStore
> store
,
2099 string osdspec_affinity
)
2105 // if we are fed a uuid for this osd, use it.
2106 store
->set_fsid(cct
->_conf
->osd_uuid
);
2108 ret
= store
->mkfs();
2110 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2111 << cpp_strerror(ret
) << dendl
;
2115 store
->set_cache_shards(1); // doesn't matter for mkfs!
2117 ret
= store
->mount();
2119 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2120 << cpp_strerror(ret
) << dendl
;
2124 auto umount_store
= make_scope_guard([&] {
2128 ObjectStore::CollectionHandle ch
=
2129 store
->open_collection(coll_t::meta());
2131 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2133 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2136 /* if we already have superblock, check content of superblock */
2137 dout(0) << " have superblock" << dendl
;
2138 auto p
= sbbl
.cbegin();
2140 if (whoami
!= sb
.whoami
) {
2141 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2145 if (fsid
!= sb
.cluster_fsid
) {
2146 derr
<< "provided cluster fsid " << fsid
2147 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2151 // create superblock
2152 sb
.cluster_fsid
= fsid
;
2153 sb
.osd_fsid
= store
->get_fsid();
2155 sb
.compat_features
= get_osd_initial_compat_set();
2160 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2162 ObjectStore::Transaction t
;
2163 t
.create_collection(coll_t::meta(), 0);
2164 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2165 ret
= store
->queue_transaction(ch
, std::move(t
));
2167 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2168 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2174 ret
= write_meta(cct
, store
.get(), sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
2176 derr
<< "OSD::mkfs: failed to write fsid file: error "
2177 << cpp_strerror(ret
) << dendl
;
2182 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2187 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2188 r
= store
->write_meta("magic", val
);
2192 snprintf(val
, sizeof(val
), "%d", whoami
);
2193 r
= store
->write_meta("whoami", val
);
2197 cluster_fsid
.print(val
);
2198 r
= store
->write_meta("ceph_fsid", val
);
2202 string key
= cct
->_conf
.get_val
<string
>("key");
2204 r
= store
->write_meta("osd_key", key
);
2208 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2209 if (!keyfile
.empty()) {
2212 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2214 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2215 << err
<< ": " << cpp_strerror(r
) << dendl
;
2218 r
= store
->write_meta("osd_key", keybl
.to_str());
2223 if (!osdspec_affinity
.empty()) {
2224 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2229 r
= store
->write_meta("ceph_version_when_created", pretty_version_to_str());
2233 ostringstream created_at
;
2234 utime_t now
= ceph_clock_now();
2235 now
.gmtime(created_at
);
2236 r
= store
->write_meta("created_at", created_at
.str());
2240 r
= store
->write_meta("ready", "ready");
2247 int OSD::peek_meta(ObjectStore
*store
,
2249 uuid_d
*cluster_fsid
,
2252 ceph_release_t
*require_osd_release
)
2256 int r
= store
->read_meta("magic", &val
);
2261 r
= store
->read_meta("whoami", &val
);
2264 *whoami
= atoi(val
.c_str());
2266 r
= store
->read_meta("ceph_fsid", &val
);
2269 r
= cluster_fsid
->parse(val
.c_str());
2273 r
= store
->read_meta("fsid", &val
);
2275 *osd_fsid
= uuid_d();
2277 r
= osd_fsid
->parse(val
.c_str());
2282 r
= store
->read_meta("require_osd_release", &val
);
2284 *require_osd_release
= ceph_release_from_name(val
);
2292 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2296 OSD::OSD(CephContext
*cct_
,
2297 std::unique_ptr
<ObjectStore
> store_
,
2299 Messenger
*internal_messenger
,
2300 Messenger
*external_messenger
,
2301 Messenger
*hb_client_front
,
2302 Messenger
*hb_client_back
,
2303 Messenger
*hb_front_serverm
,
2304 Messenger
*hb_back_serverm
,
2305 Messenger
*osdc_messenger
,
2307 const std::string
&dev
, const std::string
&jdev
,
2308 ceph::async::io_context_pool
& poolctx
) :
2310 tick_timer(cct
, osd_lock
),
2311 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2312 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2313 cluster_messenger(internal_messenger
),
2314 client_messenger(external_messenger
),
2315 objecter_messenger(osdc_messenger
),
2317 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2318 logger(create_logger()),
2319 recoverystate_perf(create_recoverystate_perf()),
2320 store(std::move(store_
)),
2321 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2322 clog(log_client
.create_channel()),
2324 dev_path(dev
), journal_path(jdev
),
2325 store_is_rotational(store
->is_rotational()),
2326 trace_endpoint("0.0.0.0", 0, "osd"),
2328 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2329 "osd_pg_epoch_max_lag_factor")),
2330 osd_compat(get_osd_compat_set()),
2331 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2332 get_num_op_threads()),
2333 heartbeat_stop(false),
2334 heartbeat_need_update(true),
2335 hb_front_client_messenger(hb_client_front
),
2336 hb_back_client_messenger(hb_client_back
),
2337 hb_front_server_messenger(hb_front_serverm
),
2338 hb_back_server_messenger(hb_back_serverm
),
2340 heartbeat_thread(this),
2341 heartbeat_dispatcher(this),
2342 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2343 cct
->_conf
->osd_num_op_tracker_shard
),
2344 test_ops_hook(NULL
),
2347 ceph::make_timespan(cct
->_conf
->osd_op_thread_timeout
),
2348 ceph::make_timespan(cct
->_conf
->osd_op_thread_suicide_timeout
),
2350 last_pg_create_epoch(0),
2353 requested_full_first(0),
2354 requested_full_last(0),
2355 service(this, poolctx
)
2358 if (!gss_ktfile_client
.empty()) {
2359 // Assert we can export environment variable
2361 The default client keytab is used, if it is present and readable,
2362 to automatically obtain initial credentials for GSSAPI client
2363 applications. The principal name of the first entry in the client
2364 keytab is used by default when obtaining initial credentials.
2365 1. The KRB5_CLIENT_KTNAME environment variable.
2366 2. The default_client_keytab_name profile variable in [libdefaults].
2367 3. The hardcoded default, DEFCKTNAME.
2369 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2370 gss_ktfile_client
.c_str(), 1));
2371 ceph_assert(set_result
== 0);
2374 monc
->set_messenger(client_messenger
);
2375 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2376 cct
->_conf
->osd_op_log_threshold
);
2377 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2378 cct
->_conf
->osd_op_history_duration
);
2379 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2380 cct
->_conf
->osd_op_history_slow_op_threshold
);
2381 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2383 std::stringstream ss
;
2384 ss
<< "osd." << whoami
;
2385 trace_endpoint
.copy_name(ss
.str());
2388 // initialize shards
2389 num_shards
= get_num_op_shards();
2390 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2391 OSDShard
*one_shard
= new OSDShard(
2395 shards
.push_back(one_shard
);
2401 while (!shards
.empty()) {
2402 delete shards
.back();
2405 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2406 cct
->get_perfcounters_collection()->remove(logger
);
2407 delete recoverystate_perf
;
2411 double OSD::get_tick_interval() const
2413 // vary +/- 5% to avoid scrub scheduling livelocks
2414 constexpr auto delta
= 0.05;
2415 return (OSD_TICK_INTERVAL
*
2416 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2419 void OSD::handle_signal(int signum
)
2421 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2422 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2428 std::lock_guard
lock(osd_lock
);
2432 if (store
->test_mount_in_use()) {
2433 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2434 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2438 cct
->_conf
.add_observer(this);
2442 int OSD::set_numa_affinity()
2444 // storage numa node
2445 int store_node
= -1;
2446 store
->get_numa_node(&store_node
, nullptr, nullptr);
2447 if (store_node
>= 0) {
2448 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2451 // check network numa node(s)
2452 int front_node
= -1, back_node
= -1;
2453 string front_iface
= pick_iface(
2455 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2456 string back_iface
= pick_iface(
2458 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2459 int r
= get_iface_numa_node(front_iface
, &front_node
);
2460 if (r
>= 0 && front_node
>= 0) {
2461 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2462 << front_node
<< dendl
;
2463 r
= get_iface_numa_node(back_iface
, &back_node
);
2464 if (r
>= 0 && back_node
>= 0) {
2465 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2466 << back_node
<< dendl
;
2467 if (front_node
== back_node
&&
2468 front_node
== store_node
) {
2469 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2470 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2471 numa_node
= front_node
;
2473 } else if (front_node
!= back_node
) {
2474 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2477 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2480 } else if (back_node
== -2) {
2481 dout(1) << __func__
<< " cluster network " << back_iface
2482 << " ports numa nodes do not match" << dendl
;
2484 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2485 << "' numa node: " << cpp_strerror(r
) << dendl
;
2487 } else if (front_node
== -2) {
2488 dout(1) << __func__
<< " public network " << front_iface
2489 << " ports numa nodes do not match" << dendl
;
2491 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2492 << "' numa node: " << cpp_strerror(r
) << dendl
;
2494 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2495 // this takes precedence over the automagic logic above
2498 if (numa_node
>= 0) {
2499 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2501 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2502 << " CPUs" << dendl
;
2505 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2507 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2509 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2512 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2518 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2525 class OSDSocketHook
: public AdminSocketHook
{
2528 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2529 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2530 const bufferlist
& inbl
,
2533 bufferlist
& out
) override
{
2534 ceph_abort("should use async hook");
2537 std::string_view prefix
,
2538 const cmdmap_t
& cmdmap
,
2540 const bufferlist
& inbl
,
2541 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2543 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2544 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2546 on_finish(-EINVAL
, e
.what(), empty
);
2551 std::set
<int64_t> OSD::get_mapped_pools()
2553 std::set
<int64_t> pools
;
2554 std::vector
<spg_t
> pgids
;
2556 for (const auto &pgid
: pgids
) {
2557 pools
.insert(pgid
.pool());
2562 OSD::PGRefOrError
OSD::locate_asok_target(const cmdmap_t
& cmdmap
,
2567 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2568 ss
<< "no pgid specified";
2569 return OSD::PGRefOrError
{std::nullopt
, -EINVAL
};
2573 if (!pgid
.parse(pgidstr
.c_str())) {
2574 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2575 return OSD::PGRefOrError
{std::nullopt
, -EINVAL
};
2580 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) && (pg
= _lookup_lock_pg(pcand
))) {
2581 if (pg
->is_primary() || !only_primary
) {
2582 return OSD::PGRefOrError
{pg
, 0};
2585 ss
<< "not primary for pgid " << pgid
;
2587 return OSD::PGRefOrError
{std::nullopt
, -EAGAIN
};
2589 ss
<< "i don't have pgid " << pgid
;
2590 return OSD::PGRefOrError
{std::nullopt
, -ENOENT
};
2594 // note that the cmdmap is explicitly copied into asok_route_to_pg()
2595 int OSD::asok_route_to_pg(
2597 std::string_view prefix
,
2601 const bufferlist
& inbl
,
2603 std::function
<void(int, const std::string
&, bufferlist
&)> on_finish
)
2605 auto [target_pg
, ret
] = locate_asok_target(cmdmap
, ss
, only_primary
);
2607 if (!target_pg
.has_value()) {
2608 // 'ss' and 'ret' already contain the error information
2609 on_finish(ret
, ss
.str(), outbl
);
2613 // the PG was locked by locate_asok_target()
2615 (*target_pg
)->do_command(prefix
, cmdmap
, inbl
, on_finish
);
2616 (*target_pg
)->unlock();
2617 return 0; // the pg handler calls on_finish directly
2618 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2619 (*target_pg
)->unlock();
2621 on_finish(ret
, ss
.str(), outbl
);
2626 void OSD::asok_command(
2627 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2629 const bufferlist
& inbl
,
2630 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2633 stringstream ss
; // stderr error message stream
2634 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2636 // --- PG commands are routed here to PG::do_command ---
2637 if (prefix
== "pg" ||
2638 prefix
== "query" ||
2640 prefix
== "mark_unfound_lost" ||
2641 prefix
== "list_unfound" ||
2642 prefix
== "scrub" ||
2643 prefix
== "deep_scrub"
2647 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2648 ss
<< "no pgid specified";
2652 if (!pgid
.parse(pgidstr
.c_str())) {
2653 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2659 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2660 (pg
= _lookup_lock_pg(pcand
))) {
2661 if (pg
->is_primary()) {
2662 cmdmap_t new_cmdmap
= cmdmap
;
2664 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2666 return; // the pg handler calls on_finish directly
2667 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2674 ss
<< "not primary for pgid " << pgid
;
2675 // do not reply; they will get newer maps and realize they
2682 ss
<< "i don't have pgid " << pgid
;
2687 // --- PG commands that will be answered even if !primary ---
2689 else if (prefix
== "scrubdebug") {
2690 asok_route_to_pg(false, prefix
, cmdmap
, f
, ss
, inbl
, outbl
, on_finish
);
2694 // --- OSD commands follow ---
2696 else if (prefix
== "status") {
2697 lock_guard
l(osd_lock
);
2698 f
->open_object_section("status");
2699 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2700 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2701 f
->dump_unsigned("whoami", superblock
.whoami
);
2702 f
->dump_string("state", get_state_name(get_state()));
2703 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2704 f
->dump_unsigned("cluster_osdmap_trim_lower_bound",
2705 superblock
.cluster_osdmap_trim_lower_bound
);
2706 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2707 f
->dump_unsigned("num_pgs", num_pgs
);
2709 } else if (prefix
== "flush_journal") {
2710 store
->flush_journal();
2711 } else if (prefix
== "dump_ops_in_flight" ||
2713 prefix
== "dump_blocked_ops" ||
2714 prefix
== "dump_blocked_ops_count" ||
2715 prefix
== "dump_historic_ops" ||
2716 prefix
== "dump_historic_ops_by_duration" ||
2717 prefix
== "dump_historic_slow_ops") {
2719 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2720 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2721 will start to track new ops received afterwards.";
2723 set
<string
> filters
;
2724 vector
<string
> filter_str
;
2725 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2726 copy(filter_str
.begin(), filter_str
.end(),
2727 inserter(filters
, filters
.end()));
2730 if (prefix
== "dump_ops_in_flight" ||
2732 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2738 if (prefix
== "dump_blocked_ops") {
2739 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2745 if (prefix
== "dump_blocked_ops_count") {
2746 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
, true)) {
2752 if (prefix
== "dump_historic_ops") {
2753 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2759 if (prefix
== "dump_historic_ops_by_duration") {
2760 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2766 if (prefix
== "dump_historic_slow_ops") {
2767 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2773 } else if (prefix
== "dump_op_pq_state") {
2774 f
->open_object_section("pq");
2775 op_shardedwq
.dump(f
);
2777 } else if (prefix
== "dump_blocklist") {
2778 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2779 list
<pair
<entity_addr_t
,utime_t
> > rbl
;
2780 OSDMapRef curmap
= service
.get_osdmap();
2781 curmap
->get_blocklist(&bl
, &rbl
);
2783 f
->open_array_section("blocklist");
2784 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2785 it
!= bl
.end(); ++it
) {
2786 f
->open_object_section("entry");
2787 f
->open_object_section("entity_addr_t");
2789 f
->close_section(); //entity_addr_t
2790 it
->second
.localtime(f
->dump_stream("expire_time"));
2791 f
->close_section(); //entry
2793 f
->close_section(); //blocklist
2794 f
->open_array_section("range_blocklist");
2795 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= rbl
.begin();
2796 it
!= rbl
.end(); ++it
) {
2797 f
->open_object_section("entry");
2798 f
->open_object_section("entity_addr_t");
2800 f
->close_section(); //entity_addr_t
2801 it
->second
.localtime(f
->dump_stream("expire_time"));
2802 f
->close_section(); //entry
2804 f
->close_section(); //blocklist
2805 } else if (prefix
== "dump_watchers") {
2806 list
<obj_watch_item_t
> watchers
;
2810 for (auto& pg
: pgs
) {
2811 list
<obj_watch_item_t
> pg_watchers
;
2812 pg
->get_watchers(&pg_watchers
);
2813 watchers
.splice(watchers
.end(), pg_watchers
);
2816 f
->open_array_section("watchers");
2817 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2818 it
!= watchers
.end(); ++it
) {
2820 f
->open_object_section("watch");
2822 f
->dump_string("namespace", it
->obj
.nspace
);
2823 f
->dump_string("object", it
->obj
.oid
.name
);
2825 f
->open_object_section("entity_name");
2826 it
->wi
.name
.dump(f
);
2827 f
->close_section(); //entity_name_t
2829 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2830 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2832 f
->open_object_section("entity_addr_t");
2833 it
->wi
.addr
.dump(f
);
2834 f
->close_section(); //entity_addr_t
2836 f
->close_section(); //watch
2839 f
->close_section(); //watchers
2840 } else if (prefix
== "dump_recovery_reservations") {
2841 f
->open_object_section("reservations");
2842 f
->open_object_section("local_reservations");
2843 service
.local_reserver
.dump(f
);
2845 f
->open_object_section("remote_reservations");
2846 service
.remote_reserver
.dump(f
);
2849 } else if (prefix
== "dump_scrub_reservations") {
2850 f
->open_object_section("scrub_reservations");
2851 service
.get_scrub_services().dump_scrub_reservations(f
);
2853 } else if (prefix
== "get_latest_osdmap") {
2854 get_latest_osdmap();
2855 } else if (prefix
== "set_heap_property") {
2859 bool success
= false;
2860 if (!cmd_getval(cmdmap
, "property", property
)) {
2861 error
= "unable to get property";
2863 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2864 error
= "unable to get value";
2866 } else if (value
< 0) {
2867 error
= "negative value not allowed";
2869 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2870 error
= "invalid property";
2875 f
->open_object_section("result");
2876 f
->dump_string("error", error
);
2877 f
->dump_bool("success", success
);
2879 } else if (prefix
== "get_heap_property") {
2883 bool success
= false;
2884 if (!cmd_getval(cmdmap
, "property", property
)) {
2885 error
= "unable to get property";
2887 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2888 error
= "invalid property";
2893 f
->open_object_section("result");
2894 f
->dump_string("error", error
);
2895 f
->dump_bool("success", success
);
2896 f
->dump_int("value", value
);
2898 } else if (prefix
== "dump_objectstore_kv_stats") {
2899 store
->get_db_statistics(f
);
2900 } else if (prefix
== "dump_scrubs") {
2901 service
.get_scrub_services().dump_scrubs(f
);
2902 } else if (prefix
== "calc_objectstore_db_histogram") {
2903 store
->generate_db_histogram(f
);
2904 } else if (prefix
== "flush_store_cache") {
2905 store
->flush_cache(&ss
);
2906 } else if (prefix
== "rotate-stored-key") {
2907 store
->write_meta("osd_key", inbl
.to_str());
2908 } else if (prefix
== "dump_pgstate_history") {
2909 f
->open_object_section("pgstate_history");
2910 f
->open_array_section("pgs");
2913 for (auto& pg
: pgs
) {
2914 f
->open_object_section("pg");
2915 f
->dump_stream("pg") << pg
->pg_id
;
2916 f
->dump_string("currently", pg
->get_current_state());
2917 pg
->dump_pgstate_history(f
);
2922 } else if (prefix
== "compact") {
2923 dout(1) << "triggering manual compaction" << dendl
;
2924 auto start
= ceph::coarse_mono_clock::now();
2926 auto end
= ceph::coarse_mono_clock::now();
2927 double duration
= std::chrono::duration
<double>(end
-start
).count();
2928 dout(1) << "finished manual compaction in "
2930 << " seconds" << dendl
;
2931 f
->open_object_section("compact_result");
2932 f
->dump_float("elapsed_time", duration
);
2934 } else if (prefix
== "get_mapped_pools") {
2935 f
->open_array_section("mapped_pools");
2936 set
<int64_t> poollist
= get_mapped_pools();
2937 for (auto pool
: poollist
) {
2938 f
->dump_int("pool_id", pool
);
2941 } else if (prefix
== "smart") {
2943 cmd_getval(cmdmap
, "devid", devid
);
2945 probe_smart(devid
, out
);
2946 outbl
.append(out
.str());
2947 } else if (prefix
== "list_devices") {
2948 set
<string
> devnames
;
2949 store
->get_devices(&devnames
);
2950 f
->open_array_section("list_devices");
2951 for (auto dev
: devnames
) {
2952 if (dev
.find("dm-") == 0) {
2956 f
->open_object_section("device");
2957 f
->dump_string("device", "/dev/" + dev
);
2958 f
->dump_string("device_id", get_device_id(dev
, &err
));
2962 } else if (prefix
== "send_beacon") {
2963 lock_guard
l(osd_lock
);
2965 send_beacon(ceph::coarse_mono_clock::now());
2969 else if (prefix
== "cluster_log") {
2971 cmd_getval(cmdmap
, "message", msg
);
2974 ss
<< "ignoring empty log message";
2977 string message
= msg
.front();
2978 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2979 message
+= " " + *a
;
2981 cmd_getval(cmdmap
, "level", lvl
);
2982 clog_type level
= string_to_clog_type(lvl
);
2985 ss
<< "unknown level '" << lvl
<< "'";
2988 clog
->do_log(level
, message
);
2991 else if (prefix
== "bench") {
2992 // default count 1G, size 4MB
2993 int64_t count
= cmd_getval_or
<int64_t>(cmdmap
, "count", 1LL << 30);
2994 int64_t bsize
= cmd_getval_or
<int64_t>(cmdmap
, "size", 4LL << 20);
2995 int64_t osize
= cmd_getval_or
<int64_t>(cmdmap
, "object_size", 0);
2996 int64_t onum
= cmd_getval_or
<int64_t>(cmdmap
, "object_num", 0);
2997 double elapsed
= 0.0;
2999 ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
3004 double rate
= count
/ elapsed
;
3005 double iops
= rate
/ bsize
;
3006 f
->open_object_section("osd_bench_results");
3007 f
->dump_int("bytes_written", count
);
3008 f
->dump_int("blocksize", bsize
);
3009 f
->dump_float("elapsed_sec", elapsed
);
3010 f
->dump_float("bytes_per_sec", rate
);
3011 f
->dump_float("iops", iops
);
3015 else if (prefix
== "flush_pg_stats") {
3016 mgrc
.send_pgstats();
3017 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
3020 else if (prefix
== "heap") {
3021 std::stringstream outss
;
3022 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, outss
, ss
);
3023 outbl
.append(outss
);
3026 else if (prefix
== "debug dump_missing") {
3027 f
->open_array_section("pgs");
3030 for (auto& pg
: pgs
) {
3031 string s
= stringify(pg
->pg_id
);
3032 f
->open_array_section(s
.c_str());
3034 pg
->dump_missing(f
);
3041 else if (prefix
== "debug kick_recovery_wq") {
3043 cmd_getval(cmdmap
, "delay", delay
);
3046 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
3048 ss
<< "kick_recovery_wq: error setting "
3049 << "osd_recovery_delay_start to '" << delay
<< "': error "
3053 cct
->_conf
.apply_changes(nullptr);
3054 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
3055 << "to " << cct
->_conf
->osd_recovery_delay_start
;
3058 else if (prefix
== "cpu_profiler") {
3061 cmd_getval(cmdmap
, "arg", arg
);
3062 vector
<string
> argvec
;
3063 get_str_vec(arg
, argvec
);
3064 cpu_profiler_handle_command(argvec
, ds
);
3065 outbl
.append(ds
.str());
3068 else if (prefix
== "dump_pg_recovery_stats") {
3069 lock_guard
l(osd_lock
);
3070 pg_recovery_stats
.dump_formatted(f
);
3073 else if (prefix
== "reset_pg_recovery_stats") {
3074 lock_guard
l(osd_lock
);
3075 pg_recovery_stats
.reset();
3078 else if (prefix
== "perf histogram dump") {
3080 std::string counter
;
3081 cmd_getval(cmdmap
, "logger", logger
);
3082 cmd_getval(cmdmap
, "counter", counter
);
3083 cct
->get_perfcounters_collection()->dump_formatted_histograms(
3084 f
, false, logger
, counter
);
3087 else if (prefix
== "cache drop") {
3088 lock_guard
l(osd_lock
);
3089 dout(20) << "clearing all caches" << dendl
;
3090 // Clear the objectstore's cache - onode and buffer for Bluestore,
3091 // system's pagecache for Filestore
3092 ret
= store
->flush_cache(&ss
);
3094 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
3097 // Clear the objectcontext cache (per PG)
3100 for (auto& pg
: pgs
) {
3105 else if (prefix
== "cache status") {
3106 lock_guard
l(osd_lock
);
3107 int obj_ctx_count
= 0;
3110 for (auto& pg
: pgs
) {
3111 obj_ctx_count
+= pg
->get_cache_obj_count();
3113 f
->open_object_section("cache_status");
3114 f
->dump_int("object_ctx", obj_ctx_count
);
3115 store
->dump_cache_stats(f
);
3119 else if (prefix
== "scrub_purged_snaps") {
3120 lock_guard
l(osd_lock
);
3121 scrub_purged_snaps();
3124 else if (prefix
== "dump_osd_network") {
3125 lock_guard
l(osd_lock
);
3127 if (!(cmd_getval(cmdmap
, "value", value
))) {
3128 // Convert milliseconds to microseconds
3129 value
= static_cast<double>(g_conf().get_val
<double>(
3130 "mon_warn_on_slow_ping_time")) * 1000;
3132 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
3133 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
3134 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
3137 // Convert user input to microseconds
3140 if (value
< 0) value
= 0;
3142 struct osd_ping_time_t
{
3146 std::array
<uint32_t,3> times
;
3147 std::array
<uint32_t,3> min
;
3148 std::array
<uint32_t,3> max
;
3150 uint32_t last_update
;
3152 bool operator<(const osd_ping_time_t
& rhs
) const {
3153 if (pingtime
< rhs
.pingtime
)
3155 if (pingtime
> rhs
.pingtime
)
3165 set
<osd_ping_time_t
> sorted
;
3166 // Get pingtimes under lock and not on the stack
3167 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3168 service
.get_hb_pingtime(pingtimes
);
3169 for (auto j
: *pingtimes
) {
3170 if (j
.second
.last_update
== 0)
3172 osd_ping_time_t item
;
3173 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3174 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3175 if (item
.pingtime
>= value
) {
3177 item
.times
[0] = j
.second
.back_pingtime
[0];
3178 item
.times
[1] = j
.second
.back_pingtime
[1];
3179 item
.times
[2] = j
.second
.back_pingtime
[2];
3180 item
.min
[0] = j
.second
.back_min
[0];
3181 item
.min
[1] = j
.second
.back_min
[1];
3182 item
.min
[2] = j
.second
.back_min
[2];
3183 item
.max
[0] = j
.second
.back_max
[0];
3184 item
.max
[1] = j
.second
.back_max
[1];
3185 item
.max
[2] = j
.second
.back_max
[2];
3186 item
.last
= j
.second
.back_last
;
3188 item
.last_update
= j
.second
.last_update
;
3189 sorted
.emplace(item
);
3191 if (j
.second
.front_last
== 0)
3193 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3194 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3195 if (item
.pingtime
>= value
) {
3197 item
.times
[0] = j
.second
.front_pingtime
[0];
3198 item
.times
[1] = j
.second
.front_pingtime
[1];
3199 item
.times
[2] = j
.second
.front_pingtime
[2];
3200 item
.min
[0] = j
.second
.front_min
[0];
3201 item
.min
[1] = j
.second
.front_min
[1];
3202 item
.min
[2] = j
.second
.front_min
[2];
3203 item
.max
[0] = j
.second
.front_max
[0];
3204 item
.max
[1] = j
.second
.front_max
[1];
3205 item
.max
[2] = j
.second
.front_max
[2];
3206 item
.last
= j
.second
.front_last
;
3207 item
.last_update
= j
.second
.last_update
;
3209 sorted
.emplace(item
);
3214 // Network ping times (1min 5min 15min)
3215 f
->open_object_section("network_ping_times");
3216 f
->dump_int("threshold", value
/ 1000);
3217 f
->open_array_section("entries");
3218 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3219 ceph_assert(sitem
.pingtime
>= value
);
3220 f
->open_object_section("entry");
3222 const time_t lu(sitem
.last_update
);
3224 string
lustr(ctime_r(&lu
, buffer
));
3225 lustr
.pop_back(); // Remove trailing \n
3226 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3227 f
->dump_string("last update", lustr
);
3228 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3229 f
->dump_int("from osd", whoami
);
3230 f
->dump_int("to osd", sitem
.to
);
3231 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3232 f
->open_object_section("average");
3233 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3234 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3235 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3236 f
->close_section(); // average
3237 f
->open_object_section("min");
3238 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3239 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3240 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3241 f
->close_section(); // min
3242 f
->open_object_section("max");
3243 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3244 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3245 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3246 f
->close_section(); // max
3247 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3248 f
->close_section(); // entry
3250 f
->close_section(); // entries
3251 f
->close_section(); // network_ping_times
3252 } else if (prefix
== "dump_pool_statfs") {
3253 lock_guard
l(osd_lock
);
3256 if (!(cmd_getval(cmdmap
, "poolid", p
))) {
3257 ss
<< "Error dumping pool statfs: no poolid provided";
3263 bool per_pool_omap_stats
= false;
3265 ret
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
3267 ss
<< "Error dumping pool statfs: " << cpp_strerror(ret
);
3270 ss
<< "dumping pool statfs...";
3271 f
->open_object_section("pool_statfs");
3272 f
->dump_int("poolid", p
);
3277 ceph_abort_msg("broken asok registration");
3281 on_finish(ret
, ss
.str(), outbl
);
3284 int OSD::run_osd_bench_test(
3293 srand(time(NULL
) % (unsigned long) -1);
3294 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
3296 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
3297 // let us limit the block size because the next checks rely on it
3298 // having a sane value. If we allow any block size to be set things
3299 // can still go sideways.
3300 ss
<< "block 'size' values are capped at "
3301 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
3302 << " a higher value, please adjust 'osd_bench_max_block_size'";
3305 } else if (bsize
< (int64_t) (1 << 20)) {
3306 // entering the realm of small block sizes.
3307 // limit the count to a sane value, assuming a configurable amount of
3308 // IOPS and duration, so that the OSD doesn't get hung up on this,
3309 // preventing timeouts from going off
3311 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
3312 if (count
> max_count
) {
3313 ss
<< "'count' values greater than " << max_count
3314 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3315 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
3316 << " for " << duration
<< " seconds,"
3317 << " can cause ill effects on osd. "
3318 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3319 << " value if you wish to use a higher 'count'.";
3324 // 1MB block sizes are big enough so that we get more stuff done.
3325 // However, to avoid the osd from getting hung on this and having
3326 // timers being triggered, we are going to limit the count assuming
3327 // a configurable throughput and duration.
3328 // NOTE: max_count is the total amount of bytes that we believe we
3329 // will be able to write during 'duration' for the given
3330 // throughput. The block size hardly impacts this unless it's
3331 // way too big. Given we already check how big the block size
3332 // is, it's safe to assume everything will check out.
3334 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
3335 if (count
> max_count
) {
3336 ss
<< "'count' values greater than " << max_count
3337 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3338 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
3339 << " for " << duration
<< " seconds,"
3340 << " can cause ill effects on osd. "
3341 << " Please adjust 'osd_bench_large_size_max_throughput'"
3342 << " with a higher value if you wish to use a higher 'count'.";
3348 if (osize
&& bsize
> osize
) {
3352 dout(1) << " bench count " << count
3353 << " bsize " << byte_u_t(bsize
) << dendl
;
3355 ObjectStore::Transaction cleanupt
;
3357 if (osize
&& onum
) {
3359 bufferptr
bp(osize
);
3360 memset(bp
.c_str(), 'a', bp
.length());
3361 bl
.push_back(std::move(bp
));
3362 bl
.rebuild_page_aligned();
3363 for (int i
=0; i
<onum
; ++i
) {
3365 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
3367 hobject_t
soid(sobject_t(oid
, 0));
3368 ObjectStore::Transaction t
;
3369 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
3370 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3371 cleanupt
.remove(coll_t(), ghobject_t(soid
));
3377 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3383 utime_t start
= ceph_clock_now();
3384 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
3386 unsigned offset
= 0;
3387 bufferptr
bp(bsize
);
3388 memset(bp
.c_str(), rand() & 0xff, bp
.length());
3389 bl
.push_back(std::move(bp
));
3390 bl
.rebuild_page_aligned();
3391 if (onum
&& osize
) {
3392 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
3393 offset
= rand() % (osize
/ bsize
) * bsize
;
3395 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
3398 hobject_t
soid(sobject_t(oid
, 0));
3399 ObjectStore::Transaction t
;
3400 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
3401 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3402 if (!onum
|| !osize
) {
3403 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
3410 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3414 utime_t end
= ceph_clock_now();
3415 *elapsed
= end
- start
;
3418 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), nullptr);
3421 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3429 class TestOpsSocketHook
: public AdminSocketHook
{
3430 OSDService
*service
;
3433 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3434 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3437 std::ostream
& errss
,
3438 bufferlist
& out
) override
{
3442 test_ops(service
, store
, command
, cmdmap
, outss
);
3444 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3450 void test_ops(OSDService
*service
, ObjectStore
*store
,
3451 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3455 class OSD::C_Tick
: public Context
{
3458 explicit C_Tick(OSD
*o
) : osd(o
) {}
3459 void finish(int r
) override
{
3464 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3467 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3468 void finish(int r
) override
{
3469 osd
->tick_without_osd_lock();
// Mount or tear down a FuseStore view of the objectstore under
// <osd_data>/fuse, driven by the osd_objectstore_fuse option.
// stop=true forces teardown regardless of the option.
// NOTE(review): this extraction is elided — the #ifdef HAVE_LIBFUSE
// opening, fuse_store stop/delete statements, and the return paths are
// not visible here.
3473 int OSD::enable_disable_fuse(bool stop
)
// Mountpoint lives inside the osd data directory.
3477 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
// Disable path: fuse is currently mounted but should not be.
3478 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3479 dout(1) << __func__
<< " disabling" << dendl
;
// Remove the mountpoint directory; failure is logged (error handling of
// r is partially elided here).
3483 r
= ::rmdir(mntpath
.c_str());
3486 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3487 << cpp_strerror(r
) << dendl
;
// Enable path: option is set but fuse is not yet mounted.
3492 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3493 dout(1) << __func__
<< " enabling" << dendl
;
// An already-existing mountpoint directory is fine (EEXIST tolerated).
3494 r
= ::mkdir(mntpath
.c_str(), 0700);
3497 if (r
< 0 && r
!= -EEXIST
) {
3498 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3499 << cpp_strerror(r
) << dendl
;
// Construct and start the fuse bridge over the raw objectstore.
3502 fuse_store
= new FuseStore(store
.get(), mntpath
);
3503 r
= fuse_store
->start();
3505 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3511 #endif // HAVE_LIBFUSE
3515 size_t OSD::get_num_cache_shards()
3517 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3520 int OSD::get_num_op_shards()
3522 if (cct
->_conf
->osd_op_num_shards
)
3523 return cct
->_conf
->osd_op_num_shards
;
3524 if (store_is_rotational
)
3525 return cct
->_conf
->osd_op_num_shards_hdd
;
3527 return cct
->_conf
->osd_op_num_shards_ssd
;
3530 int OSD::get_num_op_threads()
3532 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3533 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3534 if (store_is_rotational
)
3535 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3537 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3540 float OSD::get_osd_recovery_sleep()
3542 if (cct
->_conf
->osd_recovery_sleep
)
3543 return cct
->_conf
->osd_recovery_sleep
;
3544 if (!store_is_rotational
&& !journal_is_rotational
)
3545 return cct
->_conf
->osd_recovery_sleep_ssd
;
3546 else if (store_is_rotational
&& !journal_is_rotational
)
3547 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3549 return cct
->_conf
->osd_recovery_sleep_hdd
;
3552 float OSD::get_osd_delete_sleep()
3554 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3555 if (osd_delete_sleep
> 0)
3556 return osd_delete_sleep
;
3557 if (!store_is_rotational
&& !journal_is_rotational
)
3558 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3559 if (store_is_rotational
&& !journal_is_rotational
)
3560 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3561 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3564 int OSD::get_recovery_max_active()
3566 if (cct
->_conf
->osd_recovery_max_active
)
3567 return cct
->_conf
->osd_recovery_max_active
;
3568 if (store_is_rotational
)
3569 return cct
->_conf
->osd_recovery_max_active_hdd
;
3571 return cct
->_conf
->osd_recovery_max_active_ssd
;
3574 float OSD::get_osd_snap_trim_sleep()
3576 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3577 if (osd_snap_trim_sleep
> 0)
3578 return osd_snap_trim_sleep
;
3579 if (!store_is_rotational
&& !journal_is_rotational
)
3580 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3581 if (store_is_rotational
&& !journal_is_rotational
)
3582 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3583 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3589 CompatSet initial
, diff
;
3590 std::lock_guard
lock(osd_lock
);
3593 tracing::osd::tracer
.init("osd");
3595 tick_timer_without_osd_lock
.init();
3596 service
.recovery_request_timer
.init();
3597 service
.sleep_timer
.init();
3599 boot_finisher
.start();
3603 store
->read_meta("require_osd_release", &val
);
3604 last_require_osd_release
= ceph_release_from_name(val
);
3608 dout(2) << "init " << dev_path
3609 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3611 dout(2) << "journal " << journal_path
<< dendl
;
3612 ceph_assert(store
); // call pre_init() first!
3614 store
->set_cache_shards(get_num_cache_shards());
3616 int rotating_auth_attempts
= 0;
3617 auto rotating_auth_timeout
=
3618 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3620 int r
= store
->mount();
3622 derr
<< "OSD:init: unable to mount object store" << dendl
;
3625 journal_is_rotational
= store
->is_journal_rotational();
3626 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3629 enable_disable_fuse(false);
3631 dout(2) << "boot" << dendl
;
3633 service
.meta_ch
= store
->open_collection(coll_t::meta());
3634 if (!service
.meta_ch
) {
3635 derr
<< "OSD:init: unable to open meta collection"
3640 // initialize the daily loadavg with current 15min loadavg
3642 if (getloadavg(loadavgs
, 3) == 3) {
3643 daily_loadavg
= loadavgs
[2];
3645 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3646 daily_loadavg
= 1.0;
3649 // sanity check long object name handling
3652 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3653 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3654 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3655 r
= store
->validate_hobject_key(l
);
3657 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3658 << "object name[space] len" << dendl
;
3659 derr
<< " osd max object name len = "
3660 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3661 derr
<< " osd max object namespace len = "
3662 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3663 derr
<< cpp_strerror(r
) << dendl
;
3664 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3667 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3670 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3675 r
= read_superblock();
3677 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3682 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3683 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3684 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3685 derr
<< " daemon features " << osd_compat
<< dendl
;
3687 if (osd_compat
.writeable(superblock
.compat_features
)) {
3688 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3689 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3694 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3695 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3701 assert_warn(whoami
== superblock
.whoami
);
3702 if (whoami
!= superblock
.whoami
) {
3703 derr
<< "OSD::init: superblock says osd"
3704 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3709 startup_time
= ceph::mono_clock::now();
3711 // load up "current" osdmap
3712 assert_warn(!get_osdmap());
3714 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3718 osdmap
= get_map(superblock
.current_epoch
);
3721 // make sure we don't have legacy pgs deleting
3724 int r
= store
->list_collections(ls
);
3725 ceph_assert(r
>= 0);
3728 if (c
.is_pg(&pgid
) &&
3729 !osdmap
->have_pg_pool(pgid
.pool())) {
3730 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3731 if (!store
->exists(service
.meta_ch
, oid
)) {
3732 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3733 << pgid
.pool() << " for pg " << pgid
3734 << "; please downgrade to luminous and allow "
3735 << "pg deletion to complete before upgrading" << dendl
;
3742 initial
= get_osd_initial_compat_set();
3743 diff
= superblock
.compat_features
.unsupported(initial
);
3744 if (superblock
.compat_features
.merge(initial
)) {
3745 // Are we adding SNAPMAPPER2?
3746 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3747 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3749 auto ch
= service
.meta_ch
;
3750 auto hoid
= make_snapmapper_oid();
3751 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3752 r
= SnapMapper::convert_legacy(cct
, store
.get(), ch
, hoid
, max
);
3756 // We need to persist the new compat_set before we
3758 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3760 if (!superblock
.cluster_osdmap_trim_lower_bound
) {
3761 superblock
.cluster_osdmap_trim_lower_bound
= superblock
.oldest_map
;
3764 ObjectStore::Transaction t
;
3765 write_superblock(t
);
3766 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3771 // make sure snap mapper object exists
3772 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3773 dout(10) << "init creating/touching snapmapper object" << dendl
;
3774 ObjectStore::Transaction t
;
3775 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3776 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3780 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3781 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3782 ObjectStore::Transaction t
;
3783 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3784 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3789 if (cct
->_conf
->osd_open_classes_on_start
) {
3790 int r
= ClassHandler::get_instance().open_all_classes();
3792 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3795 check_osdmap_features();
3798 epoch_t bind_epoch
= osdmap
->get_epoch();
3799 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3802 clear_temp_objects();
3804 // initialize osdmap references in sharded wq
3805 for (auto& shard
: shards
) {
3806 std::lock_guard
l(shard
->osdmap_lock
);
3807 shard
->shard_osdmap
= osdmap
;
3810 // load up pgs (as they previously existed)
3813 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3815 if (cct
->_conf
.get_val
<bool>("osd_compact_on_start")) {
3816 dout(2) << "compacting object store's omap" << dendl
;
3822 struct store_statfs_t stbuf
;
3823 osd_alert_list_t alerts
;
3824 int r
= store
->statfs(&stbuf
, &alerts
);
3825 ceph_assert(r
== 0);
3826 service
.set_statfs(stbuf
, alerts
);
3829 // client_messenger's auth_client will be set up by monc->init() later.
3830 for (auto m
: { cluster_messenger
,
3832 hb_front_client_messenger
,
3833 hb_back_client_messenger
,
3834 hb_front_server_messenger
,
3835 hb_back_server_messenger
} ) {
3836 m
->set_auth_client(monc
);
3838 for (auto m
: { client_messenger
,
3840 hb_front_server_messenger
,
3841 hb_back_server_messenger
}) {
3842 m
->set_auth_server(monc
);
3844 monc
->set_handle_authentication_dispatcher(this);
3846 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3847 | CEPH_ENTITY_TYPE_MGR
);
3852 mgrc
.set_pgstats_cb([this]() { return collect_pg_stats(); });
3853 mgrc
.set_perf_metric_query_cb(
3854 [this](const ConfigPayload
&config_payload
) {
3855 set_perf_queries(config_payload
);
3858 return get_perf_reports();
3862 // tell monc about log_client so it will know about mon session resets
3863 monc
->set_log_client(&log_client
);
3864 update_log_config();
3867 client_messenger
->add_dispatcher_tail(&mgrc
);
3868 client_messenger
->add_dispatcher_tail(this);
3869 cluster_messenger
->add_dispatcher_head(this);
3871 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3872 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3873 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3874 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3876 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3879 service
.publish_map(osdmap
);
3880 service
.publish_superblock(superblock
);
3882 for (auto& shard
: shards
) {
3883 // put PGs in a temporary set because we may modify pg_slots
3884 // unordered_map below.
3886 for (auto& i
: shard
->pg_slots
) {
3887 PGRef pg
= i
.second
->pg
;
3893 for (auto pg
: pgs
) {
3894 std::scoped_lock l
{*pg
};
3895 set
<pair
<spg_t
,epoch_t
>> new_children
;
3896 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3897 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3898 &new_children
, &merge_pgs
);
3899 if (!new_children
.empty()) {
3900 for (auto shard
: shards
) {
3901 shard
->prime_splits(osdmap
, &new_children
);
3903 assert(new_children
.empty());
3905 if (!merge_pgs
.empty()) {
3906 for (auto shard
: shards
) {
3907 shard
->prime_merges(osdmap
, &merge_pgs
);
3909 assert(merge_pgs
.empty());
3916 // start the heartbeat
3917 heartbeat_thread
.create("osd_srv_heartbt");
3920 tick_timer
.add_event_after(get_tick_interval(),
3923 std::lock_guard
l(tick_timer_lock
);
3924 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3925 new C_Tick_WithoutOSDLock(this));
3930 r
= monc
->authenticate();
3932 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3937 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3938 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3939 ++rotating_auth_attempts
;
3940 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3941 derr
<< __func__
<< " wait_auth_rotating timed out"
3942 <<" -- maybe I have a clock skew against the monitors?" << dendl
;
3947 r
= update_crush_device_class();
3949 derr
<< __func__
<< " unable to update_crush_device_class: "
3950 << cpp_strerror(r
) << dendl
;
3954 r
= update_crush_location();
3956 derr
<< __func__
<< " unable to update_crush_location: "
3957 << cpp_strerror(r
) << dendl
;
3965 // start objecter *after* we have authenticated, so that we don't ignore
3966 // the OSDMaps it requests.
3967 service
.final_init();
3971 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3974 dout(0) << "done with init, starting boot process" << dendl
;
3976 // subscribe to any pg creations
3977 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3979 // MgrClient needs this (it doesn't have MonClient reference itself)
3980 monc
->sub_want("mgrmap", 0, 0);
3982 // we don't need to ask for an osdmap here; objecter will
3983 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3989 // Override a few options if mclock scheduler is enabled.
3990 maybe_override_sleep_options_for_qos();
3991 maybe_override_cost_for_qos();
3992 maybe_override_options_for_qos();
3993 maybe_override_max_osd_capacity_for_qos();
3998 enable_disable_fuse(true);
4004 void OSD::final_init()
4006 AdminSocket
*admin_socket
= cct
->get_admin_socket();
4007 asok_hook
= new OSDSocketHook(this);
4008 int r
= admin_socket
->register_command("status", asok_hook
,
4009 "high-level status of OSD");
4010 ceph_assert(r
== 0);
4011 r
= admin_socket
->register_command("flush_journal",
4013 "flush the journal to permanent store");
4014 ceph_assert(r
== 0);
4015 r
= admin_socket
->register_command("dump_ops_in_flight " \
4016 "name=filterstr,type=CephString,n=N,req=false",
4018 "show the ops currently in flight");
4019 ceph_assert(r
== 0);
4020 r
= admin_socket
->register_command("ops " \
4021 "name=filterstr,type=CephString,n=N,req=false",
4023 "show the ops currently in flight");
4024 ceph_assert(r
== 0);
4025 r
= admin_socket
->register_command("dump_blocked_ops " \
4026 "name=filterstr,type=CephString,n=N,req=false",
4028 "show the blocked ops currently in flight");
4029 ceph_assert(r
== 0);
4030 r
= admin_socket
->register_command("dump_blocked_ops_count " \
4031 "name=filterstr,type=CephString,n=N,req=false",
4033 "show the count of blocked ops currently in flight");
4034 ceph_assert(r
== 0);
4035 r
= admin_socket
->register_command("dump_historic_ops " \
4036 "name=filterstr,type=CephString,n=N,req=false",
4039 ceph_assert(r
== 0);
4040 r
= admin_socket
->register_command("dump_historic_slow_ops " \
4041 "name=filterstr,type=CephString,n=N,req=false",
4043 "show slowest recent ops");
4044 ceph_assert(r
== 0);
4045 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
4046 "name=filterstr,type=CephString,n=N,req=false",
4048 "show slowest recent ops, sorted by duration");
4049 ceph_assert(r
== 0);
4050 r
= admin_socket
->register_command("dump_op_pq_state",
4052 "dump op queue state");
4053 ceph_assert(r
== 0);
4054 r
= admin_socket
->register_command("dump_blocklist",
4056 "dump blocklisted clients and times");
4057 ceph_assert(r
== 0);
4058 r
= admin_socket
->register_command("dump_watchers",
4060 "show clients which have active watches,"
4061 " and on which objects");
4062 ceph_assert(r
== 0);
4063 r
= admin_socket
->register_command("dump_recovery_reservations",
4065 "show recovery reservations");
4066 ceph_assert(r
== 0);
4067 r
= admin_socket
->register_command("dump_scrub_reservations",
4069 "show scrub reservations");
4070 ceph_assert(r
== 0);
4071 r
= admin_socket
->register_command("get_latest_osdmap",
4073 "force osd to update the latest map from "
4075 ceph_assert(r
== 0);
4077 r
= admin_socket
->register_command("set_heap_property " \
4078 "name=property,type=CephString " \
4079 "name=value,type=CephInt",
4081 "update malloc extension heap property");
4082 ceph_assert(r
== 0);
4084 r
= admin_socket
->register_command("get_heap_property " \
4085 "name=property,type=CephString",
4087 "get malloc extension heap property");
4088 ceph_assert(r
== 0);
4090 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
4092 "print statistics of kvdb which used by bluestore");
4093 ceph_assert(r
== 0);
4095 r
= admin_socket
->register_command("dump_scrubs",
4097 "print scheduled scrubs");
4098 ceph_assert(r
== 0);
4100 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
4102 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
4103 ceph_assert(r
== 0);
4105 r
= admin_socket
->register_command("flush_store_cache",
4107 "Flush bluestore internal cache");
4108 ceph_assert(r
== 0);
4109 r
= admin_socket
->register_command("rotate-stored-key",
4111 "Update the stored osd_key");
4112 ceph_assert(r
== 0);
4113 r
= admin_socket
->register_command("dump_pgstate_history",
4115 "show recent state history");
4116 ceph_assert(r
== 0);
4118 r
= admin_socket
->register_command("compact",
4120 "Commpact object store's omap."
4121 " WARNING: Compaction probably slows your requests");
4122 ceph_assert(r
== 0);
4124 r
= admin_socket
->register_command("get_mapped_pools",
4126 "dump pools whose PG(s) are mapped to this OSD.");
4128 ceph_assert(r
== 0);
4130 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
4132 "probe OSD devices for SMART data.");
4134 ceph_assert(r
== 0);
4136 r
= admin_socket
->register_command("list_devices",
4138 "list OSD devices.");
4139 r
= admin_socket
->register_command("send_beacon",
4141 "send OSD beacon to mon immediately");
4143 r
= admin_socket
->register_command(
4144 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
4145 "Dump osd heartbeat network ping times");
4146 ceph_assert(r
== 0);
4148 r
= admin_socket
->register_command(
4149 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook
,
4150 "Dump store's statistics for the given pool");
4151 ceph_assert(r
== 0);
4153 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
.get());
4154 // Note: pools are CephString instead of CephPoolname because
4155 // these commands traditionally support both pool names and numbers
4156 r
= admin_socket
->register_command(
4158 "name=pool,type=CephString " \
4159 "name=objname,type=CephObjectname " \
4160 "name=key,type=CephString "\
4161 "name=val,type=CephString",
4164 ceph_assert(r
== 0);
4165 r
= admin_socket
->register_command(
4167 "name=pool,type=CephString " \
4168 "name=objname,type=CephObjectname " \
4169 "name=key,type=CephString",
4172 ceph_assert(r
== 0);
4173 r
= admin_socket
->register_command(
4175 "name=pool,type=CephString " \
4176 "name=objname,type=CephObjectname " \
4177 "name=header,type=CephString",
4180 ceph_assert(r
== 0);
4182 r
= admin_socket
->register_command(
4184 "name=pool,type=CephString " \
4185 "name=objname,type=CephObjectname",
4187 "output entire object map");
4188 ceph_assert(r
== 0);
4190 r
= admin_socket
->register_command(
4192 "name=pool,type=CephString " \
4193 "name=objname,type=CephObjectname " \
4194 "name=len,type=CephInt",
4196 "truncate object to length");
4197 ceph_assert(r
== 0);
4199 r
= admin_socket
->register_command(
4201 "name=pool,type=CephString " \
4202 "name=objname,type=CephObjectname " \
4203 "name=shardid,type=CephInt,req=false,range=0|255",
4205 "inject data error to an object");
4206 ceph_assert(r
== 0);
4208 r
= admin_socket
->register_command(
4210 "name=pool,type=CephString " \
4211 "name=objname,type=CephObjectname " \
4212 "name=shardid,type=CephInt,req=false,range=0|255",
4214 "inject metadata error to an object");
4215 ceph_assert(r
== 0);
4216 r
= admin_socket
->register_command(
4217 "set_recovery_delay " \
4218 "name=utime,type=CephInt,req=false",
4220 "Delay osd recovery by specified seconds");
4221 ceph_assert(r
== 0);
4222 r
= admin_socket
->register_command(
4224 "name=type,type=CephString,req=false " \
4225 "name=count,type=CephInt,req=false ",
4227 "Inject a full disk (optional count times)");
4228 ceph_assert(r
== 0);
4229 r
= admin_socket
->register_command(
4231 "name=count,type=CephInt,req=false " \
4232 "name=size,type=CephInt,req=false " \
4233 "name=object_size,type=CephInt,req=false " \
4234 "name=object_num,type=CephInt,req=false ",
4236 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4237 "(default count=1G default size=4MB). Results in log.");
4238 ceph_assert(r
== 0);
4239 r
= admin_socket
->register_command(
4241 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4242 "name=message,type=CephString,n=N",
4244 "log a message to the cluster log");
4245 ceph_assert(r
== 0);
4246 r
= admin_socket
->register_command(
4250 ceph_assert(r
== 0);
4251 r
= admin_socket
->register_command(
4253 "name=heapcmd,type=CephChoices,strings=" \
4254 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4255 "name=value,type=CephString,req=false",
4257 "show heap usage info (available only if compiled with tcmalloc)");
4258 ceph_assert(r
== 0);
4259 r
= admin_socket
->register_command(
4260 "debug dump_missing " \
4261 "name=filename,type=CephFilepath",
4263 "dump missing objects to a named file");
4264 ceph_assert(r
== 0);
4265 r
= admin_socket
->register_command(
4266 "debug kick_recovery_wq " \
4267 "name=delay,type=CephInt,range=0",
4269 "set osd_recovery_delay_start to <val>");
4270 ceph_assert(r
== 0);
4271 r
= admin_socket
->register_command(
4273 "name=arg,type=CephChoices,strings=status|flush",
4275 "run cpu profiling on daemon");
4276 ceph_assert(r
== 0);
4277 r
= admin_socket
->register_command(
4278 "dump_pg_recovery_stats",
4280 "dump pg recovery statistics");
4281 ceph_assert(r
== 0);
4282 r
= admin_socket
->register_command(
4283 "reset_pg_recovery_stats",
4285 "reset pg recovery statistics");
4286 ceph_assert(r
== 0);
4287 r
= admin_socket
->register_command(
4290 "Drop all OSD caches");
4291 ceph_assert(r
== 0);
4292 r
= admin_socket
->register_command(
4295 "Get OSD caches statistics");
4296 ceph_assert(r
== 0);
4297 r
= admin_socket
->register_command(
4298 "scrub_purged_snaps",
4300 "Scrub purged_snaps vs snapmapper index");
4301 ceph_assert(r
== 0);
4302 r
= admin_socket
->register_command(
4304 "name=pgid,type=CephPgid " \
4305 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4306 "name=value,type=CephString,req=false",
4308 "debug the scrubber");
4309 ceph_assert(r
== 0);
4311 // -- pg commands --
4312 // old form: ceph pg <pgid> command ...
4313 r
= admin_socket
->register_command(
4315 "name=pgid,type=CephPgid " \
4316 "name=cmd,type=CephChoices,strings=query",
4319 ceph_assert(r
== 0);
4320 r
= admin_socket
->register_command(
4322 "name=pgid,type=CephPgid " \
4323 "name=cmd,type=CephChoices,strings=log",
4326 ceph_assert(r
== 0);
4327 r
= admin_socket
->register_command(
4329 "name=pgid,type=CephPgid " \
4330 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4331 "name=mulcmd,type=CephChoices,strings=revert|delete",
4334 ceph_assert(r
== 0);
4335 r
= admin_socket
->register_command(
4337 "name=pgid,type=CephPgid " \
4338 "name=cmd,type=CephChoices,strings=list_unfound " \
4339 "name=offset,type=CephString,req=false",
4342 ceph_assert(r
== 0);
4343 r
= admin_socket
->register_command(
4345 "name=pgid,type=CephPgid " \
4346 "name=cmd,type=CephChoices,strings=scrub " \
4347 "name=time,type=CephInt,req=false",
4350 ceph_assert(r
== 0);
4351 r
= admin_socket
->register_command(
4353 "name=pgid,type=CephPgid " \
4354 "name=cmd,type=CephChoices,strings=deep_scrub " \
4355 "name=time,type=CephInt,req=false",
4358 ceph_assert(r
== 0);
4359 // new form: tell <pgid> <cmd> for both cli and rest
4360 r
= admin_socket
->register_command(
4363 "show details of a specific pg");
4364 ceph_assert(r
== 0);
4365 r
= admin_socket
->register_command(
4368 "dump pg_log of a specific pg");
4369 ceph_assert(r
== 0);
4370 r
= admin_socket
->register_command(
4371 "mark_unfound_lost " \
4372 "name=pgid,type=CephPgid,req=false " \
4373 "name=mulcmd,type=CephChoices,strings=revert|delete",
4375 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4376 ceph_assert(r
== 0);
4377 r
= admin_socket
->register_command(
4379 "name=pgid,type=CephPgid,req=false " \
4380 "name=offset,type=CephString,req=false",
4382 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4383 ceph_assert(r
== 0);
4384 r
= admin_socket
->register_command(
4386 "name=pgid,type=CephPgid,req=false " \
4387 "name=time,type=CephInt,req=false",
4389 "Trigger a scheduled scrub ");
4390 ceph_assert(r
== 0);
4391 r
= admin_socket
->register_command(
4393 "name=pgid,type=CephPgid,req=false " \
4394 "name=time,type=CephInt,req=false",
4396 "Trigger a scheduled deep scrub ");
4397 ceph_assert(r
== 0);
// Build the OSD's main perf-counter set and register it with the
// context's perf-counter collection.
// NOTE(review): the trailing `return logger;` (original line 4404) is
// elided from this extraction.
4400 PerfCounters
* OSD::create_logger()
4402 PerfCounters
* logger
= build_osd_logger(cct
);
4403 cct
->get_perfcounters_collection()->add(logger
);
4407 PerfCounters
* OSD::create_recoverystate_perf()
4409 PerfCounters
* recoverystate_perf
= build_recoverystate_perf(cct
);
4410 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4411 return recoverystate_perf
;
4416 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4417 //cct->_conf->osd_fast_shutdown = true;
4419 dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
4420 << cct
->_conf
->osd_fast_shutdown
4421 << ", null-fm = " << store
->has_null_manager() << dendl
;
4423 utime_t start_time_func
= ceph_clock_now();
4425 if (cct
->_conf
->osd_fast_shutdown
) {
4426 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4427 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4428 service
.prepare_to_stop();
4430 // There is no state we need to keep wehn running in NULL-FM moode
4431 if (!store
->has_null_manager()) {
4435 } else if (!service
.prepare_to_stop()) {
4436 return 0; // already shutting down
4440 if (is_stopping()) {
4445 if (!cct
->_conf
->osd_fast_shutdown
) {
4446 dout(0) << "shutdown" << dendl
;
4449 // don't accept new task for this OSD
4450 set_state(STATE_STOPPING
);
4452 // Disabled debugging during fast-shutdown
4453 if (!cct
->_conf
->osd_fast_shutdown
&& cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4454 cct
->_conf
.set_val("debug_osd", "100");
4455 cct
->_conf
.set_val("debug_journal", "100");
4456 cct
->_conf
.set_val("debug_filestore", "100");
4457 cct
->_conf
.set_val("debug_bluestore", "100");
4458 cct
->_conf
.set_val("debug_ms", "100");
4459 cct
->_conf
.apply_changes(nullptr);
4462 // stop MgrClient earlier as it's more like an internal consumer of OSD
4464 // should occur before unmounting the database in fast-shutdown to avoid
4465 // a race condition (see https://tracker.ceph.com/issues/56101)
4468 if (cct
->_conf
->osd_fast_shutdown
) {
4469 // first, stop new task from being taken from op_shardedwq
4470 // and clear all pending tasks
4471 op_shardedwq
.stop_for_fast_shutdown();
4473 utime_t start_time_timer
= ceph_clock_now();
4474 tick_timer
.shutdown();
4476 std::lock_guard
l(tick_timer_lock
);
4477 tick_timer_without_osd_lock
.shutdown();
4481 utime_t start_time_osd_drain
= ceph_clock_now();
4483 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4487 utime_t start_time_umount
= ceph_clock_now();
4488 store
->prepare_for_fast_shutdown();
4489 std::lock_guard
lock(osd_lock
);
4490 // TBD: assert in allocator that nothing is being add
4493 utime_t end_time
= ceph_clock_now();
4494 if (cct
->_conf
->osd_fast_shutdown_timeout
) {
4495 ceph_assert(end_time
- start_time_func
< cct
->_conf
->osd_fast_shutdown_timeout
);
4497 dout(0) <<"Fast Shutdown duration total :" << end_time
- start_time_func
<< " seconds" << dendl
;
4498 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount
- start_time_osd_drain
<< " seconds" << dendl
;
4499 dout(0) <<"Fast Shutdown duration umount :" << end_time
- start_time_umount
<< " seconds" << dendl
;
4500 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain
- start_time_timer
<< " seconds" << dendl
;
4503 // now it is safe to exit
4507 service
.start_shutdown();
4509 // stop sending work to pgs. this just prevents any new work in _process
4510 // from racing with on_shutdown and potentially entering the pg after.
4511 op_shardedwq
.drain();
4517 for (auto pg
: pgs
) {
4522 // drain op queue again (in case PGs requeued something)
4523 op_shardedwq
.drain();
4525 // unregister commands
4526 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4530 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4531 delete test_ops_hook
;
4532 test_ops_hook
= NULL
;
4537 std::lock_guard l
{heartbeat_lock
};
4538 heartbeat_stop
= true;
4539 heartbeat_cond
.notify_all();
4540 heartbeat_peers
.clear();
4542 heartbeat_thread
.join();
4544 hb_back_server_messenger
->mark_down_all();
4545 hb_front_server_messenger
->mark_down_all();
4546 hb_front_client_messenger
->mark_down_all();
4547 hb_back_client_messenger
->mark_down_all();
4551 dout(10) << "op sharded tp stopped" << dendl
;
4553 dout(10) << "stopping agent" << dendl
;
4554 service
.agent_stop();
4556 boot_finisher
.wait_for_empty();
4560 boot_finisher
.stop();
4561 reset_heartbeat_peers(true);
4563 tick_timer
.shutdown();
4566 std::lock_guard
l(tick_timer_lock
);
4567 tick_timer_without_osd_lock
.shutdown();
4570 // note unmount epoch
4571 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4572 superblock
.mounted
= service
.get_boot_epoch();
4573 superblock
.clean_thru
= get_osdmap_epoch();
4574 ObjectStore::Transaction t
;
4575 write_superblock(t
);
4576 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4578 derr
<< "OSD::shutdown: error writing superblock: "
4579 << cpp_strerror(r
) << dendl
;
4583 service
.shutdown_reserver();
4586 #ifdef PG_DEBUG_REFS
4587 service
.dump_live_pgids();
4591 _get_pgs(&pgs
, true);
4595 for (auto& pg
: pgs
) {
4596 if (pg
->is_deleted()) {
4599 dout(20) << " kicking pg " << pg
<< dendl
;
4601 if (pg
->get_num_ref() != 1) {
4602 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4603 << pg
->get_num_ref() << dendl
;
4604 #ifdef PG_DEBUG_REFS
4605 pg
->dump_live_ids();
4607 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4615 #ifdef PG_DEBUG_REFS
4616 service
.dump_live_pgids();
4620 cct
->_conf
.remove_observer(this);
4623 service
.meta_ch
.reset();
4625 dout(10) << "syncing store" << dendl
;
4626 enable_disable_fuse(true);
4628 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4629 dout(10) << "flushing journal" << dendl
;
4630 store
->flush_journal();
4636 std::unique_lock l
{map_lock
};
4637 set_osdmap(OSDMapRef());
4639 for (auto s
: shards
) {
4640 std::lock_guard
l(s
->osdmap_lock
);
4641 s
->shard_osdmap
= OSDMapRef();
4645 std::lock_guard
lock(osd_lock
);
4648 dout(10) << "Store synced" << dendl
;
4650 op_tracker
.on_shutdown();
4652 ClassHandler::get_instance().shutdown();
4653 client_messenger
->shutdown();
4654 cluster_messenger
->shutdown();
4655 hb_front_client_messenger
->shutdown();
4656 hb_back_client_messenger
->shutdown();
4657 objecter_messenger
->shutdown();
4658 hb_front_server_messenger
->shutdown();
4659 hb_back_server_messenger
->shutdown();
4661 utime_t duration
= ceph_clock_now() - start_time_func
;
4662 dout(0) <<"Slow Shutdown duration:" << duration
<< " seconds" << dendl
;
4668 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4670 bool created
= false;
4672 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4673 vector
<string
> vcmd
{cmd
};
4677 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4680 if (r
== -ENOENT
&& !created
) {
4681 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4682 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4683 vector
<string
> vnewcmd
{newcmd
};
4687 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4690 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4691 << cpp_strerror(r
) << dendl
;
4697 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4706 int OSD::update_crush_location()
4708 if (!cct
->_conf
->osd_crush_update_on_start
) {
4709 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4714 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4715 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4717 struct store_statfs_t st
;
4718 osd_alert_list_t alerts
;
4719 int r
= store
->statfs(&st
, &alerts
);
4721 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4724 snprintf(weight
, sizeof(weight
), "%.4lf",
4727 double(1ull << 40 /* TB */)));
4730 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4733 string("{\"prefix\": \"osd crush create-or-move\", ") +
4734 string("\"id\": ") + stringify(whoami
) + ", " +
4735 string("\"weight\":") + weight
+ ", " +
4736 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4737 return mon_cmd_maybe_osd_create(cmd
);
4740 int OSD::update_crush_device_class()
4742 if (!cct
->_conf
->osd_class_update_on_start
) {
4743 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4747 string device_class
;
4748 int r
= store
->read_meta("crush_device_class", &device_class
);
4749 if (r
< 0 || device_class
.empty()) {
4750 device_class
= store
->get_default_device_class();
4753 if (device_class
.empty()) {
4754 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4759 string("{\"prefix\": \"osd crush set-device-class\", ") +
4760 string("\"class\": \"") + device_class
+ string("\", ") +
4761 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4763 r
= mon_cmd_maybe_osd_create(cmd
);
4765 // good, already bound to a device-class
4772 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4774 dout(10) << "write_superblock " << superblock
<< dendl
;
4776 //hack: at minimum it's using the baseline feature set
4777 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4778 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4781 encode(superblock
, bl
);
4782 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4785 int OSD::read_superblock()
4788 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4792 auto p
= bl
.cbegin();
4793 decode(superblock
, p
);
4795 dout(10) << "read_superblock " << superblock
<< dendl
;
4800 void OSD::clear_temp_objects()
4802 dout(10) << __func__
<< dendl
;
4804 store
->list_collections(ls
);
4805 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4807 if (!p
->is_pg(&pgid
))
4810 // list temp objects
4811 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4813 vector
<ghobject_t
> temps
;
4816 vector
<ghobject_t
> objects
;
4817 auto ch
= store
->open_collection(*p
);
4819 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4820 store
->get_ideal_list_max(),
4822 if (objects
.empty())
4824 vector
<ghobject_t
>::iterator q
;
4825 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4826 // Hammer set pool for temps to -1, so check for clean-up
4827 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4828 temps
.push_back(*q
);
4833 // If we saw a non-temp object and hit the break above we can
4834 // break out of the while loop too.
4835 if (q
!= objects
.end())
4838 if (!temps
.empty()) {
4839 ObjectStore::Transaction t
;
4841 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4842 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4844 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4845 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4846 t
= ObjectStore::Transaction();
4851 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4857 void OSD::recursive_remove_collection(CephContext
* cct
,
4858 ObjectStore
*store
, spg_t pgid
,
4864 make_snapmapper_oid());
4866 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4867 ObjectStore::Transaction t
;
4868 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4871 int max
= cct
->_conf
->osd_target_transaction_size
;
4872 vector
<ghobject_t
> objects
;
4873 objects
.reserve(max
);
4876 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4877 max
, &objects
, &next
);
4878 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4879 if (objects
.empty())
4881 for (auto& p
: objects
) {
4882 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4883 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4884 if (r
!= 0 && r
!= -ENOENT
)
4888 int r
= store
->queue_transaction(ch
, std::move(t
));
4889 ceph_assert(r
== 0);
4890 t
= ObjectStore::Transaction();
4892 t
.remove_collection(tmp
);
4893 int r
= store
->queue_transaction(ch
, std::move(t
));
4894 ceph_assert(r
== 0);
4897 if (!ch
->flush_commit(&waiter
)) {
4903 // ======================================================
4907 OSDMapRef createmap
,
4910 dout(10) << __func__
<< " " << pgid
<< dendl
;
4912 map
<string
,string
> ec_profile
;
4914 if (createmap
->have_pg_pool(pgid
.pool())) {
4915 pi
= *createmap
->get_pg_pool(pgid
.pool());
4916 name
= createmap
->get_pool_name(pgid
.pool());
4917 if (pi
.is_erasure()) {
4918 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4921 // pool was deleted; grab final pg_pool_t off disk.
4922 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4924 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4926 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4930 ceph_assert(r
>= 0);
4931 auto p
= bl
.cbegin();
4934 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4935 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4936 << " tombstone" << dendl
;
4939 decode(ec_profile
, p
);
4941 PGPool
pool(createmap
, pgid
.pool(), pi
, name
);
4943 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4944 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4945 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4951 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4954 v
->reserve(get_num_pgs());
4955 for (auto& s
: shards
) {
4956 std::lock_guard
l(s
->shard_lock
);
4957 for (auto& j
: s
->pg_slots
) {
4959 !j
.second
->pg
->is_deleted()) {
4960 v
->push_back(j
.second
->pg
);
4962 s
->_detach_pg(j
.second
.get());
4969 void OSD::_get_pgids(vector
<spg_t
> *v
)
4972 v
->reserve(get_num_pgs());
4973 for (auto& s
: shards
) {
4974 std::lock_guard
l(s
->shard_lock
);
4975 for (auto& j
: s
->pg_slots
) {
4977 !j
.second
->pg
->is_deleted()) {
4978 v
->push_back(j
.first
);
4984 void OSD::register_pg(PGRef pg
)
4986 spg_t pgid
= pg
->get_pgid();
4987 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4988 auto sdata
= shards
[shard_index
];
4989 std::lock_guard
l(sdata
->shard_lock
);
4990 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4991 ceph_assert(r
.second
);
4992 auto *slot
= r
.first
->second
.get();
4993 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4994 sdata
->_attach_pg(slot
, pg
.get());
4997 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4999 auto sdata
= pg
->osd_shard
;
5002 std::lock_guard
l(sdata
->shard_lock
);
5003 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
5004 if (p
== sdata
->pg_slots
.end() ||
5006 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
5009 if (p
->second
->waiting_for_merge_epoch
) {
5010 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
5013 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
5014 sdata
->_detach_pg(p
->second
.get());
5017 for (auto shard
: shards
) {
5018 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
5021 // update pg count now since we might not get an osdmap any time soon.
5022 if (pg
->is_primary())
5023 service
.logger
->dec(l_osd_pg_primary
);
5024 else if (pg
->is_nonprimary())
5025 service
.logger
->dec(l_osd_pg_replica
); // misnomver
5027 service
.logger
->dec(l_osd_pg_stray
);
5032 PGRef
OSD::_lookup_pg(spg_t pgid
)
5034 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
5035 auto sdata
= shards
[shard_index
];
5036 std::lock_guard
l(sdata
->shard_lock
);
5037 auto p
= sdata
->pg_slots
.find(pgid
);
5038 if (p
== sdata
->pg_slots
.end()) {
5041 return p
->second
->pg
;
5044 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
5046 PGRef pg
= _lookup_pg(pgid
);
5051 if (!pg
->is_deleted()) {
5058 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
5060 return _lookup_lock_pg(pgid
);
5063 void OSD::load_pgs()
5065 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5066 dout(0) << "load_pgs" << dendl
;
5069 auto pghist
= make_pg_num_history_oid();
5071 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
5072 if (r
>= 0 && bl
.length() > 0) {
5073 auto p
= bl
.cbegin();
5074 decode(pg_num_history
, p
);
5076 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
5080 int r
= store
->list_collections(ls
);
5082 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
5086 for (vector
<coll_t
>::iterator it
= ls
.begin();
5090 if (it
->is_temp(&pgid
) ||
5091 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
.get(), pgid
))) {
5092 dout(10) << "load_pgs " << *it
5093 << " removing, legacy or flagged for removal pg" << dendl
;
5094 recursive_remove_collection(cct
, store
.get(), pgid
, *it
);
5098 if (!it
->is_pg(&pgid
)) {
5099 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
5103 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
5104 epoch_t map_epoch
= 0;
5105 int r
= PG::peek_map_epoch(store
.get(), pgid
, &map_epoch
);
5107 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
5113 if (map_epoch
> 0) {
5114 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
5116 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
5117 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
5118 << " on pg " << pgid
<< ", but the pool is not present in the "
5119 << "current map, so this is probably a result of bug 10617. "
5120 << "Skipping the pg for now, you can use ceph-objectstore-tool "
5121 << "to clean it up later." << dendl
;
5124 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
5125 << map_epoch
<< ", but missing map. Crashing."
5127 ceph_abort_msg("Missing map in load_pgs");
5130 pg
= _make_pg(pgosdmap
, pgid
);
5132 pg
= _make_pg(get_osdmap(), pgid
);
5135 recursive_remove_collection(cct
, store
.get(), pgid
, *it
);
5139 // there can be no waiters here, so we don't call _wake_pg_slot
5142 pg
->ch
= store
->open_collection(pg
->coll
);
5144 // read pg state, log
5145 pg
->read_state(store
.get());
5148 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
5151 recursive_remove_collection(cct
, store
.get(), pgid
, *it
);
5155 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
5156 assert(NULL
!= shards
[shard_index
]);
5157 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
5160 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
5166 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
5170 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
5171 const PGCreateInfo
*info
)
5173 spg_t pgid
= info
->pgid
;
5175 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
5176 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
5180 OSDMapRef startmap
= get_map(info
->epoch
);
5183 int64_t pool_id
= pgid
.pgid
.pool();
5184 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
5186 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
5189 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
5190 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
5191 // this ensures we do not process old creating messages after the
5192 // pool's initial pgs have been created (and pg are subsequently
5193 // allowed to split or merge).
5194 dout(20) << __func__
<< " dropping " << pgid
5195 << "create, pool does not have CREATING flag set" << dendl
;
5200 int up_primary
, acting_primary
;
5201 vector
<int> up
, acting
;
5202 startmap
->pg_to_up_acting_osds(
5203 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
5205 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
5206 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
5207 store
->get_type() != "bluestore") {
5208 clog
->warn() << "pg " << pgid
5209 << " is at risk of silent data corruption: "
5210 << "the pool allows ec overwrites but is not stored in "
5211 << "bluestore, so deep scrubbing will not detect bitrot";
5214 create_pg_collection(
5215 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
5216 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
5218 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
5220 PGRef pg
= _make_pg(startmap
, pgid
);
5221 pg
->ch
= store
->create_new_collection(pg
->coll
);
5224 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
5225 assert(NULL
!= shards
[shard_index
]);
5226 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
5231 // we are holding the shard lock
5232 ceph_assert(!pg
->is_deleted());
5241 info
->past_intervals
,
5244 pg
->init_collection_pool_opts();
5246 if (pg
->is_primary()) {
5247 std::lock_guard locker
{m_perf_queries_lock
};
5248 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
5251 pg
->handle_initialize(rctx
);
5252 pg
->handle_activate_map(rctx
);
5254 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
5256 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
5260 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
5264 const auto max_pgs_per_osd
=
5265 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
5266 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
5268 if (num_pgs
< max_pgs_per_osd
) {
5272 std::lock_guard
l(pending_creates_lock
);
5273 if (is_mon_create
) {
5274 pending_creates_from_mon
++;
5276 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
5277 pending_creates_from_osd
.emplace(pgid
, is_primary
);
5279 dout(1) << __func__
<< " withhold creation of pg " << pgid
5280 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
5284 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
5285 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
5286 // to up set if pg_temp is empty. so an empty pg_temp won't work.
5287 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
5288 if (acting
.size() > 1) {
5291 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
5292 twiddled
.push_back(-1);
5297 void OSD::resume_creating_pg()
5299 bool do_sub_pg_creates
= false;
5300 bool have_pending_creates
= false;
5302 const auto max_pgs_per_osd
=
5303 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
5304 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
5305 if (max_pgs_per_osd
<= num_pgs
) {
5306 // this could happen if admin decreases this setting before a PG is removed
5309 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
5310 std::lock_guard
l(pending_creates_lock
);
5311 if (pending_creates_from_mon
> 0) {
5312 dout(20) << __func__
<< " pending_creates_from_mon "
5313 << pending_creates_from_mon
<< dendl
;
5314 do_sub_pg_creates
= true;
5315 if (pending_creates_from_mon
>= spare_pgs
) {
5316 spare_pgs
= pending_creates_from_mon
= 0;
5318 spare_pgs
-= pending_creates_from_mon
;
5319 pending_creates_from_mon
= 0;
5322 auto pg
= pending_creates_from_osd
.cbegin();
5323 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
5324 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
5326 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
5327 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
5328 pg
= pending_creates_from_osd
.erase(pg
);
5329 do_sub_pg_creates
= true;
5332 have_pending_creates
= (pending_creates_from_mon
> 0 ||
5333 !pending_creates_from_osd
.empty());
5336 bool do_renew_subs
= false;
5337 if (do_sub_pg_creates
) {
5338 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
5339 dout(4) << __func__
<< ": resolicit pg creates from mon since "
5340 << last_pg_create_epoch
<< dendl
;
5341 do_renew_subs
= true;
5344 version_t start
= get_osdmap_epoch() + 1;
5345 if (have_pending_creates
) {
5346 // don't miss any new osdmap deleting PGs
5347 if (monc
->sub_want("osdmap", start
, 0)) {
5348 dout(4) << __func__
<< ": resolicit osdmap from mon since "
5350 do_renew_subs
= true;
5352 } else if (do_sub_pg_creates
) {
5353 // no need to subscribe the osdmap continuously anymore
5354 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5355 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
5356 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
5358 do_renew_subs
= true;
5362 if (do_renew_subs
) {
5366 service
.send_pg_temp();
5369 void OSD::_add_heartbeat_peer(int p
)
5375 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5376 if (i
== heartbeat_peers
.end()) {
5377 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5380 assert(cons
.second
);
5382 hi
= &heartbeat_peers
[p
];
5385 auto stamps
= service
.get_hb_stamps(p
);
5387 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5389 sb
->stamps
= stamps
;
5390 hi
->hb_interval_start
= ceph_clock_now();
5391 hi
->con_back
= cons
.first
.get();
5392 hi
->con_back
->set_priv(sb
);
5394 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5396 sf
->stamps
= stamps
;
5397 hi
->con_front
= cons
.second
.get();
5398 hi
->con_front
->set_priv(sf
);
5400 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5401 << " " << hi
->con_back
->get_peer_addr()
5402 << " " << hi
->con_front
->get_peer_addr()
5407 hi
->epoch
= get_osdmap_epoch();
5410 void OSD::_remove_heartbeat_peer(int n
)
5412 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5413 ceph_assert(q
!= heartbeat_peers
.end());
5414 dout(20) << " removing heartbeat peer osd." << n
5415 << " " << q
->second
.con_back
->get_peer_addr()
5416 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5418 q
->second
.clear_mark_down();
5419 heartbeat_peers
.erase(q
);
5422 void OSD::need_heartbeat_peer_update()
5426 dout(20) << "need_heartbeat_peer_update" << dendl
;
5427 heartbeat_set_peers_need_update();
5430 void OSD::maybe_update_heartbeat_peers()
5432 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5434 if (is_waiting_for_healthy() || is_active()) {
5435 utime_t now
= ceph_clock_now();
5436 if (last_heartbeat_resample
== utime_t()) {
5437 last_heartbeat_resample
= now
;
5438 heartbeat_set_peers_need_update();
5439 } else if (!heartbeat_peers_need_update()) {
5440 utime_t dur
= now
- last_heartbeat_resample
;
5441 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5442 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5443 heartbeat_set_peers_need_update();
5444 last_heartbeat_resample
= now
;
5445 // automatically clean up any stale heartbeat peers
5446 // if we are unhealthy, then clean all
5447 reset_heartbeat_peers(is_waiting_for_healthy());
5452 if (!heartbeat_peers_need_update())
5454 heartbeat_clear_peers_need_update();
5456 std::lock_guard
l(heartbeat_lock
);
5458 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5461 // build heartbeat from set
5465 for (auto& pg
: pgs
) {
5466 pg
->with_heartbeat_peers([&](int peer
) {
5467 if (get_osdmap()->is_up(peer
)) {
5468 _add_heartbeat_peer(peer
);
5474 // include next and previous up osds to ensure we have a fully-connected set
5475 set
<int> want
, extras
;
5476 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5479 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5480 if (prev
>= 0 && prev
!= next
)
5483 // make sure we have at least **min_down** osds coming from different
5484 // subtree level (e.g., hosts) for fast failure detection.
5485 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5486 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5487 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5488 get_osdmap()->get_random_up_osds_by_subtree(
5489 whoami
, subtree
, limit
, want
, &want
);
5491 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5492 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5494 _add_heartbeat_peer(*p
);
5497 // remove down peers; enumerate extras
5498 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5499 while (p
!= heartbeat_peers
.end()) {
5500 if (!get_osdmap()->is_up(p
->first
)) {
5503 _remove_heartbeat_peer(o
);
5506 if (p
->second
.epoch
< get_osdmap_epoch()) {
5507 extras
.insert(p
->first
);
5513 for (int n
= next
; n
>= 0; ) {
5514 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5516 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5517 dout(10) << " adding random peer osd." << n
<< dendl
;
5519 _add_heartbeat_peer(n
);
5521 n
= get_osdmap()->get_next_up_osd_after(n
);
5523 break; // came full circle; stop
5527 for (set
<int>::iterator p
= extras
.begin();
5528 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5532 _remove_heartbeat_peer(*p
);
5535 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5537 // clean up stale failure pending
5538 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5539 if (heartbeat_peers
.count(it
->first
) == 0) {
5540 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5541 failure_pending
.erase(it
++);
5548 void OSD::reset_heartbeat_peers(bool all
)
5550 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5551 dout(10) << "reset_heartbeat_peers" << dendl
;
5552 utime_t stale
= ceph_clock_now();
5553 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5554 std::lock_guard
l(heartbeat_lock
);
5555 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5556 auto& [peer
, hi
] = *it
;
5557 if (all
|| hi
.is_stale(stale
)) {
5558 hi
.clear_mark_down();
5559 // stop sending failure_report to mon too
5560 failure_queue
.erase(peer
);
5561 failure_pending
.erase(peer
);
5562 it
= heartbeat_peers
.erase(it
);
5569 void OSD::handle_osd_ping(MOSDPing
*m
)
5571 if (superblock
.cluster_fsid
!= m
->fsid
) {
5572 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5573 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5579 int from
= m
->get_source().num();
5581 heartbeat_lock
.lock();
5582 if (is_stopping()) {
5583 heartbeat_lock
.unlock();
5588 utime_t now
= ceph_clock_now();
5589 auto mnow
= service
.get_mnow();
5590 ConnectionRef
con(m
->get_connection());
5591 OSDMapRef curmap
= service
.get_osdmap();
5593 heartbeat_lock
.unlock();
5598 auto sref
= con
->get_priv();
5599 Session
*s
= static_cast<Session
*>(sref
.get());
5601 heartbeat_lock
.unlock();
5607 s
->stamps
= service
.get_hb_stamps(from
);
5612 case MOSDPing::PING
:
5614 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5615 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5616 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5617 if (heartbeat_drop
->second
== 0) {
5618 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5620 --heartbeat_drop
->second
;
5621 dout(5) << "Dropping heartbeat from " << from
5622 << ", " << heartbeat_drop
->second
5623 << " remaining to drop" << dendl
;
5626 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5627 ((((double)(rand()%100))/100.0))) {
5629 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5630 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5631 dout(5) << "Dropping heartbeat from " << from
5632 << ", " << heartbeat_drop
->second
5633 << " remaining to drop" << dendl
;
5638 ceph::signedspan sender_delta_ub
{};
5639 s
->stamps
->got_ping(
5645 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5647 if (!cct
->get_heartbeat_map()->is_healthy()) {
5648 dout(10) << "internal heartbeat not healthy, dropping ping request"
5653 Message
*r
= new MOSDPing(monc
->get_fsid(),
5654 curmap
->get_epoch(),
5655 MOSDPing::PING_REPLY
,
5659 service
.get_up_epoch(),
5660 cct
->_conf
->osd_heartbeat_min_size
,
5662 con
->send_message(r
);
5664 if (curmap
->is_up(from
)) {
5666 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5667 from
, curmap
->get_epoch());
5669 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5672 } else if (!curmap
->exists(from
) ||
5673 curmap
->get_down_at(from
) > m
->map_epoch
) {
5674 // tell them they have died
5675 Message
*r
= new MOSDPing(monc
->get_fsid(),
5676 curmap
->get_epoch(),
5681 service
.get_up_epoch(),
5682 cct
->_conf
->osd_heartbeat_min_size
);
5683 con
->send_message(r
);
5688 case MOSDPing::PING_REPLY
:
5690 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5691 if (i
!= heartbeat_peers
.end()) {
5692 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5693 if (acked
!= i
->second
.ping_history
.end()) {
5694 int &unacknowledged
= acked
->second
.second
;
5695 if (con
== i
->second
.con_back
) {
5696 dout(25) << "handle_osd_ping got reply from osd." << from
5697 << " first_tx " << i
->second
.first_tx
5698 << " last_tx " << i
->second
.last_tx
5699 << " last_rx_back " << i
->second
.last_rx_back
5701 << " last_rx_front " << i
->second
.last_rx_front
5703 i
->second
.last_rx_back
= now
;
5704 ceph_assert(unacknowledged
> 0);
5706 // if there is no front con, set both stamps.
5707 if (i
->second
.con_front
== NULL
) {
5708 i
->second
.last_rx_front
= now
;
5709 ceph_assert(unacknowledged
> 0);
5712 } else if (con
== i
->second
.con_front
) {
5713 dout(25) << "handle_osd_ping got reply from osd." << from
5714 << " first_tx " << i
->second
.first_tx
5715 << " last_tx " << i
->second
.last_tx
5716 << " last_rx_back " << i
->second
.last_rx_back
5717 << " last_rx_front " << i
->second
.last_rx_front
5720 i
->second
.last_rx_front
= now
;
5721 ceph_assert(unacknowledged
> 0);
5725 if (unacknowledged
== 0) {
5726 // succeeded in getting all replies
5727 dout(25) << "handle_osd_ping got all replies from osd." << from
5728 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5729 << " and older pending ping(s)"
5732 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5733 ++i
->second
.hb_average_count
;
5734 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5735 i
->second
.hb_total_back
+= back_pingtime
;
5736 if (back_pingtime
< i
->second
.hb_min_back
)
5737 i
->second
.hb_min_back
= back_pingtime
;
5738 if (back_pingtime
> i
->second
.hb_max_back
)
5739 i
->second
.hb_max_back
= back_pingtime
;
5740 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5741 i
->second
.hb_total_front
+= front_pingtime
;
5742 if (front_pingtime
< i
->second
.hb_min_front
)
5743 i
->second
.hb_min_front
= front_pingtime
;
5744 if (front_pingtime
> i
->second
.hb_max_front
)
5745 i
->second
.hb_max_front
= front_pingtime
;
5747 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5748 if (i
->second
.hb_interval_start
== utime_t())
5749 i
->second
.hb_interval_start
= now
;
5750 int64_t hb_avg_time_period
= 60;
5751 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5752 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5754 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5755 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5756 uint32_t back_min
= i
->second
.hb_min_back
;
5757 uint32_t back_max
= i
->second
.hb_max_back
;
5758 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5759 uint32_t front_min
= i
->second
.hb_min_front
;
5760 uint32_t front_max
= i
->second
.hb_max_front
;
5762 // Reset for new interval
5763 i
->second
.hb_average_count
= 0;
5764 i
->second
.hb_interval_start
= now
;
5765 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5766 i
->second
.hb_min_back
= UINT_MAX
;
5767 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5768 i
->second
.hb_min_front
= UINT_MAX
;
5770 // Record per osd interace ping times
5771 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5772 if (i
->second
.hb_back_pingtime
.size() == 0) {
5773 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5774 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5775 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5776 i
->second
.hb_back_min
.push_back(back_min
);
5777 i
->second
.hb_back_max
.push_back(back_max
);
5778 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5779 i
->second
.hb_front_min
.push_back(front_min
);
5780 i
->second
.hb_front_max
.push_back(front_max
);
5781 ++i
->second
.hb_index
;
5784 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5785 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5786 i
->second
.hb_back_min
[index
] = back_min
;
5787 i
->second
.hb_back_max
[index
] = back_max
;
5788 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5789 i
->second
.hb_front_min
[index
] = front_min
;
5790 i
->second
.hb_front_max
[index
] = front_max
;
5791 ++i
->second
.hb_index
;
5795 std::lock_guard
l(service
.stat_lock
);
5796 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5797 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5800 uint32_t min
= UINT_MAX
;
5804 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5805 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5807 int index
= (i
->second
.hb_index
+ k
) % size
;
5808 total
+= i
->second
.hb_back_pingtime
[index
];
5809 if (i
->second
.hb_back_min
[index
] < min
)
5810 min
= i
->second
.hb_back_min
[index
];
5811 if (i
->second
.hb_back_max
[index
] > max
)
5812 max
= i
->second
.hb_back_max
[index
];
5813 if (count
== 1 || count
== 5 || count
== 15) {
5814 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5815 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5816 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5823 if (i
->second
.con_front
!= NULL
) {
5824 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5831 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5833 int index
= (i
->second
.hb_index
+ k
) % size
;
5834 total
+= i
->second
.hb_front_pingtime
[index
];
5835 if (i
->second
.hb_front_min
[index
] < min
)
5836 min
= i
->second
.hb_front_min
[index
];
5837 if (i
->second
.hb_front_max
[index
] > max
)
5838 max
= i
->second
.hb_front_max
[index
];
5839 if (count
== 1 || count
== 5 || count
== 15) {
5840 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5841 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5842 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5851 std::lock_guard
l(service
.stat_lock
);
5852 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5853 if (i
->second
.con_front
!= NULL
)
5854 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5856 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5859 if (i
->second
.is_healthy(now
)) {
5860 // Cancel false reports
5861 auto failure_queue_entry
= failure_queue
.find(from
);
5862 if (failure_queue_entry
!= failure_queue
.end()) {
5863 dout(10) << "handle_osd_ping canceling queued "
5864 << "failure report for osd." << from
<< dendl
;
5865 failure_queue
.erase(failure_queue_entry
);
5868 auto failure_pending_entry
= failure_pending
.find(from
);
5869 if (failure_pending_entry
!= failure_pending
.end()) {
5870 dout(10) << "handle_osd_ping canceling in-flight "
5871 << "failure report for osd." << from
<< dendl
;
5872 send_still_alive(curmap
->get_epoch(),
5874 failure_pending_entry
->second
.second
);
5875 failure_pending
.erase(failure_pending_entry
);
5879 // old replies, deprecated by newly sent pings.
5880 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5881 << ") is found, treat as covered by newly sent pings "
5888 curmap
->is_up(from
)) {
5890 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5891 from
, curmap
->get_epoch());
5893 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5898 s
->stamps
->got_ping_reply(
5902 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5906 case MOSDPing::YOU_DIED
:
5907 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5908 << " says i am down in " << m
->map_epoch
<< dendl
;
5909 osdmap_subscribe(curmap
->get_epoch()+1, false);
5913 heartbeat_lock
.unlock();
5917 void OSD::heartbeat_entry()
5919 std::unique_lock
l(heartbeat_lock
);
5922 while (!heartbeat_stop
) {
5926 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5927 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5929 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5931 auto w
= ceph::make_timespan(wait
);
5932 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5933 heartbeat_cond
.wait_for(l
, w
);
5936 dout(30) << "heartbeat_entry woke up" << dendl
;
5940 void OSD::heartbeat_check()
5942 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5943 utime_t now
= ceph_clock_now();
5945 // check for incoming heartbeats (move me elsewhere?)
5946 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5947 p
!= heartbeat_peers
.end();
5950 if (p
->second
.first_tx
== utime_t()) {
5951 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5952 << " yet, skipping" << dendl
;
5956 dout(25) << "heartbeat_check osd." << p
->first
5957 << " first_tx " << p
->second
.first_tx
5958 << " last_tx " << p
->second
.last_tx
5959 << " last_rx_back " << p
->second
.last_rx_back
5960 << " last_rx_front " << p
->second
.last_rx_front
5962 if (p
->second
.is_unhealthy(now
)) {
5963 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5964 if (p
->second
.last_rx_back
== utime_t() ||
5965 p
->second
.last_rx_front
== utime_t()) {
5966 derr
<< "heartbeat_check: no reply from "
5967 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5968 << " osd." << p
->first
5969 << " ever on either front or back, first ping sent "
5970 << p
->second
.first_tx
5971 << " (oldest deadline " << oldest_deadline
<< ")"
5974 failure_queue
[p
->first
] = p
->second
.first_tx
;
5976 derr
<< "heartbeat_check: no reply from "
5977 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5978 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5979 << " front " << p
->second
.last_rx_front
5980 << " (oldest deadline " << oldest_deadline
<< ")"
5983 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5989 void OSD::heartbeat()
5991 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5992 dout(30) << "heartbeat" << dendl
;
5994 auto load_for_logger
= service
.get_scrub_services().update_load_average();
5995 if (load_for_logger
) {
5996 logger
->set(l_osd_loadavg
, load_for_logger
.value());
5998 dout(30) << "heartbeat checking stats" << dendl
;
6000 // refresh peer list and osd stats
6001 vector
<int> hb_peers
;
6002 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6003 p
!= heartbeat_peers
.end();
6005 hb_peers
.push_back(p
->first
);
6007 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
6008 dout(5) << __func__
<< " " << new_stat
<< dendl
;
6009 ceph_assert(new_stat
.statfs
.total
);
6012 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
6014 service
.check_full_status(ratio
, pratio
);
6016 utime_t now
= ceph_clock_now();
6017 auto mnow
= service
.get_mnow();
6018 utime_t deadline
= now
;
6019 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
6022 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
6023 i
!= heartbeat_peers
.end();
6025 int peer
= i
->first
;
6026 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
6028 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
6031 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
6033 i
->second
.last_tx
= now
;
6034 if (i
->second
.first_tx
== utime_t())
6035 i
->second
.first_tx
= now
;
6036 i
->second
.ping_history
[now
] = make_pair(deadline
,
6037 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
6038 if (i
->second
.hb_interval_start
== utime_t())
6039 i
->second
.hb_interval_start
= now
;
6041 std::optional
<ceph::signedspan
> delta_ub
;
6042 s
->stamps
->sent_ping(&delta_ub
);
6044 i
->second
.con_back
->send_message(
6045 new MOSDPing(monc
->get_fsid(),
6046 service
.get_osdmap_epoch(),
6051 service
.get_up_epoch(),
6052 cct
->_conf
->osd_heartbeat_min_size
,
6055 if (i
->second
.con_front
)
6056 i
->second
.con_front
->send_message(
6057 new MOSDPing(monc
->get_fsid(),
6058 service
.get_osdmap_epoch(),
6063 service
.get_up_epoch(),
6064 cct
->_conf
->osd_heartbeat_min_size
,
6068 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
6070 // hmm.. am i all alone?
6071 dout(30) << "heartbeat lonely?" << dendl
;
6072 if (heartbeat_peers
.empty()) {
6073 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
6074 last_mon_heartbeat
= now
;
6075 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
6076 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6080 dout(30) << "heartbeat done" << dendl
;
6083 bool OSD::heartbeat_reset(Connection
*con
)
6085 std::lock_guard
l(heartbeat_lock
);
6086 auto s
= con
->get_priv();
6087 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
6088 con
->set_priv(nullptr);
6090 if (is_stopping()) {
6093 auto session
= static_cast<Session
*>(s
.get());
6094 auto p
= heartbeat_peers
.find(session
->peer
);
6095 if (p
!= heartbeat_peers
.end() &&
6096 (p
->second
.con_back
== con
||
6097 p
->second
.con_front
== con
)) {
6098 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
6099 << ", reopening" << dendl
;
6100 p
->second
.clear_mark_down(con
);
6101 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
6103 p
->second
.con_back
= newcon
.first
.get();
6104 p
->second
.con_back
->set_priv(s
);
6105 if (newcon
.second
) {
6106 p
->second
.con_front
= newcon
.second
.get();
6107 p
->second
.con_front
->set_priv(s
);
6109 p
->second
.ping_history
.clear();
6111 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
6112 << ", raced with osdmap update, closing out peer" << dendl
;
6113 heartbeat_peers
.erase(p
);
6116 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
6124 // =========================================
6128 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6129 dout(10) << "tick" << dendl
;
6131 utime_t now
= ceph_clock_now();
6132 // throw out any obsolete markdown log
6133 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
6134 while (!osd_markdown_log
.empty() &&
6135 osd_markdown_log
.front() + grace
< now
)
6136 osd_markdown_log
.pop_front();
6138 if (is_active() || is_waiting_for_healthy()) {
6139 maybe_update_heartbeat_peers();
6142 if (is_waiting_for_healthy()) {
6146 if (is_waiting_for_healthy() || is_booting()) {
6147 std::lock_guard
l(heartbeat_lock
);
6148 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
6149 last_mon_heartbeat
= now
;
6150 dout(1) << __func__
<< " checking mon for new map" << dendl
;
6151 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6155 // scrub purged_snaps every deep scrub interval
6157 const utime_t last
= superblock
.last_purged_snaps_scrub
;
6158 utime_t next
= last
;
6159 next
+= cct
->_conf
->osd_scrub_min_interval
;
6161 // use a seed that is stable for each scrub interval, but varies
6162 // by OSD to avoid any herds.
6163 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
6164 double r
= (rng() % 1024) / 1024.0;
6166 cct
->_conf
->osd_scrub_min_interval
*
6167 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
6168 if (next
< ceph_clock_now()) {
6169 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
6170 << " next " << next
<< " ... now" << dendl
;
6171 scrub_purged_snaps();
6173 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
6174 << " next " << next
<< dendl
;
6178 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
6181 void OSD::tick_without_osd_lock()
6183 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
6184 dout(10) << "tick_without_osd_lock" << dendl
;
6186 logger
->set(l_osd_cached_crc
, ceph::buffer::get_cached_crc());
6187 logger
->set(l_osd_cached_crc_adjusted
, ceph::buffer::get_cached_crc_adjusted());
6188 logger
->set(l_osd_missed_crc
, ceph::buffer::get_missed_crc());
6190 // refresh osd stats
6191 struct store_statfs_t stbuf
;
6192 osd_alert_list_t alerts
;
6193 int r
= store
->statfs(&stbuf
, &alerts
);
6194 ceph_assert(r
== 0);
6195 service
.set_statfs(stbuf
, alerts
);
6197 // osd_lock is not being held, which means the OSD state
6198 // might change when doing the monitor report
6199 if (is_active() || is_waiting_for_healthy()) {
6201 std::lock_guard l
{heartbeat_lock
};
6204 map_lock
.lock_shared();
6205 std::lock_guard
l(mon_report_lock
);
6208 utime_t now
= ceph_clock_now();
6209 if (service
.need_fullness_update() ||
6210 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
6211 last_mon_report
= now
;
6215 map_lock
.unlock_shared();
6217 epoch_t max_waiting_epoch
= 0;
6218 for (auto s
: shards
) {
6219 max_waiting_epoch
= std::max(max_waiting_epoch
,
6220 s
->get_max_waiting_epoch());
6222 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
6223 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
6224 << ", requesting new map" << dendl
;
6225 osdmap_subscribe(superblock
.newest_map
+ 1, false);
6230 if (!scrub_random_backoff()) {
6233 service
.promote_throttle_recalibrate();
6234 resume_creating_pg();
6235 bool need_send_beacon
= false;
6236 const auto now
= ceph::coarse_mono_clock::now();
6238 // borrow lec lock to pretect last_sent_beacon from changing
6239 std::lock_guard l
{min_last_epoch_clean_lock
};
6240 const auto elapsed
= now
- last_sent_beacon
;
6241 if (std::chrono::duration_cast
<std::chrono::seconds
>(elapsed
).count() >
6242 cct
->_conf
->osd_beacon_report_interval
) {
6243 need_send_beacon
= true;
6246 if (need_send_beacon
) {
6251 mgrc
.update_daemon_health(get_health_metrics());
6252 service
.kick_recovery_queue();
6253 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
6254 new C_Tick_WithoutOSDLock(this));
6258 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6259 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6260 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6261 // getomap <pool> [namespace/]<obj-name>
6262 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6263 // injectmdataerr [namespace/]<obj-name> [shardid]
6264 // injectdataerr [namespace/]<obj-name> [shardid]
6266 // set_recovery_delay [utime]
6267 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
6268 std::string_view command
,
6269 const cmdmap_t
& cmdmap
, ostream
&ss
)
6272 //Support changing the omap on a single osd by using the Admin Socket to
6273 //directly request the osd make a change.
6274 if (command
== "setomapval" || command
== "rmomapkey" ||
6275 command
== "setomapheader" || command
== "getomap" ||
6276 command
== "truncobj" || command
== "injectmdataerr" ||
6277 command
== "injectdataerr"
6281 OSDMapRef curmap
= service
->get_osdmap();
6286 cmd_getval(cmdmap
, "pool", poolstr
);
6287 pool
= curmap
->lookup_pg_pool_name(poolstr
);
6288 //If we can't find it by name then maybe id specified
6289 if (pool
< 0 && isdigit(poolstr
[0]))
6290 pool
= atoll(poolstr
.c_str());
6292 ss
<< "Invalid pool '" << poolstr
<< "''";
6296 string objname
, nspace
;
6297 cmd_getval(cmdmap
, "objname", objname
);
6298 std::size_t found
= objname
.find_first_of('/');
6299 if (found
!= string::npos
) {
6300 nspace
= objname
.substr(0, found
);
6301 objname
= objname
.substr(found
+1);
6303 object_locator_t
oloc(pool
, nspace
);
6304 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
6307 ss
<< "Invalid namespace/objname";
6311 int64_t shardid
= cmd_getval_or
<int64_t>(cmdmap
, "shardid", shard_id_t::NO_SHARD
);
6312 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
6313 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
6314 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
6315 if (curmap
->pg_is_ec(rawpg
)) {
6316 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
6317 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
6322 ObjectStore::Transaction t
;
6324 if (command
== "setomapval") {
6325 map
<string
, bufferlist
> newattrs
;
6328 cmd_getval(cmdmap
, "key", key
);
6329 cmd_getval(cmdmap
, "val", valstr
);
6332 newattrs
[key
] = val
;
6333 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
6334 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6336 ss
<< "error=" << r
;
6339 } else if (command
== "rmomapkey") {
6341 cmd_getval(cmdmap
, "key", key
);
6343 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6344 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6346 ss
<< "error=" << r
;
6349 } else if (command
== "setomapheader") {
6350 bufferlist newheader
;
6353 cmd_getval(cmdmap
, "header", headerstr
);
6354 newheader
.append(headerstr
);
6355 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6356 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6358 ss
<< "error=" << r
;
6361 } else if (command
== "getomap") {
6362 //Debug: Output entire omap
6364 map
<string
, bufferlist
> keyvals
;
6365 auto ch
= store
->open_collection(coll_t(pgid
));
6367 ss
<< "unable to open collection for " << pgid
;
6370 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6372 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6373 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6374 it
!= keyvals
.end(); ++it
)
6375 ss
<< " key=" << (*it
).first
<< " val="
6376 << string((*it
).second
.c_str(), (*it
).second
.length());
6378 ss
<< "error=" << r
;
6381 } else if (command
== "truncobj") {
6383 cmd_getval(cmdmap
, "len", trunclen
);
6384 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6385 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6387 ss
<< "error=" << r
;
6390 } else if (command
== "injectdataerr") {
6391 store
->inject_data_error(gobj
);
6393 } else if (command
== "injectmdataerr") {
6394 store
->inject_mdata_error(gobj
);
6399 if (command
== "set_recovery_delay") {
6400 int64_t delay
= cmd_getval_or
<int64_t>(cmdmap
, "utime", 0);
6403 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6406 ss
<< "set_recovery_delay: error setting "
6407 << "osd_recovery_delay_start to '" << delay
<< "': error "
6411 service
->cct
->_conf
.apply_changes(nullptr);
6412 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6413 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6416 if (command
== "injectfull") {
6417 int64_t count
= cmd_getval_or
<int64_t>(cmdmap
, "count", -1);
6418 string type
= cmd_getval_or
<string
>(cmdmap
, "type", "full");
6419 OSDService::s_names state
;
6421 if (type
== "none" || count
== 0) {
6425 state
= service
->get_full_state(type
);
6426 if (state
== OSDService::s_names::INVALID
) {
6427 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6430 service
->set_injectfull(state
, count
);
6433 ss
<< "Internal error - command=" << command
;
6436 // =========================================
6438 void OSD::ms_handle_connect(Connection
*con
)
6440 dout(10) << __func__
<< " con " << con
<< dendl
;
6441 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6442 std::lock_guard
l(osd_lock
);
6445 dout(10) << __func__
<< " on mon" << dendl
;
6449 } else if (is_booting()) {
6450 _send_boot(); // resend boot message
6452 map_lock
.lock_shared();
6453 std::lock_guard
l2(mon_report_lock
);
6455 utime_t now
= ceph_clock_now();
6456 last_mon_report
= now
;
6458 // resend everything, it's a new session
6461 service
.requeue_pg_temp();
6462 service
.clear_sent_ready_to_merge();
6463 service
.send_pg_temp();
6464 service
.send_ready_to_merge();
6465 service
.send_pg_created();
6469 map_lock
.unlock_shared();
6471 send_beacon(ceph::coarse_mono_clock::now());
6475 // full map requests may happen while active or pre-boot
6476 if (requested_full_first
) {
6477 rerequest_full_maps();
6482 void OSD::ms_handle_fast_connect(Connection
*con
)
6484 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6485 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6486 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6487 s
= ceph::make_ref
<Session
>(cct
, con
);
6489 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6490 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6491 // we don't connect to clients
6492 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6493 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6498 void OSD::ms_handle_fast_accept(Connection
*con
)
6500 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6501 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6502 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6503 s
= ceph::make_ref
<Session
>(cct
, con
);
6505 dout(10) << "new session (incoming)" << s
<< " con=" << con
6506 << " addr=" << con
->get_peer_addr()
6507 << " must have raced with connect" << dendl
;
6508 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6509 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6514 bool OSD::ms_handle_reset(Connection
*con
)
6516 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6517 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6520 session
->wstate
.reset(con
);
6521 session
->con
->set_priv(nullptr);
6522 session
->con
.reset(); // break con <-> session ref cycle
6523 // note that we break session->con *before* the session_handle_reset
6524 // cleanup below. this avoids a race between us and
6525 // PG::add_backoff, Session::check_backoff, etc.
6526 session_handle_reset(session
);
6530 bool OSD::ms_handle_refused(Connection
*con
)
6532 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6535 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6536 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6539 int type
= con
->get_peer_type();
6540 // handle only OSD failures here
6541 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6542 OSDMapRef osdmap
= get_osdmap();
6544 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6545 if (id
>= 0 && osdmap
->is_up(id
)) {
6546 // I'm cheating mon heartbeat grace logic, because we know it's not going
6547 // to respawn alone. +1 so we won't hit any boundary case.
6548 monc
->send_mon_message(
6552 osdmap
->get_addrs(id
),
6553 cct
->_conf
->osd_heartbeat_grace
+ 1,
6554 osdmap
->get_epoch(),
6555 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6563 struct CB_OSD_GetVersion
{
6565 explicit CB_OSD_GetVersion(OSD
*o
) : osd(o
) {}
6566 void operator ()(boost::system::error_code ec
, version_t newest
,
6569 osd
->_got_mon_epochs(oldest
, newest
);
6573 void OSD::start_boot()
6575 if (!_is_healthy()) {
6576 // if we are not healthy, do not mark ourselves up (yet)
6577 dout(1) << "not healthy; waiting to boot" << dendl
;
6578 if (!is_waiting_for_healthy())
6579 start_waiting_for_healthy();
6580 // send pings sooner rather than later
6584 dout(1) << __func__
<< dendl
;
6585 set_state(STATE_PREBOOT
);
6586 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6587 << ".." << superblock
.newest_map
<< dendl
;
6588 monc
->get_version("osdmap", CB_OSD_GetVersion(this));
6591 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6593 std::lock_guard
l(osd_lock
);
6595 _preboot(oldest
, newest
);
6599 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6601 ceph_assert(is_preboot());
6602 dout(10) << __func__
<< " _preboot mon has osdmaps "
6603 << oldest
<< ".." << newest
<< dendl
;
6605 // ensure our local fullness awareness is accurate
6607 std::lock_guard
l(heartbeat_lock
);
6611 const auto& monmap
= monc
->monmap
;
6612 const auto osdmap
= get_osdmap();
6613 // if our map within recent history, try to add ourselves to the osdmap.
6614 if (osdmap
->get_epoch() == 0) {
6615 derr
<< "waiting for initial osdmap" << dendl
;
6616 } else if (osdmap
->is_destroyed(whoami
)) {
6617 derr
<< "osdmap says I am destroyed" << dendl
;
6618 // provide a small margin so we don't livelock seeing if we
6619 // un-destroyed ourselves.
6620 if (osdmap
->get_epoch() > newest
- 1) {
6623 } else if (osdmap
->is_noup(whoami
)) {
6624 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6625 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6626 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6628 } else if (service
.need_fullness_update()) {
6629 derr
<< "osdmap fullness state needs update" << dendl
;
6631 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6632 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6633 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6634 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6635 _get_purged_snaps();
6636 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6637 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6639 // wait for pgs to fully catch up in a different thread, since
6640 // this thread might be required for splitting and merging PGs to
6642 boot_finisher
.queue(
6645 std::unique_lock
l(osd_lock
);
6647 dout(10) << __func__
<< " waiting for peering work to drain"
6650 for (auto shard
: shards
) {
6651 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6662 // get all the latest maps
6663 if (osdmap
->get_epoch() + 1 >= oldest
)
6664 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6666 osdmap_subscribe(oldest
- 1, true);
6669 void OSD::_get_purged_snaps()
6671 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6672 // overlapping requests to the mon, which will be somewhat inefficient, but
6673 // it should be reliable.
6674 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6675 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6676 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6677 superblock
.purged_snaps_last
+ 1,
6678 superblock
.current_epoch
+ 1);
6679 monc
->send_mon_message(m
);
6682 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6684 dout(10) << __func__
<< " " << *m
<< dendl
;
6685 ObjectStore::Transaction t
;
6686 if (!is_preboot() ||
6687 m
->last
< superblock
.purged_snaps_last
) {
6690 OSDriver osdriver
{store
.get(), service
.meta_ch
, make_purged_snaps_oid()};
6691 SnapMapper::record_purged_snaps(
6694 osdriver
.get_transaction(&t
),
6697 superblock
.purged_snaps_last
= m
->last
;
6698 write_superblock(t
);
6699 store
->queue_transaction(
6702 service
.publish_superblock(superblock
);
6703 if (m
->last
< superblock
.current_epoch
) {
6704 _get_purged_snaps();
6712 void OSD::send_full_update()
6714 if (!service
.need_fullness_update())
6717 if (service
.is_full()) {
6718 state
= CEPH_OSD_FULL
;
6719 } else if (service
.is_backfillfull()) {
6720 state
= CEPH_OSD_BACKFILLFULL
;
6721 } else if (service
.is_nearfull()) {
6722 state
= CEPH_OSD_NEARFULL
;
6725 OSDMap::calc_state_set(state
, s
);
6726 dout(10) << __func__
<< " want state " << s
<< dendl
;
6727 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6730 void OSD::start_waiting_for_healthy()
6732 dout(1) << "start_waiting_for_healthy" << dendl
;
6733 set_state(STATE_WAITING_FOR_HEALTHY
);
6734 last_heartbeat_resample
= utime_t();
6736 // subscribe to osdmap updates, in case our peers really are known to be dead
6737 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6740 bool OSD::_is_healthy()
6742 if (!cct
->get_heartbeat_map()->is_healthy()) {
6743 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6747 if (is_waiting_for_healthy()) {
6748 utime_t now
= ceph_clock_now();
6749 if (osd_markdown_log
.empty()) {
6750 dout(5) << __func__
<< " force returning true since last markdown"
6751 << " was " << cct
->_conf
->osd_max_markdown_period
6752 << "s ago" << dendl
;
6755 std::lock_guard
l(heartbeat_lock
);
6756 int num
= 0, up
= 0;
6757 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6758 p
!= heartbeat_peers
.end();
6760 if (p
->second
.is_healthy(now
))
6764 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6765 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6766 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6774 void OSD::_send_boot()
6776 dout(10) << "_send_boot" << dendl
;
6777 Connection
*local_connection
=
6778 cluster_messenger
->get_loopback_connection().get();
6779 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6780 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6781 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6782 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6784 dout(20) << " initial client_addrs " << client_addrs
6785 << ", cluster_addrs " << cluster_addrs
6786 << ", hb_back_addrs " << hb_back_addrs
6787 << ", hb_front_addrs " << hb_front_addrs
6789 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6790 dout(10) << " assuming cluster_addrs match client_addrs "
6791 << client_addrs
<< dendl
;
6792 cluster_addrs
= cluster_messenger
->get_myaddrs();
6794 if (auto session
= local_connection
->get_priv(); !session
) {
6795 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6798 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6799 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6800 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6801 << cluster_addrs
<< dendl
;
6802 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6804 if (auto session
= local_connection
->get_priv(); !session
) {
6805 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6808 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6809 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6810 dout(10) << " assuming hb_front_addrs match client_addrs "
6811 << client_addrs
<< dendl
;
6812 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6814 if (auto session
= local_connection
->get_priv(); !session
) {
6815 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6818 // we now know what our front and back addrs will be, and we are
6819 // about to tell the mon what our metadata (including numa bindings)
6820 // are, so now is a good time!
6821 set_numa_affinity();
6823 MOSDBoot
*mboot
= new MOSDBoot(
6824 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6825 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6827 dout(10) << " final client_addrs " << client_addrs
6828 << ", cluster_addrs " << cluster_addrs
6829 << ", hb_back_addrs " << hb_back_addrs
6830 << ", hb_front_addrs " << hb_front_addrs
6832 _collect_metadata(&mboot
->metadata
);
6833 monc
->send_mon_message(mboot
);
6834 set_state(STATE_BOOTING
);
6837 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6840 (*pm
)["osd_data"] = dev_path
;
6841 if (store
->get_type() == "filestore") {
6842 // not applicable for bluestore
6843 (*pm
)["osd_journal"] = journal_path
;
6845 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6846 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6847 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6848 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6851 (*pm
)["osd_objectstore"] = store
->get_type();
6852 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6853 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6854 (*pm
)["default_device_class"] = store
->get_default_device_class();
6855 string osdspec_affinity
;
6856 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6857 if (r
< 0 || osdspec_affinity
.empty()) {
6858 osdspec_affinity
= "";
6860 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6861 string ceph_version_when_created
;
6862 r
= store
->read_meta("ceph_version_when_created", &ceph_version_when_created
);
6863 if (r
<0 || ceph_version_when_created
.empty()) {
6864 ceph_version_when_created
= "";
6866 (*pm
)["ceph_version_when_created"] = ceph_version_when_created
;
6868 r
= store
->read_meta("created_at", &created_at
);
6869 if (r
< 0 || created_at
.empty()) {
6872 (*pm
)["created_at"] = created_at
;
6873 store
->collect_metadata(pm
);
6875 collect_sys_info(pm
, cct
);
6877 (*pm
)["front_iface"] = pick_iface(
6879 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6880 (*pm
)["back_iface"] = pick_iface(
6882 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6888 set
<string
> unknown
;
6889 for (auto nm
: { "front_iface", "back_iface" }) {
6890 if (!(*pm
)[nm
].size()) {
6895 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6897 unknown
.insert((*pm
)[nm
]);
6905 if (unknown
.size()) {
6906 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6908 if (!nodes
.empty()) {
6909 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6911 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6912 (*pm
)["network_numa_node"] = stringify(node
);
6916 if (numa_node
>= 0) {
6917 (*pm
)["numa_node"] = stringify(numa_node
);
6918 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6922 set
<string
> devnames
;
6923 store
->get_devices(&devnames
);
6924 map
<string
,string
> errs
;
6925 get_device_metadata(devnames
, pm
, &errs
);
6926 for (auto& i
: errs
) {
6927 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6929 dout(10) << __func__
<< " " << *pm
<< dendl
;
6932 void OSD::queue_want_up_thru(epoch_t want
)
6934 std::shared_lock map_locker
{map_lock
};
6935 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6936 std::lock_guard
report_locker(mon_report_lock
);
6937 if (want
> up_thru_wanted
) {
6938 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6939 << ", currently " << cur
6941 up_thru_wanted
= want
;
6944 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6945 << ", currently " << cur
6950 void OSD::send_alive()
6952 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6953 const auto osdmap
= get_osdmap();
6954 if (!osdmap
->exists(whoami
))
6956 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6957 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6958 if (up_thru_wanted
> up_thru
) {
6959 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6960 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6964 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6966 dout(10) << __func__
<< " " << first
<< ".." << last
6967 << ", previously requested "
6968 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6969 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6970 ceph_assert(first
> 0 && last
> 0);
6971 ceph_assert(first
<= last
);
6972 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6973 if (requested_full_first
== 0) {
6975 requested_full_first
= first
;
6976 requested_full_last
= last
;
6977 } else if (last
<= requested_full_last
) {
6981 // additional request
6982 first
= requested_full_last
+ 1;
6983 requested_full_last
= last
;
6985 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6986 req
->request_full(first
, last
);
6987 monc
->send_mon_message(req
);
6990 void OSD::got_full_map(epoch_t e
)
6992 ceph_assert(requested_full_first
<= requested_full_last
);
6993 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6994 if (requested_full_first
== 0) {
6995 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6998 if (e
< requested_full_first
) {
6999 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
7000 << ".." << requested_full_last
7001 << ", ignoring" << dendl
;
7004 if (e
>= requested_full_last
) {
7005 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
7006 << ".." << requested_full_last
<< ", resetting" << dendl
;
7007 requested_full_first
= requested_full_last
= 0;
7011 requested_full_first
= e
+ 1;
7013 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
7014 << ".." << requested_full_last
7015 << ", still need more" << dendl
;
7018 void OSD::requeue_failures()
7020 std::lock_guard
l(heartbeat_lock
);
7021 unsigned old_queue
= failure_queue
.size();
7022 unsigned old_pending
= failure_pending
.size();
7023 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
7024 failure_queue
[p
->first
] = p
->second
.first
;
7025 failure_pending
.erase(p
++);
7027 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
7028 << failure_queue
.size() << dendl
;
7031 void OSD::send_failures()
7033 ceph_assert(ceph_mutex_is_locked(map_lock
));
7034 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
7035 std::lock_guard
l(heartbeat_lock
);
7036 utime_t now
= ceph_clock_now();
7037 const auto osdmap
= get_osdmap();
7038 while (!failure_queue
.empty()) {
7039 int osd
= failure_queue
.begin()->first
;
7040 if (!failure_pending
.count(osd
)) {
7041 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
7042 monc
->send_mon_message(
7046 osdmap
->get_addrs(osd
),
7048 osdmap
->get_epoch()));
7049 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
7050 osdmap
->get_addrs(osd
));
7052 failure_queue
.erase(osd
);
7056 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
7058 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
7059 MOSDFailure::FLAG_ALIVE
);
7060 monc
->send_mon_message(m
);
7063 void OSD::cancel_pending_failures()
7065 std::lock_guard
l(heartbeat_lock
);
7066 auto it
= failure_pending
.begin();
7067 while (it
!= failure_pending
.end()) {
7068 dout(10) << __func__
<< " canceling in-flight failure report for osd."
7069 << it
->first
<< dendl
;
7070 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
7071 failure_pending
.erase(it
++);
7075 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
7077 const auto& monmap
= monc
->monmap
;
7078 // send beacon to mon even if we are just connected, and the monmap is not
7079 // initialized yet by then.
7080 if (monmap
.epoch
> 0 &&
7081 monmap
.get_required_features().contains_all(
7082 ceph::features::mon::FEATURE_LUMINOUS
)) {
7083 dout(20) << __func__
<< " sending" << dendl
;
7084 MOSDBeacon
* beacon
= nullptr;
7086 std::lock_guard l
{min_last_epoch_clean_lock
};
7087 beacon
= new MOSDBeacon(get_osdmap_epoch(),
7088 min_last_epoch_clean
,
7089 superblock
.last_purged_snaps_scrub
,
7090 cct
->_conf
->osd_beacon_report_interval
);
7091 beacon
->pgs
= min_last_epoch_clean_pgs
;
7092 last_sent_beacon
= now
;
7094 monc
->send_mon_message(beacon
);
7096 dout(20) << __func__
<< " not sending" << dendl
;
7100 void OSD::handle_command(MCommand
*m
)
7102 ConnectionRef con
= m
->get_connection();
7103 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7105 con
->send_message(new MCommandReply(m
, -EACCES
));
7109 if (!session
->caps
.allow_all()) {
7110 con
->send_message(new MCommandReply(m
, -EACCES
));
7114 cct
->get_admin_socket()->queue_tell_command(m
);
7119 class unlock_guard
{
7122 explicit unlock_guard(ceph::mutex
& mutex
)
7127 unlock_guard(unlock_guard
&) = delete;
7134 void OSD::scrub_purged_snaps()
7136 dout(10) << __func__
<< dendl
;
7137 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7138 SnapMapper::Scrubber
s(cct
, store
.get(), service
.meta_ch
,
7139 make_snapmapper_oid(),
7140 make_purged_snaps_oid());
7141 clog
->debug() << "purged_snaps scrub starts";
7144 if (s
.stray
.size()) {
7145 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
7147 clog
->debug() << "purged_snaps scrub ok";
7149 set
<pair
<spg_t
,snapid_t
>> queued
;
7150 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
7151 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
7153 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
7156 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
7157 spg_t
spgid(pgid
, shard
);
7158 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
7159 if (queued
.count(p
)) {
7160 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
7161 << " already queued" << dendl
;
7164 PGRef pg
= lookup_lock_pg(spgid
);
7166 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
7170 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
7172 pg
->queue_snap_retrim(snap
);
7176 if (is_stopping()) {
7179 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
7180 ObjectStore::Transaction t
;
7181 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
7182 write_superblock(t
);
7183 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7184 ceph_assert(tr
== 0);
7186 send_beacon(ceph::coarse_mono_clock::now());
7188 dout(10) << __func__
<< " done" << dendl
;
7191 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
7193 set
<string
> devnames
;
7194 store
->get_devices(&devnames
);
7195 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
7196 "osd_smart_report_timeout");
7198 // == typedef std::map<std::string, mValue> mObject;
7199 json_spirit::mObject json_map
;
7201 for (auto dev
: devnames
) {
7202 // smartctl works only on physical devices; filter out any logical device
7203 if (dev
.find("dm-") == 0) {
7208 string devid
= get_device_id(dev
, &err
);
7209 if (devid
.size() == 0) {
7210 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
7211 << err
<< "), skipping" << dendl
;
7214 if (only_devid
.size() && devid
!= only_devid
) {
7218 json_spirit::mValue smart_json
;
7219 if (block_device_get_metrics(dev
, smart_timeout
,
7221 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7224 json_map
[devid
] = smart_json
;
7226 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7229 bool OSD::heartbeat_dispatch(Message
*m
)
7231 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7232 switch (m
->get_type()) {
7235 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7240 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7244 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7251 bool OSD::ms_dispatch(Message
*m
)
7253 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7254 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7255 service
.got_stop_ack();
7263 if (is_stopping()) {
7276 void OSDService::maybe_share_map(
7278 const OSDMapRef
& osdmap
,
7279 epoch_t peer_epoch_lb
)
7281 // NOTE: we assume caller hold something that keeps the Connection itself
7282 // pinned (e.g., an OpRequest's MessageRef).
7283 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7288 // assume the peer has the newer of the op's sent_epoch and what
7289 // we think we sent them.
7290 session
->sent_epoch_lock
.lock();
7291 if (peer_epoch_lb
> session
->last_sent_epoch
) {
7292 dout(10) << __func__
<< " con " << con
7293 << " " << con
->get_peer_addr()
7294 << " map epoch " << session
->last_sent_epoch
7295 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
7296 session
->last_sent_epoch
= peer_epoch_lb
;
7298 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
7299 session
->sent_epoch_lock
.unlock();
7301 if (osdmap
->get_epoch() <= last_sent_epoch
) {
7305 send_incremental_map(last_sent_epoch
, con
, osdmap
);
7306 last_sent_epoch
= osdmap
->get_epoch();
7308 session
->sent_epoch_lock
.lock();
7309 if (session
->last_sent_epoch
< last_sent_epoch
) {
7310 dout(10) << __func__
<< " con " << con
7311 << " " << con
->get_peer_addr()
7312 << " map epoch " << session
->last_sent_epoch
7313 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
7314 session
->last_sent_epoch
= last_sent_epoch
;
7316 session
->sent_epoch_lock
.unlock();
7319 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
7321 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
7323 auto i
= session
->waiting_on_map
.begin();
7324 while (i
!= session
->waiting_on_map
.end()) {
7325 OpRequestRef op
= &(*i
);
7326 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7327 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
7328 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7331 session
->waiting_on_map
.erase(i
++);
7335 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7336 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7337 static_cast<const MOSDOp
*>(m
)->get_pg());
7338 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7342 pgid
= m
->get_spg();
7344 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7347 if (session
->waiting_on_map
.empty()) {
7348 clear_session_waiting_on_map(session
);
7350 register_session_waiting_on_map(session
);
7354 void OSD::ms_fast_dispatch(Message
*m
)
7357 if (service
.is_stopping()) {
7362 switch (m
->get_type()) {
7364 dout(10) << "ping from " << m
->get_source() << dendl
;
7367 case MSG_OSD_FORCE_RECOVERY
:
7368 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7370 case MSG_OSD_SCRUB2
:
7371 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7373 case MSG_OSD_PG_CREATE2
:
7374 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7375 case MSG_OSD_PG_NOTIFY
:
7376 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7377 case MSG_OSD_PG_INFO
:
7378 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7379 case MSG_OSD_PG_REMOVE
:
7380 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7381 // these are single-pg messages that handle themselves
7382 case MSG_OSD_PG_LOG
:
7383 case MSG_OSD_PG_TRIM
:
7384 case MSG_OSD_PG_NOTIFY2
:
7385 case MSG_OSD_PG_QUERY2
:
7386 case MSG_OSD_PG_INFO2
:
7387 case MSG_OSD_BACKFILL_RESERVE
:
7388 case MSG_OSD_RECOVERY_RESERVE
:
7389 case MSG_OSD_PG_LEASE
:
7390 case MSG_OSD_PG_LEASE_ACK
:
7392 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7393 if (require_osd_peer(pm
)) {
7394 enqueue_peering_evt(
7396 PGPeeringEventRef(pm
->get_event()));
7403 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7406 osd_reqid_t reqid
= op
->get_reqid();
7408 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7409 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7411 op
->osd_parent_span
= tracing::osd::tracer
.start_trace("op-request-created");
7414 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7416 // note sender epoch, min req's epoch
7417 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7418 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7419 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7421 service
.maybe_inject_dispatch_delay();
7423 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7424 m
->get_type() != CEPH_MSG_OSD_OP
) {
7425 // queue it directly
7427 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7429 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7431 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7432 // message that didn't have an explicit spg_t); we need to map
7433 // them to an spg_t while preserving delivery order.
7434 auto priv
= m
->get_connection()->get_priv();
7435 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7436 std::lock_guard l
{session
->session_dispatch_lock
};
7438 session
->waiting_on_map
.push_back(*op
);
7439 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7440 dispatch_session_waiting(session
, nextmap
);
7441 service
.release_map(nextmap
);
7444 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7447 int OSD::ms_handle_fast_authentication(Connection
*con
)
7450 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7452 s
= ceph::make_ref
<Session
>(cct
, con
);
7454 s
->entity_name
= con
->get_peer_entity_name();
7455 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7456 << " entity " << s
->entity_name
7457 << " addr " << con
->get_peer_addrs() << dendl
;
7459 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7460 << " entity " << s
->entity_name
7461 << " addr " << con
->get_peer_addrs() << dendl
;
7464 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7465 if (caps_info
.allow_all
) {
7466 s
->caps
.set_allow_all();
7467 } else if (caps_info
.caps
.length() > 0) {
7468 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7473 catch (ceph::buffer::error
& e
) {
7474 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7475 << " failed to decode caps string" << dendl
;
7479 bool success
= s
->caps
.parse(str
);
7481 dout(10) << __func__
<< " session " << s
7482 << " " << s
->entity_name
7483 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7486 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7487 << " failed to parse caps '" << str
<< "'" << dendl
;
7495 void OSD::_dispatch(Message
*m
)
7497 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7498 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7500 switch (m
->get_type()) {
7501 // -- don't need OSDMap --
7503 // map and replication
7504 case CEPH_MSG_OSD_MAP
:
7505 handle_osd_map(static_cast<MOSDMap
*>(m
));
7507 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7508 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7513 handle_command(static_cast<MCommand
*>(m
));
7518 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7520 dout(10) << __func__
<< " " << *m
<< dendl
;
7521 if (!require_mon_or_mgr_peer(m
)) {
7525 if (m
->fsid
!= monc
->get_fsid()) {
7526 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7531 for (auto pgid
: m
->scrub_pgs
) {
7532 enqueue_peering_evt(
7535 std::make_shared
<PGPeeringEvent
>(
7538 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7543 bool OSD::scrub_random_backoff()
7545 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7546 cct
->_conf
->osd_scrub_backoff_ratio
);
7548 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off (ratio: "
7549 << cct
->_conf
->osd_scrub_backoff_ratio
<< ")" << dendl
;
7556 void OSD::sched_scrub()
7558 auto& scrub_scheduler
= service
.get_scrub_services();
7560 if (auto blocked_pgs
= scrub_scheduler
.get_blocked_pgs_count();
7562 // some PGs managed by this OSD were blocked by a locked object during
7563 // scrub. This means we might not have the resources needed to scrub now.
7566 "{}: PGs are blocked while scrubbing due to locked objects ({} PGs)",
7572 // fail fast if no resources are available
7573 if (!scrub_scheduler
.can_inc_scrubs()) {
7574 dout(20) << __func__
<< ": OSD cannot inc scrubs" << dendl
;
7578 // if there is a PG that is just now trying to reserve scrub replica resources -
7579 // we should wait and not initiate a new scrub
7580 if (scrub_scheduler
.is_reserving_now()) {
7581 dout(20) << __func__
<< ": scrub resources reservation in progress" << dendl
;
7585 Scrub::ScrubPreconds env_conditions
;
7587 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7588 if (!cct
->_conf
->osd_repair_during_recovery
) {
7589 dout(15) << __func__
<< ": not scheduling scrubs due to active recovery"
7593 dout(10) << __func__
7594 << " will only schedule explicitly requested repair due to active recovery"
7596 env_conditions
.allow_requested_repair_only
= true;
7599 if (g_conf()->subsys
.should_gather
<ceph_subsys_osd
, 20>()) {
7600 dout(20) << __func__
<< " sched_scrub starts" << dendl
;
7601 auto all_jobs
= scrub_scheduler
.list_registered_jobs();
7602 for (const auto& sj
: all_jobs
) {
7603 dout(20) << "sched_scrub scrub-queue jobs: " << *sj
<< dendl
;
7607 auto was_started
= scrub_scheduler
.select_pg_and_scrub(env_conditions
);
7608 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started
)
7612 Scrub::schedule_result_t
OSDService::initiate_a_scrub(spg_t pgid
,
7613 bool allow_requested_repair_only
)
7615 dout(20) << __func__
<< " trying " << pgid
<< dendl
;
7617 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7620 PGRef pg
= osd
->lookup_lock_pg(pgid
);
7622 // the PG was dequeued in the short timespan between creating the candidates list
7623 // (collect_ripe_jobs()) and here
7624 dout(5) << __func__
<< " pg " << pgid
<< " not found" << dendl
;
7625 return Scrub::schedule_result_t::no_such_pg
;
7628 // This has already started, so go on to the next scrub job
7629 if (pg
->is_scrub_queued_or_active()) {
7631 dout(20) << __func__
<< ": already in progress pgid " << pgid
<< dendl
;
7632 return Scrub::schedule_result_t::already_started
;
7634 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7635 if (allow_requested_repair_only
&& !pg
->get_planned_scrub().must_repair
) {
7637 dout(10) << __func__
<< " skip " << pgid
7638 << " because repairing is not explicitly requested on it" << dendl
;
7639 return Scrub::schedule_result_t::preconditions
;
7642 auto scrub_attempt
= pg
->sched_scrub();
7644 return scrub_attempt
;
7647 void OSD::resched_all_scrubs()
7649 dout(10) << __func__
<< ": start" << dendl
;
7650 auto all_jobs
= service
.get_scrub_services().list_registered_jobs();
7651 for (auto& e
: all_jobs
) {
7654 dout(20) << __func__
<< ": examine " << job
.pgid
<< dendl
;
7656 PGRef pg
= _lookup_lock_pg(job
.pgid
);
7660 if (!pg
->get_planned_scrub().must_scrub
&& !pg
->get_planned_scrub().need_auto
) {
7661 dout(15) << __func__
<< ": reschedule " << job
.pgid
<< dendl
;
7662 pg
->reschedule_scrub();
7666 dout(10) << __func__
<< ": done" << dendl
;
7669 MPGStats
* OSD::collect_pg_stats()
7671 dout(15) << __func__
<< dendl
;
7672 // This implementation unconditionally sends every is_primary PG's
7673 // stats every time we're called. This has equivalent cost to the
7674 // previous implementation's worst case where all PGs are busy and
7675 // their stats are always enqueued for sending.
7676 std::shared_lock l
{map_lock
};
7678 osd_stat_t cur_stat
= service
.get_osd_stat();
7679 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7681 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7682 m
->osd_stat
= cur_stat
;
7684 std::lock_guard lec
{min_last_epoch_clean_lock
};
7685 min_last_epoch_clean
= get_osdmap_epoch();
7686 min_last_epoch_clean_pgs
.clear();
7688 auto now_is
= ceph::coarse_real_clock::now();
7690 std::set
<int64_t> pool_set
;
7693 for (auto& pg
: pgs
) {
7694 auto pool
= pg
->pg_id
.pgid
.pool();
7695 pool_set
.emplace((int64_t)pool
);
7696 if (!pg
->is_primary()) {
7699 pg
->with_pg_stats(now_is
, [&](const pg_stat_t
& s
, epoch_t lec
) {
7700 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7701 min_last_epoch_clean
= std::min(min_last_epoch_clean
, lec
);
7702 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7706 bool per_pool_stats
= true;
7707 bool per_pool_omap_stats
= false;
7708 for (auto p
: pool_set
) {
7709 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7710 if (r
== -ENOTSUP
) {
7711 per_pool_stats
= false;
7715 m
->pool_stat
[p
] = st
;
7719 // indicate whether we are reporting per-pool stats
7720 m
->osd_stat
.num_osds
= 1;
7721 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7722 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7727 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7729 vector
<DaemonHealthMetric
> metrics
;
7731 utime_t oldest_secs
;
7732 const utime_t now
= ceph_clock_now();
7734 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7736 TrackedOpRef oldest_op
;
7737 OSDMapRef osdmap
= get_osdmap();
7738 // map of slow op counts by slow op event type for an aggregated logging to
7740 map
<uint8_t, int> slow_op_types
;
7741 // map of slow op counts by pool for reporting a pool name with highest
7743 map
<uint64_t, int> slow_op_pools
;
7744 bool log_aggregated_slow_op
=
7745 cct
->_conf
.get_val
<bool>("osd_aggregated_slow_ops_logging");
7746 auto count_slow_ops
= [&](TrackedOp
& op
) {
7747 if (op
.get_initiated() < too_old
) {
7749 ss
<< "slow request " << op
.get_desc()
7751 << op
.get_initiated()
7753 << op
.state_string();
7754 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7755 if (log_aggregated_slow_op
) {
7756 if (const OpRequest
*req
= dynamic_cast<const OpRequest
*>(&op
)) {
7757 uint8_t op_type
= req
->state_flag();
7758 auto m
= req
->get_req
<MOSDFastDispatchOp
>();
7759 uint64_t poolid
= m
->get_spg().pgid
.m_pool
;
7760 slow_op_types
[op_type
]++;
7761 if (poolid
> 0 && poolid
<= (uint64_t) osdmap
->get_pool_max()) {
7762 slow_op_pools
[poolid
]++;
7766 clog
->warn() << ss
.str();
7769 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7777 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7779 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7780 << oldest_op
->get_desc() << dendl
;
7781 if (log_aggregated_slow_op
&&
7782 slow_op_types
.size() > 0) {
7784 ss
<< slow
<< " slow requests (by type [ ";
7785 for (const auto& [op_type
, count
] : slow_op_types
) {
7786 ss
<< "'" << OpRequest::get_state_string(op_type
)
7790 auto slow_pool_it
= std::max_element(slow_op_pools
.begin(), slow_op_pools
.end(),
7791 [](std::pair
<uint64_t, int> p1
, std::pair
<uint64_t, int> p2
) {
7792 return p1
.second
< p2
.second
;
7794 if (osdmap
->get_pools().find(slow_pool_it
->first
) != osdmap
->get_pools().end()) {
7795 string pool_name
= osdmap
->get_pool_name(slow_pool_it
->first
);
7796 ss
<< "] most affected pool [ '"
7799 << slow_pool_it
->second
7804 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7805 clog
->warn() << ss
.str();
7808 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7810 // no news is not good news.
7811 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7815 std::lock_guard
l(pending_creates_lock
);
7816 auto n_primaries
= pending_creates_from_mon
;
7817 for (const auto& create
: pending_creates_from_osd
) {
7818 if (create
.second
) {
7822 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7827 // =====================================================
7830 * assimilate new OSDMap(s). scan pgs, etc.
7833 void OSD::note_down_osd(int peer
)
7835 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7836 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7838 std::lock_guard l
{heartbeat_lock
};
7839 failure_queue
.erase(peer
);
7840 failure_pending
.erase(peer
);
7841 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7842 if (p
!= heartbeat_peers
.end()) {
7843 p
->second
.clear_mark_down();
7844 heartbeat_peers
.erase(p
);
7848 void OSD::note_up_osd(int peer
)
7850 heartbeat_set_peers_need_update();
7853 struct C_OnMapCommit
: public Context
{
7855 epoch_t first
, last
;
7857 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7858 : osd(o
), first(f
), last(l
), msg(m
) {}
7859 void finish(int r
) override
{
7860 osd
->_committed_osd_maps(first
, last
, msg
);
7865 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7867 std::lock_guard
l(osdmap_subscribe_lock
);
7868 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7871 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7873 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7879 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7881 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7882 if (min
<= superblock
.oldest_map
)
7886 ObjectStore::Transaction t
;
7887 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7888 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7889 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7890 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7891 superblock
.oldest_map
= e
+ 1;
7893 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7894 service
.publish_superblock(superblock
);
7895 write_superblock(t
);
7896 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7897 ceph_assert(tr
== 0);
7900 // skip_maps leaves us with a range of old maps if we fail to remove all
7901 // of them before moving superblock.oldest_map forward to the first map
7902 // in the incoming MOSDMap msg. so we should continue removing them in
7903 // this case, even we could do huge series of delete transactions all at
7910 service
.publish_superblock(superblock
);
7911 write_superblock(t
);
7912 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7913 ceph_assert(tr
== 0);
7915 // we should not remove the cached maps
7916 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7919 void OSD::handle_osd_map(MOSDMap
*m
)
7921 // wait for pgs to catch up
7923 // we extend the map cache pins to accomodate pgs slow to consume maps
7924 // for some period, until we hit the max_lag_factor bound, at which point
7925 // we block here to stop injesting more maps than they are able to keep
7927 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7928 m_osd_pg_epoch_max_lag_factor
;
7929 ceph_assert(max_lag
> 0);
7930 epoch_t osd_min
= 0;
7931 for (auto shard
: shards
) {
7932 epoch_t min
= shard
->get_min_pg_epoch();
7933 if (osd_min
== 0 || min
< osd_min
) {
7937 epoch_t osdmap_epoch
= get_osdmap_epoch();
7939 osdmap_epoch
> max_lag
&&
7940 osdmap_epoch
- max_lag
> osd_min
) {
7941 epoch_t need
= osdmap_epoch
- max_lag
;
7942 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7943 << " max_lag " << max_lag
<< ")" << dendl
;
7944 for (auto shard
: shards
) {
7945 epoch_t min
= shard
->get_min_pg_epoch();
7947 dout(10) << __func__
<< " waiting for pgs to consume " << need
7948 << " (shard " << shard
->shard_id
<< " min " << min
7949 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7950 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7952 unlock_guard unlock
{osd_lock
};
7953 shard
->wait_min_pg_epoch(need
);
7959 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7960 map
<epoch_t
,OSDMapRef
> added_maps
;
7961 map
<epoch_t
,bufferlist
> added_maps_bl
;
7962 if (m
->fsid
!= monc
->get_fsid()) {
7963 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7964 << monc
->get_fsid() << dendl
;
7968 if (is_initializing()) {
7969 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7974 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7975 if (session
&& !(session
->entity_name
.is_mon() ||
7976 session
->entity_name
.is_osd())) {
7978 dout(10) << "got osd map from Session " << session
7979 << " which we can't take maps from (not a mon or osd)" << dendl
;
7984 // share with the objecter
7986 service
.objecter
->handle_osd_map(m
);
7988 epoch_t first
= m
->get_first();
7989 epoch_t last
= m
->get_last();
7990 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7991 << superblock
.newest_map
7992 << ", src has [" << m
->cluster_osdmap_trim_lower_bound
7993 << "," << m
->newest_map
<< "]"
7996 logger
->inc(l_osd_map
);
7997 logger
->inc(l_osd_mape
, last
- first
+ 1);
7998 if (first
<= superblock
.newest_map
)
7999 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
8001 if (superblock
.cluster_osdmap_trim_lower_bound
<
8002 m
->cluster_osdmap_trim_lower_bound
) {
8003 superblock
.cluster_osdmap_trim_lower_bound
=
8004 m
->cluster_osdmap_trim_lower_bound
;
8005 dout(10) << " superblock cluster_osdmap_trim_lower_bound new epoch is: "
8006 << superblock
.cluster_osdmap_trim_lower_bound
<< dendl
;
8008 superblock
.cluster_osdmap_trim_lower_bound
>= superblock
.oldest_map
);
8011 // make sure there is something new, here, before we bother flushing
8012 // the queues and such
8013 if (last
<= superblock
.newest_map
) {
8014 dout(10) << " no new maps here, dropping" << dendl
;
8020 bool skip_maps
= false;
8021 if (first
> superblock
.newest_map
+ 1) {
8022 dout(10) << "handle_osd_map message skips epochs "
8023 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
8024 if (m
->cluster_osdmap_trim_lower_bound
<= superblock
.newest_map
+ 1) {
8025 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8029 // always try to get the full range of maps--as many as we can. this
8030 // 1- is good to have
8031 // 2- is at present the only way to ensure that we get a *full* map as
8033 if (m
->cluster_osdmap_trim_lower_bound
< first
) {
8034 osdmap_subscribe(m
->cluster_osdmap_trim_lower_bound
- 1, true);
8041 ObjectStore::Transaction t
;
8042 uint64_t txn_size
= 0;
8044 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
8046 // store new maps: queue for disk and put in the osdmap cache
8047 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8048 for (epoch_t e
= start
; e
<= last
; e
++) {
8049 if (txn_size
>= t
.get_num_bytes()) {
8050 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8051 ceph_assert(txn_size
< t
.get_num_bytes());
8053 txn_size
= t
.get_num_bytes();
8054 map
<epoch_t
,bufferlist
>::iterator p
;
8055 p
= m
->maps
.find(e
);
8056 if (p
!= m
->maps
.end()) {
8057 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8058 OSDMap
*o
= new OSDMap
;
8059 bufferlist
& bl
= p
->second
;
8063 purged_snaps
[e
] = o
->get_new_purged_snaps();
8065 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8066 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8067 added_maps
[e
] = add_map(o
);
8068 added_maps_bl
[e
] = bl
;
8073 p
= m
->incremental_maps
.find(e
);
8074 if (p
!= m
->incremental_maps
.end()) {
8075 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8076 bufferlist
& bl
= p
->second
;
8077 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8078 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8080 OSDMap
*o
= new OSDMap
;
8083 bool got
= get_map_bl(e
- 1, obl
);
8085 auto p
= added_maps_bl
.find(e
- 1);
8086 ceph_assert(p
!= added_maps_bl
.end());
8092 OSDMap::Incremental inc
;
8093 auto p
= bl
.cbegin();
8096 if (o
->apply_incremental(inc
) < 0) {
8097 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8098 ceph_abort_msg("bad fsid");
8102 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8104 bool injected_failure
= false;
8105 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8106 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8107 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8108 injected_failure
= true;
8111 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8112 dout(2) << "got incremental " << e
8113 << " but failed to encode full with correct crc; requesting"
8115 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8116 dout(20) << "my encoded map was:\n";
8117 fbl
.hexdump(*_dout
);
8120 request_full_map(e
, last
);
8123 // don't continue committing if we failed to enc the first inc map
8125 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
8132 purged_snaps
[e
] = o
->get_new_purged_snaps();
8134 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8135 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8136 added_maps
[e
] = add_map(o
);
8137 added_maps_bl
[e
] = fbl
;
8141 ceph_abort_msg("MOSDMap lied about what maps it had?");
8144 // even if this map isn't from a mon, we may have satisfied our subscription
8145 monc
->sub_got("osdmap", last
);
8147 if (!m
->maps
.empty() && requested_full_first
) {
8148 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8149 << ".." << requested_full_last
<< dendl
;
8150 rerequest_full_maps();
8153 if (superblock
.oldest_map
) {
8154 // make sure we at least keep pace with incoming maps
8155 trim_maps(m
->cluster_osdmap_trim_lower_bound
,
8156 last
- first
+ 1, skip_maps
);
8157 pg_num_history
.prune(superblock
.oldest_map
);
8160 if (!superblock
.oldest_map
|| skip_maps
)
8161 superblock
.oldest_map
= first
;
8162 superblock
.newest_map
= last
;
8163 superblock
.current_epoch
= last
;
8165 // note in the superblock that we were clean thru the prior epoch
8166 epoch_t boot_epoch
= service
.get_boot_epoch();
8167 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8168 superblock
.mounted
= boot_epoch
;
8169 superblock
.clean_thru
= last
;
8172 // check for pg_num changes and deleted pools
8174 for (auto& i
: added_maps
) {
8176 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8177 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8178 << " probably first start of this osd" << dendl
;
8182 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8183 for (auto& j
: lastmap
->get_pools()) {
8184 if (!i
.second
->have_pg_pool(j
.first
)) {
8185 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8186 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8187 << j
.first
<< dendl
;
8188 // this information is needed by _make_pg() if have to restart before
8189 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8190 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8192 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8193 string name
= lastmap
->get_pool_name(j
.first
);
8195 map
<string
,string
> profile
;
8196 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8197 profile
= lastmap
->get_erasure_code_profile(
8198 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8200 encode(profile
, bl
);
8201 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8202 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8203 new_pg_num
!= j
.second
.get_pg_num()) {
8204 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8205 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8206 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8209 for (auto& j
: i
.second
->get_pools()) {
8210 if (!lastmap
->have_pg_pool(j
.first
)) {
8211 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8212 << j
.second
.get_pg_num() << dendl
;
8213 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8214 j
.second
.get_pg_num());
8219 pg_num_history
.epoch
= last
;
8222 ::encode(pg_num_history
, bl
);
8223 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8224 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8227 // record new purged_snaps
8228 if (superblock
.purged_snaps_last
== start
- 1) {
8229 OSDriver osdriver
{store
.get(), service
.meta_ch
, make_purged_snaps_oid()};
8230 SnapMapper::record_purged_snaps(
8233 osdriver
.get_transaction(&t
),
8235 superblock
.purged_snaps_last
= last
;
8237 dout(10) << __func__
<< " superblock purged_snaps_last is "
8238 << superblock
.purged_snaps_last
8239 << ", not recording new purged_snaps" << dendl
;
8242 // superblock and commit
8243 write_superblock(t
);
8244 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8245 store
->queue_transaction(
8248 service
.publish_superblock(superblock
);
8251 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8253 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8254 if (is_stopping()) {
8255 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8258 std::lock_guard
l(osd_lock
);
8259 if (is_stopping()) {
8260 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8265 ceph_assert(first
<= last
);
8267 bool do_shutdown
= false;
8268 bool do_restart
= false;
8269 bool network_error
= false;
8270 OSDMapRef osdmap
= get_osdmap();
8272 // advance through the new maps
8273 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8274 dout(10) << " advance to epoch " << cur
8275 << " (<= last " << last
8276 << " <= newest_map " << superblock
.newest_map
8279 OSDMapRef newmap
= get_map(cur
);
8280 ceph_assert(newmap
); // we just cached it above!
8282 // start blocklisting messages sent to peers that go down.
8283 service
.pre_publish_map(newmap
);
8285 // kill connections to newly down osds
8286 bool waited_for_reservations
= false;
8288 osdmap
= get_osdmap();
8289 osdmap
->get_all_osds(old
);
8290 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8292 osdmap
->is_up(*p
) && // in old map
8293 newmap
->is_down(*p
)) { // but not the new one
8294 if (!waited_for_reservations
) {
8295 service
.await_reserved_maps();
8296 waited_for_reservations
= true;
8299 } else if (*p
!= whoami
&&
8300 osdmap
->is_down(*p
) &&
8301 newmap
->is_up(*p
)) {
8306 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8307 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8310 // this captures the case where we sent the boot message while
8311 // NOUP was being set on the mon and our boot request was
8312 // dropped, and then later it is cleared. it imperfectly
8313 // handles the case where our original boot message was not
8314 // dropped and we restart even though we might have booted, but
8315 // that is harmless (boot will just take slightly longer).
8320 osdmap
= std::move(newmap
);
8324 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8326 osdmap
->is_up(whoami
) &&
8327 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8328 up_epoch
= osdmap
->get_epoch();
8329 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8331 boot_epoch
= osdmap
->get_epoch();
8332 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8334 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8338 epoch_t _bind_epoch
= service
.get_bind_epoch();
8339 if (osdmap
->is_up(whoami
) &&
8340 osdmap
->get_addrs(whoami
).legacy_equals(
8341 client_messenger
->get_myaddrs()) &&
8342 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8345 dout(1) << "state: booting -> active" << dendl
;
8346 set_state(STATE_ACTIVE
);
8349 // set incarnation so that osd_reqid_t's we generate for our
8350 // objecter requests are unique across restarts.
8351 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8352 cancel_pending_failures();
8356 if (osdmap
->get_epoch() > 0 &&
8358 if (!osdmap
->exists(whoami
)) {
8359 derr
<< "map says i do not exist. shutting down." << dendl
;
8360 do_shutdown
= true; // don't call shutdown() while we have
8361 // everything paused
8362 } else if (osdmap
->is_stop(whoami
)) {
8363 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8365 } else if (!osdmap
->is_up(whoami
) ||
8366 !osdmap
->get_addrs(whoami
).legacy_equals(
8367 client_messenger
->get_myaddrs()) ||
8368 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8369 cluster_messenger
->get_myaddrs()) ||
8370 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8371 hb_back_server_messenger
->get_myaddrs()) ||
8372 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8373 hb_front_server_messenger
->get_myaddrs())) {
8374 if (!osdmap
->is_up(whoami
)) {
8375 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8376 service
.got_stop_ack();
8378 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8379 "but it is still running";
8380 clog
->debug() << "map e" << osdmap
->get_epoch()
8381 << " wrongly marked me down at e"
8382 << osdmap
->get_down_at(whoami
);
8384 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8385 // note that this is best-effort...
8386 monc
->send_mon_message(
8390 osdmap
->get_epoch()));
8392 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8393 client_messenger
->get_myaddrs())) {
8394 clog
->error() << "map e" << osdmap
->get_epoch()
8395 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8396 << " != my " << client_messenger
->get_myaddrs() << ")";
8397 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8398 cluster_messenger
->get_myaddrs())) {
8399 clog
->error() << "map e" << osdmap
->get_epoch()
8400 << " had wrong cluster addr ("
8401 << osdmap
->get_cluster_addrs(whoami
)
8402 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8403 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8404 hb_back_server_messenger
->get_myaddrs())) {
8405 clog
->error() << "map e" << osdmap
->get_epoch()
8406 << " had wrong heartbeat back addr ("
8407 << osdmap
->get_hb_back_addrs(whoami
)
8408 << " != my " << hb_back_server_messenger
->get_myaddrs()
8410 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8411 hb_front_server_messenger
->get_myaddrs())) {
8412 clog
->error() << "map e" << osdmap
->get_epoch()
8413 << " had wrong heartbeat front addr ("
8414 << osdmap
->get_hb_front_addrs(whoami
)
8415 << " != my " << hb_front_server_messenger
->get_myaddrs()
8419 if (!service
.is_stopping()) {
8420 epoch_t up_epoch
= 0;
8421 epoch_t bind_epoch
= osdmap
->get_epoch();
8422 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8426 utime_t now
= ceph_clock_now();
8427 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8428 osd_markdown_log
.push_back(now
);
8429 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8430 derr
<< __func__
<< " marked down "
8431 << osd_markdown_log
.size()
8432 << " > osd_max_markdown_count "
8433 << cct
->_conf
->osd_max_markdown_count
8434 << " in last " << grace
<< " seconds, shutting down"
8440 start_waiting_for_healthy();
8442 set
<int> avoid_ports
;
8443 #if defined(__FreeBSD__)
8444 // prevent FreeBSD from grabbing the client_messenger port during
8445 // rebinding. In which case a cluster_meesneger will connect also
8447 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8449 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8451 int r
= cluster_messenger
->rebind(avoid_ports
);
8453 do_shutdown
= true; // FIXME: do_restart?
8454 network_error
= true;
8455 derr
<< __func__
<< " marked down:"
8456 << " rebind cluster_messenger failed" << dendl
;
8459 hb_back_server_messenger
->mark_down_all();
8460 hb_front_server_messenger
->mark_down_all();
8461 hb_front_client_messenger
->mark_down_all();
8462 hb_back_client_messenger
->mark_down_all();
8464 reset_heartbeat_peers(true);
8467 } else if (osdmap
->get_epoch() > 0 && osdmap
->is_stop(whoami
)) {
8468 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8474 check_osdmap_features();
8479 if (is_active() || is_waiting_for_healthy())
8480 maybe_update_heartbeat_peers();
8487 if (network_error
) {
8488 cancel_pending_failures();
8490 // trigger shutdown in a different thread
8491 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8492 queue_async_signal(SIGINT
);
8494 else if (m
->newest_map
&& m
->newest_map
> last
) {
8495 dout(10) << " msg say newest map is " << m
->newest_map
8496 << ", requesting more" << dendl
;
8497 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8499 else if (is_preboot()) {
8500 if (m
->get_source().is_mon())
8501 _preboot(m
->cluster_osdmap_trim_lower_bound
, m
->newest_map
);
8505 else if (do_restart
)
8510 void OSD::check_osdmap_features()
8512 // adjust required feature bits?
8514 // we have to be a bit careful here, because we are accessing the
8515 // Policy structures without taking any lock. in particular, only
8516 // modify integer values that can safely be read by a racing CPU.
8517 // since we are only accessing existing Policy structures a their
8518 // current memory location, and setting or clearing bits in integer
8519 // fields, and we are the only writer, this is not a problem.
8521 const auto osdmap
= get_osdmap();
8523 Messenger::Policy p
= client_messenger
->get_default_policy();
8525 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8526 if ((p
.features_required
& mask
) != features
) {
8527 dout(0) << "crush map has features " << features
8528 << ", adjusting msgr requires for clients" << dendl
;
8529 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8530 client_messenger
->set_default_policy(p
);
8534 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8536 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8537 if ((p
.features_required
& mask
) != features
) {
8538 dout(0) << "crush map has features " << features
8539 << " was " << p
.features_required
8540 << ", adjusting msgr requires for mons" << dendl
;
8541 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8542 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8546 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8548 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8550 if ((p
.features_required
& mask
) != features
) {
8551 dout(0) << "crush map has features " << features
8552 << ", adjusting msgr requires for osds" << dendl
;
8553 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8554 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8557 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8558 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8559 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8560 ObjectStore::Transaction t
;
8561 write_superblock(t
);
8562 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8563 ceph_assert(err
== 0);
8567 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8568 hb_front_server_messenger
->set_require_authorizer(false);
8569 hb_back_server_messenger
->set_require_authorizer(false);
8571 hb_front_server_messenger
->set_require_authorizer(true);
8572 hb_back_server_messenger
->set_require_authorizer(true);
8575 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8576 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8577 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8578 store
->write_meta("require_osd_release",
8579 stringify((int)osdmap
->require_osd_release
));
8580 last_require_osd_release
= osdmap
->require_osd_release
;
8584 struct C_FinishSplits
: public Context
{
8587 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8588 : osd(osd
), pgs(in
) {}
8589 void finish(int r
) override
{
8590 osd
->_finish_splits(pgs
);
8594 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8596 dout(10) << __func__
<< " " << pgs
<< dendl
;
8599 for (set
<PGRef
>::iterator i
= pgs
.begin();
8606 dout(10) << __func__
<< " " << *pg
<< dendl
;
8607 epoch_t e
= pg
->get_osdmap_epoch();
8608 pg
->handle_initialize(rctx
);
8609 pg
->queue_null(e
, e
);
8610 dispatch_context(rctx
, pg
, service
.get_osdmap());
8613 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8614 shards
[shard_index
]->register_and_wake_split_child(pg
);
8618 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8621 std::lock_guard
l(merge_lock
);
8622 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8623 p
[src
->pg_id
] = src
;
8624 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8625 << " for " << target
<< ", have " << p
.size() << "/" << need
8627 return p
.size() == need
;
8630 bool OSD::advance_pg(
8633 ThreadPool::TPHandle
&handle
,
8636 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8639 ceph_assert(pg
->is_locked());
8640 OSDMapRef lastmap
= pg
->get_osdmap();
8641 set
<PGRef
> new_pgs
; // any split children
8644 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8645 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8646 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8647 next_epoch
<= osd_epoch
;
8649 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8651 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8655 unsigned new_pg_num
=
8656 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8657 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8658 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8660 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8662 if (pg
->pg_id
.is_merge_source(
8666 // we are merge source
8667 PGRef spg
= pg
; // carry a ref
8668 dout(1) << __func__
<< " " << pg
->pg_id
8669 << " is merge source, target is " << parent
8671 pg
->write_if_dirty(rctx
);
8672 if (!new_pgs
.empty()) {
8673 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8677 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8679 // release backoffs explicitly, since the on_shutdown path
8680 // aggressively tears down backoff state.
8681 if (pg
->is_primary()) {
8682 pg
->release_pg_backoffs();
8685 OSDShard
*sdata
= pg
->osd_shard
;
8687 std::lock_guard
l(sdata
->shard_lock
);
8689 sdata
->_detach_pg(pg
->pg_slot
);
8690 // update pg count now since we might not get an osdmap
8692 if (pg
->is_primary())
8693 logger
->dec(l_osd_pg_primary
);
8694 else if (pg
->is_nonprimary())
8695 logger
->dec(l_osd_pg_replica
); // misnomer
8697 logger
->dec(l_osd_pg_stray
);
8702 set
<spg_t
> children
;
8703 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8704 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8705 enqueue_peering_evt(
8708 std::make_shared
<PGPeeringEvent
>(
8709 nextmap
->get_epoch(),
8710 nextmap
->get_epoch(),
8715 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8716 // we are merge target
8717 set
<spg_t
> children
;
8718 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8719 dout(20) << __func__
<< " " << pg
->pg_id
8720 << " is merge target, sources are " << children
8722 map
<spg_t
,PGRef
> sources
;
8724 std::lock_guard
l(merge_lock
);
8725 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8726 unsigned need
= children
.size();
8727 dout(20) << __func__
<< " have " << s
.size() << "/"
8729 if (s
.size() == need
) {
8731 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8732 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8733 merge_waiters
.erase(nextmap
->get_epoch());
8737 if (!sources
.empty()) {
8738 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8739 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8740 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8742 sources
, rctx
, split_bits
,
8743 nextmap
->get_pg_pool(
8744 pg
->pg_id
.pool())->last_pg_merge_meta
);
8745 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8747 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8748 pg
->write_if_dirty(rctx
);
8749 if (!new_pgs
.empty()) {
8750 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8754 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8756 // kick source(s) to get them ready
8757 for (auto& i
: children
) {
8758 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8759 enqueue_peering_evt(
8762 std::make_shared
<PGPeeringEvent
>(
8763 nextmap
->get_epoch(),
8764 nextmap
->get_epoch(),
8774 vector
<int> newup
, newacting
;
8775 int up_primary
, acting_primary
;
8776 nextmap
->pg_to_up_acting_osds(
8778 &newup
, &up_primary
,
8779 &newacting
, &acting_primary
);
8780 pg
->handle_advance_map(
8781 nextmap
, lastmap
, newup
, up_primary
,
8782 newacting
, acting_primary
, rctx
);
8784 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8785 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8786 if (oldpool
!= lastmap
->get_pools().end()
8787 && newpool
!= nextmap
->get_pools().end()) {
8788 dout(20) << __func__
8789 << " new pool opts " << newpool
->second
.opts
8790 << " old pool opts " << oldpool
->second
.opts
8793 double old_min_interval
= 0, new_min_interval
= 0;
8794 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8795 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8797 double old_max_interval
= 0, new_max_interval
= 0;
8798 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8799 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8801 // Assume if an interval is change from set to unset or vice versa the actual config
8802 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8804 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8805 pg
->on_info_history_change();
8809 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8811 set
<spg_t
> children
;
8812 if (pg
->pg_id
.is_split(
8817 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8823 old_pg_num
= new_pg_num
;
8824 handle
.reset_tp_timeout();
8826 pg
->handle_activate_map(rctx
);
8830 if (!new_pgs
.empty()) {
8831 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8836 void OSD::consume_map()
8838 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8839 auto osdmap
= get_osdmap();
8840 dout(20) << __func__
<< " version " << osdmap
->get_epoch() << dendl
;
8842 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8843 * speak the older sorting version any more. Be careful not to force
8844 * a shutdown if we are merely processing old maps, though.
8846 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8847 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8850 service
.pre_publish_map(osdmap
);
8851 service
.await_reserved_maps();
8852 service
.publish_map(osdmap
);
8853 dout(20) << "consume_map " << osdmap
->get_epoch() << " -- publish done" << dendl
;
8854 // prime splits and merges
8855 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8856 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8857 for (auto& shard
: shards
) {
8858 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8860 if (!newly_split
.empty()) {
8861 for (auto& shard
: shards
) {
8862 shard
->prime_splits(osdmap
, &newly_split
);
8864 ceph_assert(newly_split
.empty());
8867 // prune sent_ready_to_merge
8868 service
.prune_sent_ready_to_merge(osdmap
);
8870 // FIXME, maybe: We could race against an incoming peering message
8871 // that instantiates a merge PG after identify_merges() below and
8872 // never set up its peer to complete the merge. An OSD restart
8873 // would clear it up. This is a hard race to resolve,
8874 // extraordinarily rare (we only merge PGs that are stable and
8875 // clean, so it'd have to be an imported PG to an OSD with a
8876 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8877 // replace all of this with a seastar-based code soon anyway.
8878 if (!merge_pgs
.empty()) {
8879 // mark the pgs we already have, or create new and empty merge
8880 // participants for those we are missing. do this all under the
8881 // shard lock so we don't have to worry about racing pg creates
8883 for (auto& shard
: shards
) {
8884 shard
->prime_merges(osdmap
, &merge_pgs
);
8886 ceph_assert(merge_pgs
.empty());
8889 service
.prune_pg_created();
8891 unsigned pushes_to_free
= 0;
8892 for (auto& shard
: shards
) {
8893 shard
->consume_map(osdmap
, &pushes_to_free
);
8896 vector
<spg_t
> pgids
;
8899 // count (FIXME, probably during seastar rewrite)
8900 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8903 for (auto& pg
: pgs
) {
8904 // FIXME (probably during seastar rewrite): this is lockless and
8905 // racy, but we don't want to take pg lock here.
8906 if (pg
->is_primary())
8908 else if (pg
->is_nonprimary())
8909 num_pg_replica
++; // misnomer
8915 // FIXME (as part of seastar rewrite): move to OSDShard
8916 std::lock_guard
l(pending_creates_lock
);
8917 for (auto pg
= pending_creates_from_osd
.begin();
8918 pg
!= pending_creates_from_osd
.end();) {
8919 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8920 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8921 << "discarding pending_create_from_osd" << dendl
;
8922 pg
= pending_creates_from_osd
.erase(pg
);
8929 service
.maybe_inject_dispatch_delay();
8931 dispatch_sessions_waiting_on_map();
8933 service
.maybe_inject_dispatch_delay();
8935 service
.release_reserved_pushes(pushes_to_free
);
8937 // queue null events to push maps down to individual PGs
8938 for (auto pgid
: pgids
) {
8939 enqueue_peering_evt(
8942 std::make_shared
<PGPeeringEvent
>(
8943 osdmap
->get_epoch(),
8944 osdmap
->get_epoch(),
8947 logger
->set(l_osd_pg
, pgids
.size());
8948 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8949 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8950 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8953 void OSD::activate_map()
8955 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8956 auto osdmap
= get_osdmap();
8958 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8961 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8962 if (!service
.recovery_is_paused()) {
8963 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8964 service
.pause_recovery();
8967 if (service
.recovery_is_paused()) {
8968 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8969 service
.unpause_recovery();
8973 service
.activate_map();
8976 bool OSD::require_mon_peer(const Message
*m
)
8978 if (!m
->get_connection()->peer_is_mon()) {
8979 dout(0) << "require_mon_peer received from non-mon "
8980 << m
->get_connection()->get_peer_addr()
8981 << " " << *m
<< dendl
;
8987 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8989 if (!m
->get_connection()->peer_is_mon() &&
8990 !m
->get_connection()->peer_is_mgr()) {
8991 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8992 << m
->get_connection()->get_peer_addr()
8993 << " " << *m
<< dendl
;
8999 bool OSD::require_osd_peer(const Message
*m
)
9001 if (!m
->get_connection()->peer_is_osd()) {
9002 dout(0) << "require_osd_peer received from non-osd "
9003 << m
->get_connection()->get_peer_addr()
9004 << " " << *m
<< dendl
;
9010 // ----------------------------------------
9013 void OSD::split_pgs(
9015 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
9020 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
9021 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
9023 vector
<object_stat_sum_t
> updated_stats
;
9024 parent
->start_split_stats(childpgids
, &updated_stats
);
9026 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
9027 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
9028 i
!= childpgids
.end();
9030 ceph_assert(stat_iter
!= updated_stats
.end());
9031 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
9032 PG
* child
= _make_pg(nextmap
, *i
);
9034 out_pgs
->insert(child
);
9035 child
->ch
= store
->create_new_collection(child
->coll
);
9038 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
9039 assert(NULL
!= shards
[shard_index
]);
9040 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
9043 unsigned split_bits
= i
->get_split_bits(pg_num
);
9044 dout(10) << " pg_num is " << pg_num
9045 << ", m_seed " << i
->ps()
9046 << ", split_bits is " << split_bits
<< dendl
;
9047 parent
->split_colls(
9051 &child
->get_pgpool().info
,
9058 child
->init_collection_pool_opts();
9060 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
9063 ceph_assert(stat_iter
!= updated_stats
.end());
9064 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
9067 // ----------------------------------------
9068 // peering and recovery
9070 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9071 ThreadPool::TPHandle
*handle
)
9073 if (!service
.get_osdmap()->is_up(whoami
)) {
9074 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9075 } else if (!is_active()) {
9076 dout(20) << __func__
<< " not active" << dendl
;
9078 for (auto& [osd
, ls
] : ctx
.message_map
) {
9079 if (!curmap
->is_up(osd
)) {
9080 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9083 ConnectionRef con
= service
.get_con_osd_cluster(
9084 osd
, curmap
->get_epoch());
9086 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9090 service
.maybe_share_map(con
.get(), curmap
);
9092 con
->send_message2(m
);
9097 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9098 int tr
= store
->queue_transaction(
9100 std::move(ctx
.transaction
), TrackedOpRef(),
9102 ceph_assert(tr
== 0);
9106 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9108 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9109 if (!require_mon_peer(m
)) {
9113 for (auto& p
: m
->pgs
) {
9114 spg_t pgid
= p
.first
;
9115 epoch_t created
= p
.second
.first
;
9116 utime_t created_stamp
= p
.second
.second
;
9117 auto q
= m
->pg_extra
.find(pgid
);
9118 if (q
== m
->pg_extra
.end()) {
9119 clog
->error() << __func__
<< " " << pgid
<< " e" << created
9120 << "@" << created_stamp
<< " with no history or past_intervals"
9121 << ", this should be impossible after octopus. Ignoring.";
9123 dout(20) << __func__
<< " " << pgid
<< " e" << created
9124 << "@" << created_stamp
9125 << " history " << q
->second
.first
9126 << " pi " << q
->second
.second
<< dendl
;
9127 if (!q
->second
.second
.empty() &&
9128 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9129 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9130 << " and unmatched past_intervals " << q
->second
.second
9131 << " (history " << q
->second
.first
<< ")";
9133 enqueue_peering_evt(
9136 std::make_shared
<PGPeeringEvent
>(
9153 std::lock_guard
l(pending_creates_lock
);
9154 if (pending_creates_from_mon
== 0) {
9155 last_pg_create_epoch
= m
->epoch
;
9162 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9164 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9165 if (!require_osd_peer(m
)) {
9169 int from
= m
->get_source().num();
9170 for (auto& p
: m
->get_pg_list()) {
9171 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9172 enqueue_peering_evt(
9175 std::make_shared
<PGPeeringEvent
>(
9179 pgid
, pg_shard_t(from
, p
.from
),
9181 m
->get_connection()->get_features()),
9194 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9196 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9197 if (!require_osd_peer(m
)) {
9201 int from
= m
->get_source().num();
9202 for (auto& p
: m
->pg_list
) {
9203 enqueue_peering_evt(
9204 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9206 std::make_shared
<PGPeeringEvent
>(
9207 p
.epoch_sent
, p
.query_epoch
,
9209 pg_shard_t(from
, p
.from
),
9217 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9219 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9220 if (!require_osd_peer(m
)) {
9224 for (auto& pgid
: m
->pg_list
) {
9225 enqueue_peering_evt(
9228 std::make_shared
<PGPeeringEvent
>(
9229 m
->get_epoch(), m
->get_epoch(),
9230 PeeringState::DeleteStart())));
9235 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9237 dout(10) << __func__
<< " " << *m
<< dendl
;
9238 if (!require_mon_or_mgr_peer(m
)) {
9242 epoch_t epoch
= get_osdmap_epoch();
9243 for (auto pgid
: m
->forced_pgs
) {
9244 if (m
->options
& OFR_BACKFILL
) {
9245 if (m
->options
& OFR_CANCEL
) {
9246 enqueue_peering_evt(
9249 std::make_shared
<PGPeeringEvent
>(
9251 PeeringState::UnsetForceBackfill())));
9253 enqueue_peering_evt(
9256 std::make_shared
<PGPeeringEvent
>(
9258 PeeringState::SetForceBackfill())));
9260 } else if (m
->options
& OFR_RECOVERY
) {
9261 if (m
->options
& OFR_CANCEL
) {
9262 enqueue_peering_evt(
9265 std::make_shared
<PGPeeringEvent
>(
9267 PeeringState::UnsetForceRecovery())));
9269 enqueue_peering_evt(
9272 std::make_shared
<PGPeeringEvent
>(
9274 PeeringState::SetForceRecovery())));
9281 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9283 spg_t pgid
= q
.pgid
;
9284 dout(10) << __func__
<< " " << pgid
<< dendl
;
9286 OSDMapRef osdmap
= get_osdmap();
9287 if (!osdmap
->have_pg_pool(pgid
.pool()))
9290 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9291 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9292 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9295 if (q
.query
.type
== pg_query_t::LOG
||
9296 q
.query
.type
== pg_query_t::FULLLOG
) {
9298 q
.query
.from
, q
.query
.to
,
9299 osdmap
->get_epoch(), empty
,
9300 q
.query
.epoch_sent
);
9302 pg_notify_t notify
{q
.query
.from
, q
.query
.to
,
9304 osdmap
->get_epoch(),
9307 m
= new MOSDPGNotify2(spg_t
{pgid
.pgid
, q
.query
.from
},
9310 service
.maybe_share_map(con
.get(), osdmap
);
9311 con
->send_message(m
);
9315 void OSDService::queue_check_readable(spg_t spgid
,
9317 ceph::signedspan delay
)
9319 if (delay
== ceph::signedspan::zero()) {
9320 osd
->enqueue_peering_evt(
9323 std::make_shared
<PGPeeringEvent
>(
9325 PeeringState::CheckReadable())));
9327 mono_timer
.add_event(
9329 [this, spgid
, lpr
]() {
9330 queue_check_readable(spgid
, lpr
);
9336 // =========================================================
9339 void OSDService::_maybe_queue_recovery() {
9340 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9341 uint64_t available_pushes
;
9342 while (!awaiting_throttle
.empty() &&
9343 _recover_now(&available_pushes
)) {
9344 uint64_t to_start
= std::min(
9346 cct
->_conf
->osd_recovery_max_single_start
);
9347 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9348 awaiting_throttle
.pop_front();
9349 dout(10) << __func__
<< " starting " << to_start
9350 << ", recovery_ops_reserved " << recovery_ops_reserved
9351 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9352 recovery_ops_reserved
+= to_start
;
9356 bool OSDService::_recover_now(uint64_t *available_pushes
)
9358 if (available_pushes
)
9359 *available_pushes
= 0;
9361 if (ceph_clock_now() < defer_recovery_until
) {
9362 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9366 if (recovery_paused
) {
9367 dout(15) << __func__
<< " paused" << dendl
;
9371 uint64_t max
= osd
->get_recovery_max_active();
9372 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9373 dout(15) << __func__
<< " active " << recovery_ops_active
9374 << " + reserved " << recovery_ops_reserved
9375 << " >= max " << max
<< dendl
;
9379 if (available_pushes
)
9380 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9385 unsigned OSDService::get_target_pg_log_entries() const
9387 auto num_pgs
= osd
->get_num_pgs();
9388 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9389 if (num_pgs
> 0 && target
> 0) {
9390 // target an even spread of our budgeted log entries across all
9391 // PGs. note that while we only get to control the entry count
9392 // for primary PGs, we'll normally be responsible for a mix of
9393 // primary and replica PGs (for the same pool(s) even), so this
9395 return std::max
<unsigned>(
9396 std::min
<unsigned>(target
/ num_pgs
,
9397 cct
->_conf
->osd_max_pg_log_entries
),
9398 cct
->_conf
->osd_min_pg_log_entries
);
9400 // fall back to a per-pg value.
9401 return cct
->_conf
->osd_min_pg_log_entries
;
9405 void OSD::do_recovery(
9406 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
, int priority
,
9407 ThreadPool::TPHandle
&handle
)
9409 uint64_t started
= 0;
9412 * When the value of osd_recovery_sleep is set greater than zero, recovery
9413 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9414 * recovery event's schedule time. This is done by adding a
9415 * recovery_requeue_callback event, which re-queues the recovery op using
9416 * queue_recovery_after_sleep.
9418 float recovery_sleep
= get_osd_recovery_sleep();
9420 std::lock_guard
l(service
.sleep_lock
);
9421 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9423 auto recovery_requeue_callback
= new LambdaContext(
9424 [this, pgref
, queued
, reserved_pushes
, priority
](int r
) {
9425 dout(20) << "do_recovery wake up at "
9427 << ", re-queuing recovery" << dendl
;
9428 std::lock_guard
l(service
.sleep_lock
);
9429 service
.recovery_needs_sleep
= false;
9430 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
, priority
);
9433 // This is true for the first recovery op and when the previous recovery op
9434 // has been scheduled in the past. The next recovery op is scheduled after
9435 // completing the sleep from now.
9437 if (auto now
= ceph::real_clock::now();
9438 service
.recovery_schedule_time
< now
) {
9439 service
.recovery_schedule_time
= now
;
9441 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9442 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9443 recovery_requeue_callback
);
9444 dout(20) << "Recovery event scheduled at "
9445 << service
.recovery_schedule_time
<< dendl
;
9452 std::lock_guard
l(service
.sleep_lock
);
9453 service
.recovery_needs_sleep
= true;
9456 if (pg
->pg_has_reset_since(queued
)) {
9460 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9461 #ifdef DEBUG_RECOVERY_OIDS
9462 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9465 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9466 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9467 << " on " << *pg
<< dendl
;
9471 rctx
.handle
= &handle
;
9472 pg
->find_unfound(queued
, rctx
);
9473 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9478 ceph_assert(started
<= reserved_pushes
);
9479 service
.release_reserved_pushes(reserved_pushes
);
9482 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9484 std::lock_guard
l(recovery_lock
);
9485 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9486 << " (" << recovery_ops_active
<< "/"
9487 << osd
->get_recovery_max_active() << " rops)"
9489 recovery_ops_active
++;
9491 #ifdef DEBUG_RECOVERY_OIDS
9492 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9493 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9494 recovery_oids
[pg
->pg_id
].insert(soid
);
9498 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9500 std::lock_guard
l(recovery_lock
);
9501 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9502 << " dequeue=" << dequeue
9503 << " (" << recovery_ops_active
<< "/"
9504 << osd
->get_recovery_max_active() << " rops)"
9508 ceph_assert(recovery_ops_active
> 0);
9509 recovery_ops_active
--;
9511 #ifdef DEBUG_RECOVERY_OIDS
9512 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9513 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9514 recovery_oids
[pg
->pg_id
].erase(soid
);
9517 _maybe_queue_recovery();
9520 bool OSDService::is_recovery_active()
9522 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9525 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9528 void OSDService::release_reserved_pushes(uint64_t pushes
)
9530 std::lock_guard
l(recovery_lock
);
9531 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9532 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9534 ceph_assert(recovery_ops_reserved
>= pushes
);
9535 recovery_ops_reserved
-= pushes
;
9536 _maybe_queue_recovery();
9539 // =========================================================
9542 bool OSD::op_is_discardable(const MOSDOp
*op
)
9544 // drop client request if they are not connected and can't get the
9546 if (!op
->get_connection()->is_connected()) {
9552 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9554 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9555 const utime_t latency
= ceph_clock_now() - stamp
;
9556 const unsigned priority
= op
->get_req()->get_priority();
9557 const int cost
= op
->get_req()->get_cost();
9558 const uint64_t owner
= op
->get_req()->get_source().num();
9559 const int type
= op
->get_req()->get_type();
9561 dout(15) << "enqueue_op " << *op
->get_req() << " prio " << priority
9564 << " latency " << latency
9565 << " epoch " << epoch
9566 << " " << *(op
->get_req()) << dendl
;
9567 op
->osd_trace
.event("enqueue op");
9568 op
->osd_trace
.keyval("priority", priority
);
9569 op
->osd_trace
.keyval("cost", cost
);
9571 auto enqueue_span
= tracing::osd::tracer
.add_span(__func__
, op
->osd_parent_span
);
9572 enqueue_span
->AddEvent(__func__
, {
9573 {"priority", priority
},
9580 op
->mark_queued_for_pg();
9581 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9582 if (PGRecoveryMsg::is_recovery_msg(op
)) {
9585 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGRecoveryMsg(pg
, std::move(op
))),
9586 cost
, priority
, stamp
, owner
, epoch
));
9590 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9591 cost
, priority
, stamp
, owner
, epoch
));
9595 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9597 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9600 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9602 cct
->_conf
->osd_peering_op_priority
,
9605 evt
->get_epoch_sent()));
9609 * NOTE: dequeue called in worker thread, with pg lock
9611 void OSD::dequeue_op(
9612 PGRef pg
, OpRequestRef op
,
9613 ThreadPool::TPHandle
&handle
)
9615 const Message
*m
= op
->get_req();
9618 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9620 utime_t now
= ceph_clock_now();
9621 op
->set_dequeued_time(now
);
9623 utime_t latency
= now
- m
->get_recv_stamp();
9624 dout(10) << "dequeue_op " << *op
->get_req()
9625 << " prio " << m
->get_priority()
9626 << " cost " << m
->get_cost()
9627 << " latency " << latency
9629 << " pg " << *pg
<< dendl
;
9631 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9633 service
.maybe_share_map(m
->get_connection().get(),
9637 if (pg
->is_deleting())
9640 op
->mark_reached_pg();
9641 op
->osd_trace
.event("dequeue_op");
9643 pg
->do_request(op
, handle
);
9646 dout(10) << "dequeue_op " << *op
->get_req() << " finish" << dendl
;
9647 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9651 void OSD::dequeue_peering_evt(
9654 PGPeeringEventRef evt
,
9655 ThreadPool::TPHandle
& handle
)
9657 auto curmap
= sdata
->get_osdmap();
9658 bool need_up_thru
= false;
9659 epoch_t same_interval_since
= 0;
9661 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9662 handle_pg_query_nopg(*q
);
9664 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9667 } else if (PeeringCtx rctx
;
9668 advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9669 pg
->do_peering_event(evt
, rctx
);
9670 if (pg
->is_deleted()) {
9674 dispatch_context(rctx
, pg
, curmap
, &handle
);
9675 need_up_thru
= pg
->get_need_up_thru();
9676 same_interval_since
= pg
->get_same_interval_since();
9681 queue_want_up_thru(same_interval_since
);
9684 service
.send_pg_temp();
9687 void OSD::dequeue_delete(
9691 ThreadPool::TPHandle
& handle
)
9693 dequeue_peering_evt(
9697 std::make_shared
<PGPeeringEvent
>(
9699 PeeringState::DeleteSome())),
9705 // --------------------------------
9707 const char** OSD::get_tracked_conf_keys() const
9709 static const char* KEYS
[] = {
9710 "osd_max_backfills",
9711 "osd_min_recovery_priority",
9712 "osd_max_trimming_pgs",
9713 "osd_op_complaint_time",
9714 "osd_op_log_threshold",
9715 "osd_op_history_size",
9716 "osd_op_history_duration",
9717 "osd_op_history_slow_op_size",
9718 "osd_op_history_slow_op_threshold",
9719 "osd_enable_op_tracker",
9720 "osd_map_cache_size",
9721 "osd_pg_epoch_max_lag_factor",
9722 "osd_pg_epoch_persisted_max_stale",
9723 "osd_recovery_sleep",
9724 "osd_recovery_sleep_hdd",
9725 "osd_recovery_sleep_ssd",
9726 "osd_recovery_sleep_hybrid",
9728 "osd_delete_sleep_hdd",
9729 "osd_delete_sleep_ssd",
9730 "osd_delete_sleep_hybrid",
9731 "osd_snap_trim_sleep",
9732 "osd_snap_trim_sleep_hdd",
9733 "osd_snap_trim_sleep_ssd",
9734 "osd_snap_trim_sleep_hybrid",
9736 "osd_recovery_max_active",
9737 "osd_recovery_max_active_hdd",
9738 "osd_recovery_max_active_ssd",
9739 // clog & admin clog
9742 "clog_to_syslog_facility",
9743 "clog_to_syslog_level",
9744 "osd_objectstore_fuse",
9746 "clog_to_graylog_host",
9747 "clog_to_graylog_port",
9750 "osd_recovery_delay_start",
9751 "osd_client_message_size_cap",
9752 "osd_client_message_cap",
9753 "osd_heartbeat_min_size",
9754 "osd_heartbeat_interval",
9755 "osd_object_clean_region_max_num_intervals",
9756 "osd_scrub_min_interval",
9757 "osd_scrub_max_interval",
9763 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9764 const std::set
<std::string
> &changed
)
9766 std::lock_guard l
{osd_lock
};
9768 if (changed
.count("osd_max_backfills") ||
9769 changed
.count("osd_recovery_max_active") ||
9770 changed
.count("osd_recovery_max_active_hdd") ||
9771 changed
.count("osd_recovery_max_active_ssd")) {
9772 if (!maybe_override_options_for_qos(&changed
) &&
9773 changed
.count("osd_max_backfills")) {
9774 // Scheduler is not "mclock". Fallback to earlier behavior
9775 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9776 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9779 if (changed
.count("osd_delete_sleep") ||
9780 changed
.count("osd_delete_sleep_hdd") ||
9781 changed
.count("osd_delete_sleep_ssd") ||
9782 changed
.count("osd_delete_sleep_hybrid") ||
9783 changed
.count("osd_snap_trim_sleep") ||
9784 changed
.count("osd_snap_trim_sleep_hdd") ||
9785 changed
.count("osd_snap_trim_sleep_ssd") ||
9786 changed
.count("osd_snap_trim_sleep_hybrid") ||
9787 changed
.count("osd_scrub_sleep") ||
9788 changed
.count("osd_recovery_sleep") ||
9789 changed
.count("osd_recovery_sleep_hdd") ||
9790 changed
.count("osd_recovery_sleep_ssd") ||
9791 changed
.count("osd_recovery_sleep_hybrid")) {
9792 maybe_override_sleep_options_for_qos();
9794 if (changed
.count("osd_pg_delete_cost")) {
9795 maybe_override_cost_for_qos();
9797 if (changed
.count("osd_min_recovery_priority")) {
9798 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9799 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9801 if (changed
.count("osd_max_trimming_pgs")) {
9802 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9804 if (changed
.count("osd_op_complaint_time") ||
9805 changed
.count("osd_op_log_threshold")) {
9806 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
9807 cct
->_conf
->osd_op_log_threshold
);
9809 if (changed
.count("osd_op_history_size") ||
9810 changed
.count("osd_op_history_duration")) {
9811 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
9812 cct
->_conf
->osd_op_history_duration
);
9814 if (changed
.count("osd_op_history_slow_op_size") ||
9815 changed
.count("osd_op_history_slow_op_threshold")) {
9816 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
9817 cct
->_conf
->osd_op_history_slow_op_threshold
);
9819 if (changed
.count("osd_enable_op_tracker")) {
9820 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
9822 if (changed
.count("osd_map_cache_size")) {
9823 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9824 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9825 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
9827 if (changed
.count("clog_to_monitors") ||
9828 changed
.count("clog_to_syslog") ||
9829 changed
.count("clog_to_syslog_level") ||
9830 changed
.count("clog_to_syslog_facility") ||
9831 changed
.count("clog_to_graylog") ||
9832 changed
.count("clog_to_graylog_host") ||
9833 changed
.count("clog_to_graylog_port") ||
9834 changed
.count("host") ||
9835 changed
.count("fsid")) {
9836 update_log_config();
9838 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
9839 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
9840 "osd_pg_epoch_max_lag_factor");
9844 if (changed
.count("osd_objectstore_fuse")) {
9846 enable_disable_fuse(false);
9851 if (changed
.count("osd_recovery_delay_start")) {
9852 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
9853 service
.kick_recovery_queue();
9856 if (changed
.count("osd_client_message_cap")) {
9857 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
9858 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9859 if (pol
.throttler_messages
) {
9860 pol
.throttler_messages
->reset_max(newval
);
9863 if (changed
.count("osd_client_message_size_cap")) {
9864 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
9865 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
9866 if (pol
.throttler_bytes
) {
9867 pol
.throttler_bytes
->reset_max(newval
);
9870 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
9871 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
9874 if (changed
.count("osd_scrub_min_interval") ||
9875 changed
.count("osd_scrub_max_interval")) {
9876 resched_all_scrubs();
9877 dout(0) << __func__
<< ": scrub interval change" << dendl
;
9880 if (changed
.count("osd_asio_thread_count")) {
9881 service
.poolctx
.stop();
9882 service
.poolctx
.start(conf
.get_val
<std::uint64_t>("osd_asio_thread_count"));
9886 void OSD::maybe_override_max_osd_capacity_for_qos()
9888 // If the scheduler enabled is mclock, override the default
9889 // osd capacity with the value obtained from running the
9890 // osd bench test. This is later used to setup mclock.
9891 if ((cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") &&
9892 (cct
->_conf
.get_val
<bool>("osd_mclock_skip_benchmark") == false) &&
9893 (!unsupported_objstore_for_qos())) {
9894 std::string max_capacity_iops_config
;
9895 bool force_run_benchmark
=
9896 cct
->_conf
.get_val
<bool>("osd_mclock_force_run_benchmark_on_init");
9898 if (store_is_rotational
) {
9899 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_hdd";
9901 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_ssd";
9904 double default_iops
= 0.0;
9905 double cur_iops
= 0.0;
9906 if (!force_run_benchmark
) {
9907 // Get the current osd iops capacity
9908 cur_iops
= cct
->_conf
.get_val
<double>(max_capacity_iops_config
);
9910 // Get the default max iops capacity
9911 auto val
= cct
->_conf
.get_val_default(max_capacity_iops_config
);
9912 if (!val
.has_value()) {
9913 derr
<< __func__
<< " Unable to determine default value of "
9914 << max_capacity_iops_config
<< dendl
;
9915 // Cannot determine default iops. Force a run of the OSD benchmark.
9916 force_run_benchmark
= true;
9919 default_iops
= std::stod(val
.value());
9922 // Determine if we really need to run the osd benchmark
9923 if (!force_run_benchmark
&& (default_iops
!= cur_iops
)) {
9924 dout(1) << __func__
<< std::fixed
<< std::setprecision(2)
9925 << " default_iops: " << default_iops
9926 << " cur_iops: " << cur_iops
9927 << ". Skip OSD benchmark test." << dendl
;
9932 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
9933 int64_t count
= 12288000; // Count of bytes to write
9934 int64_t bsize
= 4096; // Block size
9935 int64_t osize
= 4194304; // Object size
9936 int64_t onum
= 100; // Count of objects to write
9937 double elapsed
= 0.0; // Time taken to complete the test
9940 int ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
9943 << " osd bench err: " << ret
9944 << " osd bench errstr: " << ss
.str()
9949 double rate
= count
/ elapsed
;
9950 iops
= rate
/ bsize
;
9952 << " osd bench result -"
9953 << std::fixed
<< std::setprecision(3)
9954 << " bandwidth (MiB/sec): " << rate
/ (1024 * 1024)
9955 << " iops: " << iops
9956 << " elapsed_sec: " << elapsed
9959 // Get the threshold IOPS set for the underlying hdd/ssd.
9960 double threshold_iops
= 0.0;
9961 if (store_is_rotational
) {
9962 threshold_iops
= cct
->_conf
.get_val
<double>(
9963 "osd_mclock_iops_capacity_threshold_hdd");
9965 threshold_iops
= cct
->_conf
.get_val
<double>(
9966 "osd_mclock_iops_capacity_threshold_ssd");
9969 // Persist the iops value to the MON store or throw cluster warning
9970 // if the measured iops exceeds the set threshold. If the iops exceed
9971 // the threshold, the default value is used.
9972 if (iops
> threshold_iops
) {
9973 clog
->warn() << "OSD bench result of " << std::to_string(iops
)
9974 << " IOPS exceeded the threshold limit of "
9975 << std::to_string(threshold_iops
) << " IOPS for osd."
9976 << std::to_string(whoami
) << ". IOPS capacity is unchanged"
9977 << " at " << std::to_string(cur_iops
) << " IOPS. The"
9978 << " recommendation is to establish the osd's IOPS capacity"
9979 << " using other benchmark tools (e.g. Fio) and then"
9980 << " override osd_mclock_max_capacity_iops_[hdd|ssd].";
9982 mon_cmd_set_config(max_capacity_iops_config
, std::to_string(iops
));
9987 bool OSD::maybe_override_options_for_qos(const std::set
<std::string
> *changed
)
9989 // Override options only if the scheduler enabled is mclock and the
9990 // underlying objectstore is supported by mclock
9991 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler" &&
9992 !unsupported_objstore_for_qos()) {
9993 static const std::map
<std::string
, uint64_t> recovery_qos_defaults
{
9994 {"osd_recovery_max_active", 0},
9995 {"osd_recovery_max_active_hdd", 3},
9996 {"osd_recovery_max_active_ssd", 10},
9997 {"osd_max_backfills", 1},
10000 // Check if we were called because of a configuration change
10001 if (changed
!= nullptr) {
10002 if (cct
->_conf
.get_val
<bool>("osd_mclock_override_recovery_settings")) {
10003 if (changed
->count("osd_max_backfills")) {
10004 dout(1) << __func__
<< " Set local and remote max backfills to "
10005 << cct
->_conf
->osd_max_backfills
<< dendl
;
10006 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10007 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10010 // Recovery options change was attempted without setting
10011 // the 'osd_mclock_override_recovery_settings' option.
10012 // Find the key to remove from the configuration db.
10014 if (changed
->count("osd_max_backfills")) {
10015 key
= "osd_max_backfills";
10016 } else if (changed
->count("osd_recovery_max_active")) {
10017 key
= "osd_recovery_max_active";
10018 } else if (changed
->count("osd_recovery_max_active_hdd")) {
10019 key
= "osd_recovery_max_active_hdd";
10020 } else if (changed
->count("osd_recovery_max_active_ssd")) {
10021 key
= "osd_recovery_max_active_ssd";
10023 // No key that we are interested in. Return.
10027 // Remove the current entry from the configuration if
10028 // different from its default value.
10029 auto val
= recovery_qos_defaults
.find(key
);
10030 if (val
!= recovery_qos_defaults
.end() &&
10031 cct
->_conf
.get_val
<uint64_t>(key
) != val
->second
) {
10032 static const std::vector
<std::string
> osds
= {
10034 "osd." + std::to_string(whoami
)
10037 for (auto osd
: osds
) {
10040 "\"prefix\": \"config rm\", "
10041 "\"who\": \"" + osd
+ "\", "
10042 "\"name\": \"" + key
+ "\""
10044 vector
<std::string
> vcmd
{cmd
};
10046 dout(1) << __func__
<< " Removing Key: " << key
10047 << " for " << osd
<< " from Mon db" << dendl
;
10048 monc
->start_mon_command(vcmd
, {}, nullptr, nullptr, nullptr);
10051 // Raise a cluster warning indicating that the changes did not
10052 // take effect and indicate the reason why.
10053 clog
->warn() << "Change to " << key
<< " on osd."
10054 << std::to_string(whoami
) << " did not take effect."
10055 << " Enable osd_mclock_override_recovery_settings before"
10056 << " setting this option.";
10059 } else { // if (changed != nullptr) (osd boot-up)
10061 * This section is executed only during osd boot-up.
10062 * Override the default recovery max active (hdd & ssd) and max backfills
10063 * config options to either the mClock defaults or retain their respective
10064 * overridden values before the osd was restarted.
10066 for (auto opt
: recovery_qos_defaults
) {
10068 * Note: set_val_default doesn't overwrite an option if it was earlier
10069 * set at a config level greater than CONF_DEFAULT. It doesn't return
10070 * a status. With get_val(), the config subsystem is guaranteed to
10071 * either return the overridden value (if any) or the default value.
10073 cct
->_conf
.set_val_default(opt
.first
, std::to_string(opt
.second
));
10074 auto opt_val
= cct
->_conf
.get_val
<uint64_t>(opt
.first
);
10075 dout(1) << __func__
<< " "
10076 << opt
.first
<< " set to " << opt_val
10078 if (opt
.first
== "osd_max_backfills") {
10079 service
.local_reserver
.set_max(opt_val
);
10080 service
.remote_reserver
.set_max(opt_val
);
10089 void OSD::maybe_override_sleep_options_for_qos()
10091 // Override options only if the scheduler enabled is mclock and the
10092 // underlying objectstore is supported by mclock
10093 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler" &&
10094 !unsupported_objstore_for_qos()) {
10096 // Override the various sleep settings
10097 // Disable recovery sleep
10098 cct
->_conf
.set_val("osd_recovery_sleep", std::to_string(0));
10099 cct
->_conf
.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10100 cct
->_conf
.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10101 cct
->_conf
.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10103 // Disable delete sleep
10104 cct
->_conf
.set_val("osd_delete_sleep", std::to_string(0));
10105 cct
->_conf
.set_val("osd_delete_sleep_hdd", std::to_string(0));
10106 cct
->_conf
.set_val("osd_delete_sleep_ssd", std::to_string(0));
10107 cct
->_conf
.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10109 // Disable snap trim sleep
10110 cct
->_conf
.set_val("osd_snap_trim_sleep", std::to_string(0));
10111 cct
->_conf
.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10112 cct
->_conf
.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10113 cct
->_conf
.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10115 // Disable scrub sleep
10116 cct
->_conf
.set_val("osd_scrub_sleep", std::to_string(0));
10120 void OSD::maybe_override_cost_for_qos()
10122 // If the scheduler enabled is mclock, override the default PG deletion cost
10123 // so that mclock can meet the QoS goals.
10124 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler" &&
10125 !unsupported_objstore_for_qos()) {
10126 uint64_t pg_delete_cost
= 15728640;
10127 cct
->_conf
.set_val("osd_pg_delete_cost", std::to_string(pg_delete_cost
));
10132 * A context for receiving status from a background mon command to set
10133 * a config option and optionally apply the changes on each op shard.
10135 class MonCmdSetConfigOnFinish
: public Context
{
10142 explicit MonCmdSetConfigOnFinish(
10145 const std::string
&k
,
10146 const std::string
&v
,
10148 : osd(o
), cct(cct
), key(k
), val(v
), update_shard(s
) {}
10149 void finish(int r
) override
{
10151 // Fallback to setting the config within the in-memory "values" map.
10152 cct
->_conf
.set_val_default(key
, val
);
10155 // If requested, apply this option on the
10156 // active scheduler of each op shard.
10157 if (update_shard
) {
10158 for (auto& shard
: osd
->shards
) {
10159 shard
->update_scheduler_config();
10165 void OSD::mon_cmd_set_config(const std::string
&key
, const std::string
&val
)
10169 "\"prefix\": \"config set\", "
10170 "\"who\": \"osd." + std::to_string(whoami
) + "\", "
10171 "\"name\": \"" + key
+ "\", "
10172 "\"value\": \"" + val
+ "\""
10174 vector
<std::string
> vcmd
{cmd
};
10176 // List of config options to be distributed across each op shard.
10177 // Currently limited to a couple of mClock options.
10178 static const std::vector
<std::string
> shard_option
=
10179 { "osd_mclock_max_capacity_iops_hdd", "osd_mclock_max_capacity_iops_ssd" };
10180 const bool update_shard
= std::find(shard_option
.begin(),
10181 shard_option
.end(),
10182 key
) != shard_option
.end();
10184 auto on_finish
= new MonCmdSetConfigOnFinish(this, cct
, key
,
10185 val
, update_shard
);
10186 dout(10) << __func__
<< " Set " << key
<< " = " << val
<< dendl
;
10187 monc
->start_mon_command(vcmd
, {}, nullptr, nullptr, on_finish
);
10190 bool OSD::unsupported_objstore_for_qos()
10192 static const std::vector
<std::string
> unsupported_objstores
= { "filestore" };
10193 return std::find(unsupported_objstores
.begin(),
10194 unsupported_objstores
.end(),
10195 store
->get_type()) != unsupported_objstores
.end();
10198 void OSD::update_log_config()
10200 auto parsed_options
= clog
->parse_client_options(cct
);
10201 derr
<< "log_to_monitors " << parsed_options
.log_to_monitors
<< dendl
;
10204 void OSD::check_config()
10206 // some sanity checks
10207 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10208 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10209 << " is not > osd_pg_epoch_persisted_max_stale ("
10210 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10212 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
10213 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
10214 << cct
->_conf
->osd_object_clean_region_max_num_intervals
10219 // --------------------------------
10221 void OSD::get_latest_osdmap()
10223 dout(10) << __func__
<< " -- start" << dendl
;
10225 boost::system::error_code ec
;
10226 service
.objecter
->wait_for_latest_osdmap(ceph::async::use_blocked
[ec
]);
10228 dout(10) << __func__
<< " -- finish" << dendl
;
10231 // --------------------------------
10233 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
10234 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
10235 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
10236 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10238 std::list
<OSDPerfMetricQuery
> supported_queries
;
10239 for (auto &it
: queries
) {
10240 auto &query
= it
.first
;
10241 if (!query
.key_descriptor
.empty()) {
10242 supported_queries
.push_back(query
);
10245 if (supported_queries
.size() < queries
.size()) {
10246 dout(1) << queries
.size() - supported_queries
.size()
10247 << " unsupported queries" << dendl
;
10250 std::lock_guard locker
{m_perf_queries_lock
};
10251 m_perf_queries
= supported_queries
;
10252 m_perf_limits
= queries
;
10254 std::vector
<PGRef
> pgs
;
10256 for (auto& pg
: pgs
) {
10257 std::scoped_lock l
{*pg
};
10258 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10262 MetricPayload
OSD::get_perf_reports() {
10263 OSDMetricPayload payload
;
10264 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
10266 std::vector
<PGRef
> pgs
;
10268 DynamicPerfStats dps
;
10269 for (auto& pg
: pgs
) {
10270 // m_perf_queries can be modified only in set_perf_queries by mgr client
10271 // request, and it is protected by by mgr client's lock, which is held
10272 // when set_perf_queries/get_perf_reports are called, so we may not hold
10273 // m_perf_queries_lock here.
10274 DynamicPerfStats
pg_dps(m_perf_queries
);
10276 pg
->get_dynamic_perf_stats(&pg_dps
);
10280 dps
.add_to_reports(m_perf_limits
, &reports
);
10281 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
10286 // =============================================================
10288 #undef dout_context
10289 #define dout_context cct
10291 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10293 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10295 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10297 pg
->osd_shard
= this;
10298 pg
->pg_slot
= slot
;
10299 osd
->inc_num_pgs();
10301 slot
->epoch
= pg
->get_osdmap_epoch();
10302 pg_slots_by_epoch
.insert(*slot
);
10305 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10307 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10308 slot
->pg
->osd_shard
= nullptr;
10309 slot
->pg
->pg_slot
= nullptr;
10310 slot
->pg
= nullptr;
10311 osd
->dec_num_pgs();
10313 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10315 if (waiting_for_min_pg_epoch
) {
10316 min_pg_epoch_cond
.notify_all();
10320 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10322 std::lock_guard
l(shard_lock
);
10323 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10324 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10325 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10326 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10328 pg_slots_by_epoch
.insert(*slot
);
10329 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10330 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
10331 if (waiting_for_min_pg_epoch
) {
10332 min_pg_epoch_cond
.notify_all();
10336 epoch_t
OSDShard::get_min_pg_epoch()
10338 std::lock_guard
l(shard_lock
);
10339 auto p
= pg_slots_by_epoch
.begin();
10340 if (p
== pg_slots_by_epoch
.end()) {
10346 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10348 std::unique_lock l
{shard_lock
};
10349 ++waiting_for_min_pg_epoch
;
10350 min_pg_epoch_cond
.wait(l
, [need
, this] {
10351 if (pg_slots_by_epoch
.empty()) {
10353 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10356 dout(10) << need
<< " waiting on "
10357 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10361 --waiting_for_min_pg_epoch
;
10364 epoch_t
OSDShard::get_max_waiting_epoch()
10366 std::lock_guard
l(shard_lock
);
10368 for (auto& i
: pg_slots
) {
10369 if (!i
.second
->waiting_peering
.empty()) {
10370 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
10376 void OSDShard::consume_map(
10377 const OSDMapRef
& new_osdmap
,
10378 unsigned *pushes_to_free
)
10380 std::lock_guard
l(shard_lock
);
10381 OSDMapRef old_osdmap
;
10383 std::lock_guard
l(osdmap_lock
);
10384 old_osdmap
= std::move(shard_osdmap
);
10385 shard_osdmap
= new_osdmap
;
10387 dout(10) << new_osdmap
->get_epoch()
10388 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10393 auto p
= pg_slots
.begin();
10394 while (p
!= pg_slots
.end()) {
10395 OSDShardPGSlot
*slot
= p
->second
.get();
10396 const spg_t
& pgid
= p
->first
;
10397 dout(20) << __func__
<< " " << pgid
<< dendl
;
10398 if (!slot
->waiting_for_split
.empty()) {
10399 dout(20) << __func__
<< " " << pgid
10400 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
10404 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10405 dout(20) << __func__
<< " " << pgid
10406 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
10411 if (!slot
->waiting_peering
.empty()) {
10412 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10413 if (first
<= new_osdmap
->get_epoch()) {
10414 dout(20) << __func__
<< " " << pgid
10415 << " pending_peering first epoch " << first
10416 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10417 queued
+= _wake_pg_slot(pgid
, slot
);
10422 if (!slot
->waiting
.empty()) {
10423 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10424 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
10429 while (!slot
->waiting
.empty() &&
10430 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10431 auto& qi
= slot
->waiting
.front();
10432 dout(20) << __func__
<< " " << pgid
10433 << " waiting item " << qi
10434 << " epoch " << qi
.get_map_epoch()
10435 << " <= " << new_osdmap
->get_epoch()
10437 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10439 << ", dropping" << dendl
;
10440 *pushes_to_free
+= qi
.get_reserved_pushes();
10441 slot
->waiting
.pop_front();
10444 if (slot
->waiting
.empty() &&
10445 slot
->num_running
== 0 &&
10446 slot
->waiting_for_split
.empty() &&
10448 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10449 p
= pg_slots
.erase(p
);
10456 std::lock_guard l
{sdata_wait_lock
};
10458 sdata_cond
.notify_one();
10460 sdata_cond
.notify_all();
10464 int OSDShard::_wake_pg_slot(
10466 OSDShardPGSlot
*slot
)
10469 dout(20) << __func__
<< " " << pgid
10470 << " to_process " << slot
->to_process
10471 << " waiting " << slot
->waiting
10472 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10473 for (auto i
= slot
->to_process
.rbegin();
10474 i
!= slot
->to_process
.rend();
10476 scheduler
->enqueue_front(std::move(*i
));
10479 slot
->to_process
.clear();
10480 for (auto i
= slot
->waiting
.rbegin();
10481 i
!= slot
->waiting
.rend();
10483 scheduler
->enqueue_front(std::move(*i
));
10486 slot
->waiting
.clear();
10487 for (auto i
= slot
->waiting_peering
.rbegin();
10488 i
!= slot
->waiting_peering
.rend();
10490 // this is overkill; we requeue everything, even if some of these
10491 // items are waiting for maps we don't have yet. FIXME, maybe,
10492 // someday, if we decide this inefficiency matters
10493 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10494 scheduler
->enqueue_front(std::move(*j
));
10498 slot
->waiting_peering
.clear();
10499 ++slot
->requeue_seq
;
10503 void OSDShard::identify_splits_and_merges(
10504 const OSDMapRef
& as_of_osdmap
,
10505 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10506 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10508 std::lock_guard
l(shard_lock
);
10509 dout(20) << __func__
<< " " << pg_slots
.size() << " slots" << dendl
;
10510 if (shard_osdmap
) {
10511 for (auto& i
: pg_slots
) {
10512 dout(20) << __func__
<< " slot pgid:" << i
.first
<< "slot:" << i
.second
.get() << dendl
;
10513 const spg_t
& pgid
= i
.first
;
10514 auto *slot
= i
.second
.get();
10516 osd
->service
.identify_splits_and_merges(
10517 shard_osdmap
, as_of_osdmap
, pgid
,
10518 split_pgs
, merge_pgs
);
10519 } else if (!slot
->waiting_for_split
.empty()) {
10520 osd
->service
.identify_splits_and_merges(
10521 shard_osdmap
, as_of_osdmap
, pgid
,
10522 split_pgs
, nullptr);
10524 dout(20) << __func__
<< " slot " << pgid
10525 << " has no pg and waiting_for_split " << dendl
;
10529 dout(20) << __func__
<< " " << split_pgs
->size() << " splits, "
10530 << merge_pgs
->size() << " merges" << dendl
;
10533 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10534 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10536 std::lock_guard
l(shard_lock
);
10537 _prime_splits(pgids
);
10538 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10539 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10540 for (auto i
: *pgids
) {
10541 osd
->service
.identify_splits_and_merges(
10542 as_of_osdmap
, shard_osdmap
, i
.first
,
10543 &newer_children
, nullptr);
10545 newer_children
.insert(pgids
->begin(), pgids
->end());
10546 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10547 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10549 _prime_splits(&newer_children
);
10550 // note: we don't care what is left over here for other shards.
10551 // if this shard is ahead of us and one isn't, e.g., one thread is
10552 // calling into prime_splits via _process (due to a newly created
10553 // pg) and this shard has a newer map due to a racing consume_map,
10554 // then any grandchildren left here will be identified (or were
10555 // identified) when the slower shard's osdmap is advanced.
10556 // _prime_splits() will tolerate the case where the pgid is
10561 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10563 dout(10) << *pgids
<< dendl
;
10564 auto p
= pgids
->begin();
10565 while (p
!= pgids
->end()) {
10566 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10567 if (shard_index
== shard_id
) {
10568 auto r
= pg_slots
.emplace(p
->first
, nullptr);
10570 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10571 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10572 r
.first
->second
->waiting_for_split
.insert(p
->second
);
10575 ceph_assert(q
!= pg_slots
.end());
10576 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10578 q
->second
->waiting_for_split
.insert(p
->second
);
10580 p
= pgids
->erase(p
);
10587 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10588 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10590 std::lock_guard
l(shard_lock
);
10591 dout(20) << __func__
<< " checking shard " << shard_id
10592 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10593 auto p
= merge_pgs
->begin();
10594 while (p
!= merge_pgs
->end()) {
10595 spg_t pgid
= p
->first
;
10596 epoch_t epoch
= p
->second
;
10597 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10598 if (shard_index
!= shard_id
) {
10602 OSDShardPGSlot
*slot
;
10603 auto r
= pg_slots
.emplace(pgid
, nullptr);
10605 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10607 slot
= r
.first
->second
.get();
10610 dout(20) << __func__
<< " have merge participant pg " << pgid
10611 << " " << slot
->pg
<< dendl
;
10612 } else if (!slot
->waiting_for_split
.empty() &&
10613 *slot
->waiting_for_split
.begin() < epoch
) {
10614 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10615 << " " << slot
->waiting_for_split
<< dendl
;
10617 dout(20) << __func__
<< " creating empty merge participant " << pgid
10618 << " for merge in " << epoch
<< dendl
;
10619 // leave history zeroed; PG::merge_from() will fill it in.
10620 pg_history_t history
;
10621 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10622 history
, PastIntervals(), false);
10623 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10624 _attach_pg(r
.first
->second
.get(), pg
.get());
10625 _wake_pg_slot(pgid
, slot
);
10628 // mark slot for merge
10629 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10630 slot
->waiting_for_merge_epoch
= epoch
;
10631 p
= merge_pgs
->erase(p
);
10635 void OSDShard::register_and_wake_split_child(PG
*pg
)
10637 dout(15) << __func__
<< ": " << pg
<< " #:" << pg_slots
.size() << dendl
;
10640 std::lock_guard
l(shard_lock
);
10641 dout(10) << __func__
<< ": " << pg
->pg_id
<< " " << pg
<< dendl
;
10642 auto p
= pg_slots
.find(pg
->pg_id
);
10643 ceph_assert(p
!= pg_slots
.end());
10644 auto *slot
= p
->second
.get();
10645 dout(20) << __func__
<< ": " << pg
->pg_id
<< " waiting_for_split "
10646 << slot
->waiting_for_split
<< dendl
;
10647 ceph_assert(!slot
->pg
);
10648 ceph_assert(!slot
->waiting_for_split
.empty());
10649 _attach_pg(slot
, pg
);
10651 epoch
= pg
->get_osdmap_epoch();
10652 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10653 slot
->waiting_for_split
.erase(epoch
);
10654 if (slot
->waiting_for_split
.empty()) {
10655 _wake_pg_slot(pg
->pg_id
, slot
);
10657 dout(10) << __func__
<< " still waiting for split on "
10658 << slot
->waiting_for_split
<< dendl
;
10662 // kick child to ensure it pulls up to the latest osdmap
10663 osd
->enqueue_peering_evt(
10666 std::make_shared
<PGPeeringEvent
>(
10671 std::lock_guard l
{sdata_wait_lock
};
10672 sdata_cond
.notify_one();
10675 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10677 std::lock_guard
l(shard_lock
);
10678 vector
<spg_t
> to_delete
;
10679 for (auto& i
: pg_slots
) {
10680 if (i
.first
!= parent
&&
10681 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10682 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10684 _wake_pg_slot(i
.first
, i
.second
.get());
10685 to_delete
.push_back(i
.first
);
10688 for (auto pgid
: to_delete
) {
10689 pg_slots
.erase(pgid
);
10693 void OSDShard::update_scheduler_config()
10695 scheduler
->update_configuration();
10698 std::string
OSDShard::get_scheduler_type()
10700 std::ostringstream scheduler_type
;
10701 scheduler_type
<< *scheduler
;
10702 return scheduler_type
.str();
10705 OSDShard::OSDShard(
10712 shard_name(string("OSDShard.") + stringify(id
)),
10713 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10714 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10715 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10716 shard_lock_name(shard_name
+ "::shard_lock"),
10717 shard_lock
{make_mutex(shard_lock_name
)},
10718 scheduler(ceph::osd::scheduler::make_scheduler(
10719 cct
, osd
->whoami
, osd
->num_shards
, id
, osd
->store
->is_rotational(),
10720 osd
->store
->get_type(), osd
->monc
)),
10721 context_queue(sdata_wait_lock
, sdata_cond
)
10723 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10727 // =============================================================
10729 #undef dout_context
10730 #define dout_context osd->cct
10732 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10734 void OSD::ShardedOpWQ::_add_slot_waiter(
10736 OSDShardPGSlot
*slot
,
10737 OpSchedulerItem
&& qi
)
10739 if (qi
.is_peering()) {
10740 dout(20) << __func__
<< " " << pgid
10741 << " peering, item epoch is "
10742 << qi
.get_map_epoch()
10743 << ", will wait on " << qi
<< dendl
;
10744 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
10746 dout(20) << __func__
<< " " << pgid
10747 << " item epoch is "
10748 << qi
.get_map_epoch()
10749 << ", will wait on " << qi
<< dendl
;
10750 slot
->waiting
.push_back(std::move(qi
));
10755 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10757 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10759 uint32_t shard_index
= thread_index
% osd
->num_shards
;
10760 auto& sdata
= osd
->shards
[shard_index
];
10761 ceph_assert(sdata
);
10763 // If all threads of shards do oncommits, there is a out-of-order
10764 // problem. So we choose the thread which has the smallest
10765 // thread_index(thread_index < num_shards) of shard to do oncommit
10767 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
10770 sdata
->shard_lock
.lock();
10771 if (sdata
->scheduler
->empty() &&
10772 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
10773 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10774 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10775 // we raced with a context_queue addition, don't wait
10776 wait_lock
.unlock();
10777 } else if (!sdata
->stop_waiting
) {
10778 dout(20) << __func__
<< " empty q, waiting" << dendl
;
10779 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10780 sdata
->shard_lock
.unlock();
10781 sdata
->sdata_cond
.wait(wait_lock
);
10782 wait_lock
.unlock();
10783 sdata
->shard_lock
.lock();
10784 if (sdata
->scheduler
->empty() &&
10785 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
10786 sdata
->shard_lock
.unlock();
10789 // found a work item; reapply default wq timeouts
10790 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10791 timeout_interval
, suicide_interval
);
10793 dout(20) << __func__
<< " need return immediately" << dendl
;
10794 wait_lock
.unlock();
10795 sdata
->shard_lock
.unlock();
10800 list
<Context
*> oncommits
;
10801 if (is_smallest_thread_index
) {
10802 sdata
->context_queue
.move_to(oncommits
);
10805 WorkItem work_item
;
10806 while (!std::get_if
<OpSchedulerItem
>(&work_item
)) {
10807 if (sdata
->scheduler
->empty()) {
10808 if (osd
->is_stopping()) {
10809 sdata
->shard_lock
.unlock();
10810 for (auto c
: oncommits
) {
10811 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10814 return; // OSD shutdown, discard.
10816 sdata
->shard_lock
.unlock();
10817 handle_oncommits(oncommits
);
10821 work_item
= sdata
->scheduler
->dequeue();
10822 if (osd
->is_stopping()) {
10823 sdata
->shard_lock
.unlock();
10824 for (auto c
: oncommits
) {
10825 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10828 return; // OSD shutdown, discard.
10831 // If the work item is scheduled in the future, wait until
10832 // the time returned in the dequeue response before retrying.
10833 if (auto when_ready
= std::get_if
<double>(&work_item
)) {
10834 if (is_smallest_thread_index
) {
10835 sdata
->shard_lock
.unlock();
10836 handle_oncommits(oncommits
);
10837 sdata
->shard_lock
.lock();
10839 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10840 auto future_time
= ceph::real_clock::from_double(*when_ready
);
10841 dout(10) << __func__
<< " dequeue future request at " << future_time
<< dendl
;
10842 // Disable heartbeat timeout until we find a non-future work item to process.
10843 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10844 sdata
->shard_lock
.unlock();
10845 ++sdata
->waiting_threads
;
10846 sdata
->sdata_cond
.wait_until(wait_lock
, future_time
);
10847 --sdata
->waiting_threads
;
10848 wait_lock
.unlock();
10849 sdata
->shard_lock
.lock();
10850 // Reapply default wq timeouts
10851 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10852 timeout_interval
, suicide_interval
);
10853 // Populate the oncommits list if there were any additions
10854 // to the context_queue while we were waiting
10855 if (is_smallest_thread_index
) {
10856 sdata
->context_queue
.move_to(oncommits
);
10861 // Access the stored item
10862 auto item
= std::move(std::get
<OpSchedulerItem
>(work_item
));
10863 if (osd
->is_stopping()) {
10864 sdata
->shard_lock
.unlock();
10865 for (auto c
: oncommits
) {
10866 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10869 return; // OSD shutdown, discard.
10872 const auto token
= item
.get_ordering_token();
10873 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
10875 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10877 OSDShardPGSlot
*slot
= r
.first
->second
.get();
10878 dout(20) << __func__
<< " " << token
10879 << (r
.second
? " (new)" : "")
10880 << " to_process " << slot
->to_process
10881 << " waiting " << slot
->waiting
10882 << " waiting_peering " << slot
->waiting_peering
10884 slot
->to_process
.push_back(std::move(item
));
10885 dout(20) << __func__
<< " " << slot
->to_process
.back()
10886 << " queued" << dendl
;
10889 PGRef pg
= slot
->pg
;
10891 // lock pg (if we have it)
10893 // note the requeue seq now...
10894 uint64_t requeue_seq
= slot
->requeue_seq
;
10895 ++slot
->num_running
;
10897 sdata
->shard_lock
.unlock();
10898 osd
->service
.maybe_inject_dispatch_delay();
10900 osd
->service
.maybe_inject_dispatch_delay();
10901 sdata
->shard_lock
.lock();
10903 auto q
= sdata
->pg_slots
.find(token
);
10904 if (q
== sdata
->pg_slots
.end()) {
10905 // this can happen if we race with pg removal.
10906 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
10908 sdata
->shard_lock
.unlock();
10909 handle_oncommits(oncommits
);
10912 slot
= q
->second
.get();
10913 --slot
->num_running
;
10915 if (slot
->to_process
.empty()) {
10916 // raced with _wake_pg_slot or consume_map
10917 dout(20) << __func__
<< " " << token
10918 << " nothing queued" << dendl
;
10920 sdata
->shard_lock
.unlock();
10921 handle_oncommits(oncommits
);
10924 if (requeue_seq
!= slot
->requeue_seq
) {
10925 dout(20) << __func__
<< " " << token
10926 << " requeue_seq " << slot
->requeue_seq
<< " > our "
10927 << requeue_seq
<< ", we raced with _wake_pg_slot"
10930 sdata
->shard_lock
.unlock();
10931 handle_oncommits(oncommits
);
10934 if (slot
->pg
!= pg
) {
10935 // this can happen if we race with pg removal.
10936 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
10943 dout(20) << __func__
<< " " << token
10944 << " to_process " << slot
->to_process
10945 << " waiting " << slot
->waiting
10946 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
10948 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
10952 auto qi
= std::move(slot
->to_process
.front());
10953 slot
->to_process
.pop_front();
10954 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
10955 set
<pair
<spg_t
,epoch_t
>> new_children
;
10959 // should this pg shard exist on this osd in this (or a later) epoch?
10960 osdmap
= sdata
->shard_osdmap
;
10961 const PGCreateInfo
*create_info
= qi
.creates_pg();
10962 if (!slot
->waiting_for_split
.empty()) {
10963 dout(20) << __func__
<< " " << token
10964 << " splitting " << slot
->waiting_for_split
<< dendl
;
10965 _add_slot_waiter(token
, slot
, std::move(qi
));
10966 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
10967 dout(20) << __func__
<< " " << token
10968 << " map " << qi
.get_map_epoch() << " > "
10969 << osdmap
->get_epoch() << dendl
;
10970 _add_slot_waiter(token
, slot
, std::move(qi
));
10971 } else if (qi
.is_peering()) {
10972 if (!qi
.peering_requires_pg()) {
10973 // for pg-less events, we run them under the ordering lock, since
10974 // we don't have the pg lock to keep them ordered.
10975 qi
.run(osd
, sdata
, pg
, tp_handle
);
10976 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
10978 if (create_info
->by_mon
&&
10979 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
10980 dout(20) << __func__
<< " " << token
10981 << " no pg, no longer primary, ignoring mon create on "
10984 dout(20) << __func__
<< " " << token
10985 << " no pg, should create on " << qi
<< dendl
;
10986 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
10988 // we created the pg! drop out and continue "normally"!
10989 sdata
->_attach_pg(slot
, pg
.get());
10990 sdata
->_wake_pg_slot(token
, slot
);
10992 // identify split children between create epoch and shard epoch.
10993 osd
->service
.identify_splits_and_merges(
10994 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
10995 sdata
->_prime_splits(&new_children
);
10996 // distribute remaining split children to other shards below!
10999 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
11002 dout(20) << __func__
<< " " << token
11003 << " no pg, peering, !create, discarding " << qi
<< dendl
;
11006 dout(20) << __func__
<< " " << token
11007 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
11008 << ", discarding " << qi
11011 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11012 dout(20) << __func__
<< " " << token
11013 << " no pg, should exist e" << osdmap
->get_epoch()
11014 << ", will wait on " << qi
<< dendl
;
11015 _add_slot_waiter(token
, slot
, std::move(qi
));
11017 dout(20) << __func__
<< " " << token
11018 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
11019 << ", dropping " << qi
<< dendl
;
11020 // share map with client?
11021 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11022 osd
->service
.maybe_share_map((*_op
)->get_req()->get_connection().get(),
11023 sdata
->shard_osdmap
,
11024 (*_op
)->sent_epoch
);
11026 unsigned pushes_to_free
= qi
.get_reserved_pushes();
11027 if (pushes_to_free
> 0) {
11028 sdata
->shard_lock
.unlock();
11029 osd
->service
.release_reserved_pushes(pushes_to_free
);
11030 handle_oncommits(oncommits
);
11034 sdata
->shard_lock
.unlock();
11035 handle_oncommits(oncommits
);
11038 if (qi
.is_peering()) {
11039 OSDMapRef osdmap
= sdata
->shard_osdmap
;
11040 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11041 _add_slot_waiter(token
, slot
, std::move(qi
));
11042 sdata
->shard_lock
.unlock();
11044 handle_oncommits(oncommits
);
11048 sdata
->shard_lock
.unlock();
11050 if (!new_children
.empty()) {
11051 for (auto shard
: osd
->shards
) {
11052 shard
->prime_splits(osdmap
, &new_children
);
11054 ceph_assert(new_children
.empty());
11057 // osd_opwq_process marks the point at which an operation has been dequeued
11058 // and will begin to be handled by a worker thread.
11062 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11063 reqid
= (*_op
)->get_reqid();
11066 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
11067 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11070 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
11071 Formatter
*f
= Formatter::create("json");
11072 f
->open_object_section("q");
11074 f
->close_section();
11079 qi
.run(osd
, sdata
, pg
, tp_handle
);
11084 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11085 reqid
= (*_op
)->get_reqid();
11088 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
11089 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11092 handle_oncommits(oncommits
);
11095 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
11096 if (unlikely(m_fast_shutdown
) ) {
11097 // stop enqueing when we are in the middle of a fast shutdown
11101 uint32_t shard_index
=
11102 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11104 OSDShard
* sdata
= osd
->shards
[shard_index
];
11105 assert (NULL
!= sdata
);
11107 dout(20) << __func__
<< " " << item
<< dendl
;
11111 std::lock_guard l
{sdata
->shard_lock
};
11112 empty
= sdata
->scheduler
->empty();
11113 sdata
->scheduler
->enqueue(std::move(item
));
11117 std::lock_guard l
{sdata
->sdata_wait_lock
};
11119 sdata
->sdata_cond
.notify_all();
11120 } else if (sdata
->waiting_threads
) {
11121 sdata
->sdata_cond
.notify_one();
11126 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem
&& item
)
11128 if (unlikely(m_fast_shutdown
) ) {
11129 // stop enqueing when we are in the middle of a fast shutdown
11133 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11134 auto& sdata
= osd
->shards
[shard_index
];
11135 ceph_assert(sdata
);
11136 sdata
->shard_lock
.lock();
11137 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
11138 if (p
!= sdata
->pg_slots
.end() &&
11139 !p
->second
->to_process
.empty()) {
11140 // we may be racing with _process, which has dequeued a new item
11141 // from scheduler, put it on to_process, and is now busy taking the
11142 // pg lock. ensure this old requeued item is ordered before any
11143 // such newer item in to_process.
11144 p
->second
->to_process
.push_front(std::move(item
));
11145 item
= std::move(p
->second
->to_process
.back());
11146 p
->second
->to_process
.pop_back();
11147 dout(20) << __func__
11148 << " " << p
->second
->to_process
.front()
11149 << " shuffled w/ " << item
<< dendl
;
11151 dout(20) << __func__
<< " " << item
<< dendl
;
11153 sdata
->scheduler
->enqueue_front(std::move(item
));
11154 sdata
->shard_lock
.unlock();
11155 std::lock_guard l
{sdata
->sdata_wait_lock
};
11156 sdata
->sdata_cond
.notify_one();
11159 void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11161 uint32_t shard_index
= 0;
11162 m_fast_shutdown
= true;
11164 for (; shard_index
< osd
->num_shards
; shard_index
++) {
11165 auto& sdata
= osd
->shards
[shard_index
];
11166 ceph_assert(sdata
);
11167 sdata
->shard_lock
.lock();
11168 int work_count
= 0;
11169 while(! sdata
->scheduler
->empty() ) {
11170 auto work_item
= sdata
->scheduler
->dequeue();
11173 sdata
->shard_lock
.unlock();
11177 namespace ceph::osd_cmds
{
11179 int heap(CephContext
& cct
,
11180 const cmdmap_t
& cmdmap
,
11181 std::ostream
& outos
,
11182 std::ostream
& erros
)
11184 if (!ceph_using_tcmalloc()) {
11185 erros
<< "could not issue heap profiler command -- not using tcmalloc!";
11186 return -EOPNOTSUPP
;
11190 if (!cmd_getval(cmdmap
, "heapcmd", cmd
)) {
11191 erros
<< "unable to get value for command \"" << cmd
<< "\"";
11195 std::vector
<std::string
> cmd_vec
;
11196 get_str_vec(cmd
, cmd_vec
);
11199 if (cmd_getval(cmdmap
, "value", val
)) {
11200 cmd_vec
.push_back(val
);
11203 ceph_heap_profiler_handle_command(cmd_vec
, outos
);
11208 } // namespace ceph::osd_cmds