1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
27 #include <boost/range/adaptor/reversed.hpp>
29 #ifdef HAVE_SYS_PARAM_H
30 #include <sys/param.h>
33 #ifdef HAVE_SYS_MOUNT_H
34 #include <sys/mount.h>
38 #include "osd/scrubber/scrub_machine.h"
39 #include "osd/scrubber/pg_scrubber.h"
41 #include "include/types.h"
42 #include "include/compat.h"
43 #include "include/random.h"
44 #include "include/scope_guard.h"
49 #include "osdc/Objecter.h"
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
61 #include "os/ObjectStore.h"
63 #include "os/FuseStore.h"
66 #include "PrimaryLogPG.h"
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
71 #include "mon/MonClient.h"
73 #include "messages/MLog.h"
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery2.h"
96 #include "messages/MOSDPGLog.h"
97 #include "messages/MOSDPGRemove.h"
98 #include "messages/MOSDPGInfo.h"
99 #include "messages/MOSDPGInfo2.h"
100 #include "messages/MOSDPGCreate.h"
101 #include "messages/MOSDPGCreate2.h"
102 #include "messages/MBackfillReserve.h"
103 #include "messages/MRecoveryReserve.h"
104 #include "messages/MOSDForceRecovery.h"
105 #include "messages/MOSDECSubOpWrite.h"
106 #include "messages/MOSDECSubOpWriteReply.h"
107 #include "messages/MOSDECSubOpRead.h"
108 #include "messages/MOSDECSubOpReadReply.h"
109 #include "messages/MOSDPGCreated.h"
110 #include "messages/MOSDPGUpdateLogMissing.h"
111 #include "messages/MOSDPGUpdateLogMissingReply.h"
113 #include "messages/MOSDPeeringOp.h"
115 #include "messages/MOSDAlive.h"
117 #include "messages/MOSDScrub.h"
118 #include "messages/MOSDScrub2.h"
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
123 #include "messages/MPGStats.h"
125 #include "messages/MMonGetPurgedSnaps.h"
126 #include "messages/MMonGetPurgedSnapsReply.h"
128 #include "common/perf_counters.h"
129 #include "common/Timer.h"
130 #include "common/LogClient.h"
131 #include "common/AsyncReserver.h"
132 #include "common/HeartbeatMap.h"
133 #include "common/admin_socket.h"
134 #include "common/ceph_context.h"
136 #include "global/signal_handler.h"
137 #include "global/pidfile.h"
139 #include "include/color.h"
140 #include "perfglue/cpu_profiler.h"
141 #include "perfglue/heap_profiler.h"
143 #include "osd/ClassHandler.h"
144 #include "osd/OpRequest.h"
146 #include "auth/AuthAuthorizeHandler.h"
147 #include "auth/RotatingKeyRing.h"
149 #include "objclass/objclass.h"
151 #include "common/cmdparse.h"
152 #include "include/str_list.h"
153 #include "include/util.h"
155 #include "include/ceph_assert.h"
156 #include "common/config.h"
157 #include "common/EventTrace.h"
159 #include "json_spirit/json_spirit_reader.h"
160 #include "json_spirit/json_spirit_writer.h"
163 #define TRACEPOINT_DEFINE
164 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
165 #include "tracing/osd.h"
166 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
167 #undef TRACEPOINT_DEFINE
169 #define tracepoint(...)
172 #include "osd_tracer.h"
175 #define dout_context cct
176 #define dout_subsys ceph_subsys_osd
178 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
182 using std::lock_guard
;
183 using std::make_pair
;
184 using std::make_tuple
;
185 using std::make_unique
;
188 using std::ostringstream
;
192 using std::stringstream
;
193 using std::to_string
;
194 using std::unique_ptr
;
197 using ceph::bufferlist
;
198 using ceph::bufferptr
;
201 using ceph::fixed_u_to_string
;
202 using ceph::Formatter
;
203 using ceph::heartbeat_handle_d
;
204 using ceph::make_mutex
;
206 using namespace ceph::osd::scheduler
;
207 using TOPNSPC::common::cmd_getval
;
208 using TOPNSPC::common::cmd_getval_or
;
210 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
211 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
215 //Initial features in new superblock.
216 //Features here are also automatically upgraded
217 CompatSet
OSD::get_osd_initial_compat_set() {
218 CompatSet::FeatureSet ceph_osd_feature_compat
;
219 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
220 CompatSet::FeatureSet ceph_osd_feature_incompat
;
221 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
222 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
223 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
224 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
225 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
226 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
227 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
228 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
229 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
230 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
231 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
232 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
233 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
234 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
235 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
236 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
237 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
238 ceph_osd_feature_incompat
);
241 //Features are added here that this OSD supports.
242 CompatSet
OSD::get_osd_compat_set() {
243 CompatSet compat
= get_osd_initial_compat_set();
244 //Any features here can be set in code, but not in initial superblock
245 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
249 OSDService::OSDService(OSD
*osd
, ceph::async::io_context_pool
& poolctx
) :
252 whoami(osd
->whoami
), store(osd
->store
.get()),
253 log_client(osd
->log_client
), clog(osd
->clog
),
254 pg_recovery_stats(osd
->pg_recovery_stats
),
255 cluster_messenger(osd
->cluster_messenger
),
256 client_messenger(osd
->client_messenger
),
258 recoverystate_perf(osd
->recoverystate_perf
),
260 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
261 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
262 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
263 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
265 m_scrub_queue
{cct
, *this},
266 agent_valid_iterator(false),
268 flush_mode_high_count(0),
271 agent_stop_flag(false),
272 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
273 last_recalibrate(ceph_clock_now()),
274 promote_max_objects(0),
275 promote_max_bytes(0),
277 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
278 osd
->objecter_messenger
,
279 osd
->monc
, poolctx
)),
280 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
281 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
283 recovery_request_timer(cct
, recovery_request_lock
, false),
284 sleep_timer(cct
, sleep_lock
, false),
285 reserver_finisher(cct
),
286 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
287 cct
->_conf
->osd_min_recovery_priority
),
288 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
289 cct
->_conf
->osd_min_recovery_priority
),
290 snap_reserver(cct
, &reserver_finisher
,
291 cct
->_conf
->osd_max_trimming_pgs
),
292 recovery_ops_active(0),
293 recovery_ops_reserved(0),
294 recovery_paused(false),
295 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
296 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
297 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
299 cur_ratio(0), physical_ratio(0),
300 boot_epoch(0), up_epoch(0), bind_epoch(0)
304 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
306 str
<< "objecter-finisher-" << i
;
307 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
308 objecter_finishers
.push_back(std::move(fin
));
313 void OSDService::add_pgid(spg_t pgid
, PG
*pg
) {
314 std::lock_guard
l(pgid_lock
);
315 if (!pgid_tracker
.count(pgid
)) {
318 pgid_tracker
[pgid
]++;
320 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
322 std::lock_guard
l(pgid_lock
);
323 ceph_assert(pgid_tracker
.count(pgid
));
324 ceph_assert(pgid_tracker
[pgid
] > 0);
325 pgid_tracker
[pgid
]--;
326 if (pgid_tracker
[pgid
] == 0) {
327 pgid_tracker
.erase(pgid
);
328 live_pgs
.erase(pgid
);
331 void OSDService::dump_live_pgids()
333 std::lock_guard
l(pgid_lock
);
334 derr
<< "live pgids:" << dendl
;
335 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
336 i
!= pgid_tracker
.cend();
338 derr
<< "\t" << *i
<< dendl
;
339 live_pgs
[i
->first
]->dump_live_ids();
345 ceph::signedspan
OSDService::get_mnow()
347 return ceph::mono_clock::now() - osd
->startup_time
;
350 void OSDService::identify_splits_and_merges(
354 set
<pair
<spg_t
,epoch_t
>> *split_children
,
355 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
357 if (!old_map
->have_pg_pool(pgid
.pool())) {
360 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
361 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
362 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
365 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
366 << " to e" << new_map
->get_epoch()
367 << " pg_nums " << p
->second
<< dendl
;
369 queue
.push_back(pgid
);
371 while (!queue
.empty()) {
372 auto cur
= queue
.front();
375 unsigned pgnum
= old_pgnum
;
376 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
377 q
!= p
->second
.end() &&
378 q
->first
<= new_map
->get_epoch();
380 if (pgnum
< q
->second
) {
382 if (cur
.ps() < pgnum
) {
384 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
385 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
386 << " pg_num " << pgnum
<< " -> " << q
->second
387 << " children " << children
<< dendl
;
388 for (auto i
: children
) {
389 split_children
->insert(make_pair(i
, q
->first
));
394 } else if (cur
.ps() < q
->second
) {
395 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
396 << " pg_num " << pgnum
<< " -> " << q
->second
397 << " is a child" << dendl
;
398 // normally we'd capture this from the parent, but it's
399 // possible the parent doesn't exist yet (it will be
400 // fabricated to allow an intervening merge). note this PG
401 // as a split child here to be sure we catch it.
402 split_children
->insert(make_pair(cur
, q
->first
));
404 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
405 << " pg_num " << pgnum
<< " -> " << q
->second
406 << " is post-split, skipping" << dendl
;
408 } else if (merge_pgs
) {
410 if (cur
.ps() >= q
->second
) {
411 if (cur
.ps() < pgnum
) {
413 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
415 parent
.is_split(q
->second
, pgnum
, &children
);
416 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
417 << " pg_num " << pgnum
<< " -> " << q
->second
418 << " is merge source, target " << parent
419 << ", source(s) " << children
<< dendl
;
420 merge_pgs
->insert(make_pair(parent
, q
->first
));
421 if (!did
.count(parent
)) {
422 // queue (and re-scan) parent in case it might not exist yet
423 // and there are some future splits pending on it
424 queue
.push_back(parent
);
426 for (auto c
: children
) {
427 merge_pgs
->insert(make_pair(c
, q
->first
));
433 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
434 << " pg_num " << pgnum
<< " -> " << q
->second
435 << " is beyond old pgnum, skipping" << dendl
;
439 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
440 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
441 << " pg_num " << pgnum
<< " -> " << q
->second
442 << " is merge target, source " << children
<< dendl
;
443 for (auto c
: children
) {
444 merge_pgs
->insert(make_pair(c
, q
->first
));
448 merge_pgs
->insert(make_pair(cur
, q
->first
));
457 void OSDService::need_heartbeat_peer_update()
459 osd
->need_heartbeat_peer_update();
462 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
464 std::lock_guard
l(hb_stamp_lock
);
465 if (peer
>= hb_stamps
.size()) {
466 hb_stamps
.resize(peer
+ 1);
468 if (!hb_stamps
[peer
]) {
469 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
471 return hb_stamps
[peer
];
474 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
476 osd
->enqueue_peering_evt(
479 std::make_shared
<PGPeeringEvent
>(
484 void OSDService::start_shutdown()
487 std::lock_guard
l(agent_timer_lock
);
488 agent_timer
.shutdown();
492 std::lock_guard
l(sleep_lock
);
493 sleep_timer
.shutdown();
497 std::lock_guard
l(recovery_request_lock
);
498 recovery_request_timer
.shutdown();
502 void OSDService::shutdown_reserver()
504 reserver_finisher
.wait_for_empty();
505 reserver_finisher
.stop();
508 void OSDService::shutdown()
510 mono_timer
.suspend();
513 std::lock_guard
l(watch_lock
);
514 watch_timer
.shutdown();
517 objecter
->shutdown();
518 for (auto& f
: objecter_finishers
) {
523 publish_map(OSDMapRef());
524 next_osdmap
= OSDMapRef();
527 void OSDService::init()
529 reserver_finisher
.start();
530 for (auto& f
: objecter_finishers
) {
533 objecter
->set_client_incarnation(0);
535 // deprioritize objecter in daemonperf output
536 objecter
->get_logger()->set_prio_adjust(-3);
542 agent_thread
.create("osd_srv_agent");
544 if (cct
->_conf
->osd_recovery_delay_start
)
545 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
548 void OSDService::final_init()
550 objecter
->start(osdmap
.get());
553 void OSDService::activate_map()
555 // wake/unwake the tiering agent
556 std::lock_guard l
{agent_lock
};
558 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
560 agent_cond
.notify_all();
563 void OSDService::request_osdmap_update(epoch_t e
)
565 osd
->osdmap_subscribe(e
, false);
569 class AgentTimeoutCB
: public Context
{
572 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
573 void finish(int) override
{
574 pg
->agent_choose_mode_restart();
578 void OSDService::agent_entry()
580 dout(10) << __func__
<< " start" << dendl
;
581 std::unique_lock agent_locker
{agent_lock
};
583 while (!agent_stop_flag
) {
584 if (agent_queue
.empty()) {
585 dout(20) << __func__
<< " empty queue" << dendl
;
586 agent_cond
.wait(agent_locker
);
589 uint64_t level
= agent_queue
.rbegin()->first
;
590 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
592 << " tiers " << agent_queue
.size()
593 << ", top is " << level
594 << " with pgs " << top
.size()
595 << ", ops " << agent_ops
<< "/"
596 << cct
->_conf
->osd_agent_max_ops
597 << (agent_active
? " active" : " NOT ACTIVE")
599 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
600 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
601 int agent_flush_quota
= max
;
602 if (!flush_mode_high_count
)
603 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
604 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
605 agent_cond
.wait(agent_locker
);
609 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
610 agent_queue_pos
= top
.begin();
611 agent_valid_iterator
= true;
613 PGRef pg
= *agent_queue_pos
;
614 dout(10) << "high_count " << flush_mode_high_count
615 << " agent_ops " << agent_ops
616 << " flush_quota " << agent_flush_quota
<< dendl
;
617 agent_locker
.unlock();
618 if (!pg
->agent_work(max
, agent_flush_quota
)) {
619 dout(10) << __func__
<< " " << pg
->pg_id
620 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
621 << " seconds" << dendl
;
623 logger
->inc(l_osd_tier_delay
);
624 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
625 std::lock_guard timer_locker
{agent_timer_lock
};
626 Context
*cb
= new AgentTimeoutCB(pg
);
627 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
631 dout(10) << __func__
<< " finish" << dendl
;
634 void OSDService::agent_stop()
637 std::lock_guard
l(agent_lock
);
639 // By this time all ops should be cancelled
640 ceph_assert(agent_ops
== 0);
641 // By this time all PGs are shutdown and dequeued
642 if (!agent_queue
.empty()) {
643 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
644 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
645 ceph_abort_msg("agent queue not empty");
648 agent_stop_flag
= true;
649 agent_cond
.notify_all();
654 // -------------------------------------
656 void OSDService::promote_throttle_recalibrate()
658 utime_t now
= ceph_clock_now();
659 double dur
= now
- last_recalibrate
;
660 last_recalibrate
= now
;
661 unsigned prob
= promote_probability_millis
;
663 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
664 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
666 unsigned min_prob
= 1;
668 uint64_t attempts
, obj
, bytes
;
669 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
670 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
671 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
672 << target_obj_sec
<< " obj/sec or "
673 << byte_u_t(target_bytes_sec
) << "/sec"
676 // calculate what the probability *should* be, given the targets
678 if (attempts
&& dur
> 0) {
679 uint64_t avg_size
= 1;
681 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
682 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
683 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
685 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
686 << avg_size
<< dendl
;
687 if (target_obj_sec
&& target_bytes_sec
)
688 new_prob
= std::min(po
, pb
);
689 else if (target_obj_sec
)
691 else if (target_bytes_sec
)
698 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
700 // correct for persistent skew between target rate and actual rate, adjust
703 if (attempts
&& obj
) {
704 actual
= obj
* 1000 / attempts
;
705 ratio
= (double)actual
/ (double)prob
;
706 new_prob
= (double)new_prob
/ ratio
;
708 new_prob
= std::max(new_prob
, min_prob
);
709 new_prob
= std::min(new_prob
, 1000u);
712 prob
= (prob
+ new_prob
) / 2;
713 prob
= std::max(prob
, min_prob
);
714 prob
= std::min(prob
, 1000u);
715 dout(10) << __func__
<< " actual " << actual
716 << ", actual/prob ratio " << ratio
717 << ", adjusted new_prob " << new_prob
718 << ", prob " << promote_probability_millis
<< " -> " << prob
720 promote_probability_millis
= prob
;
722 // set hard limits for this interval to mitigate stampedes
723 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
724 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
727 // -------------------------------------
729 float OSDService::get_failsafe_full_ratio()
731 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
732 if (full_ratio
> 1.0) full_ratio
/= 100.0;
736 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
738 // The OSDMap ratios take precendence. So if the failsafe is .95 and
739 // the admin sets the cluster full to .96, the failsafe moves up to .96
740 // too. (Not that having failsafe == full is ideal, but it's better than
741 // dropping writes before the clusters appears full.)
742 OSDMapRef osdmap
= get_osdmap();
743 if (!osdmap
|| osdmap
->get_epoch() == 0) {
746 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
747 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
748 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
749 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
751 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
752 // use the failsafe for nearfull and full; the mon isn't using the
753 // flags anyway because we're mid-upgrade.
754 full_ratio
= failsafe_ratio
;
755 backfillfull_ratio
= failsafe_ratio
;
756 nearfull_ratio
= failsafe_ratio
;
757 } else if (full_ratio
<= 0 ||
758 backfillfull_ratio
<= 0 ||
759 nearfull_ratio
<= 0) {
760 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
761 // use failsafe flag. ick. the monitor did something wrong or the user
762 // did something stupid.
763 full_ratio
= failsafe_ratio
;
764 backfillfull_ratio
= failsafe_ratio
;
765 nearfull_ratio
= failsafe_ratio
;
768 if (injectfull_state
> NONE
&& injectfull
) {
769 inject
= "(Injected)";
770 return injectfull_state
;
771 } else if (pratio
> failsafe_ratio
) {
773 } else if (ratio
> full_ratio
) {
775 } else if (ratio
> backfillfull_ratio
) {
777 } else if (pratio
> nearfull_ratio
) {
783 void OSDService::check_full_status(float ratio
, float pratio
)
785 std::lock_guard
l(full_status_lock
);
788 physical_ratio
= pratio
;
792 new_state
= recalc_full_state(ratio
, pratio
, inject
);
794 dout(20) << __func__
<< " cur ratio " << ratio
795 << ", physical ratio " << pratio
796 << ", new state " << get_full_state_name(new_state
)
801 if (cur_state
!= new_state
) {
802 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
803 << " -> " << get_full_state_name(new_state
) << dendl
;
804 if (new_state
== FAILSAFE
) {
805 clog
->error() << "full status failsafe engaged, dropping updates, now "
806 << (int)roundf(ratio
* 100) << "% full";
807 } else if (cur_state
== FAILSAFE
) {
808 clog
->error() << "full status failsafe disengaged, no longer dropping "
809 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
811 cur_state
= new_state
;
815 bool OSDService::need_fullness_update()
817 OSDMapRef osdmap
= get_osdmap();
819 if (osdmap
->exists(whoami
)) {
820 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
822 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
824 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
831 else if (is_backfillfull())
833 else if (is_nearfull())
838 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
840 if (injectfull
&& injectfull_state
>= type
) {
841 // injectfull is either a count of the number of times to return failsafe full
842 // or if -1 then always return full
845 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
846 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
853 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
855 std::lock_guard
l(full_status_lock
);
857 if (_check_inject_full(dpp
, type
))
860 if (cur_state
>= type
)
861 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
862 << " physical " << physical_ratio
<< dendl
;
864 return cur_state
>= type
;
867 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
869 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
871 std::lock_guard
l(full_status_lock
);
872 if (_check_inject_full(dpp
, type
)) {
878 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
881 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
883 if (tentative_state
>= type
)
884 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
886 return tentative_state
>= type
;
889 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
891 return _check_full(dpp
, FAILSAFE
);
894 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
896 return _check_full(dpp
, FULL
);
899 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
901 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
904 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
906 return _check_full(dpp
, BACKFILLFULL
);
909 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
911 return _check_full(dpp
, NEARFULL
);
914 bool OSDService::is_failsafe_full() const
916 std::lock_guard
l(full_status_lock
);
917 return cur_state
== FAILSAFE
;
920 bool OSDService::is_full() const
922 std::lock_guard
l(full_status_lock
);
923 return cur_state
>= FULL
;
926 bool OSDService::is_backfillfull() const
928 std::lock_guard
l(full_status_lock
);
929 return cur_state
>= BACKFILLFULL
;
932 bool OSDService::is_nearfull() const
934 std::lock_guard
l(full_status_lock
);
935 return cur_state
>= NEARFULL
;
938 void OSDService::set_injectfull(s_names type
, int64_t count
)
940 std::lock_guard
l(full_status_lock
);
941 injectfull_state
= type
;
945 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
946 osd_alert_list_t
& alerts
)
948 uint64_t bytes
= stbuf
.total
;
949 uint64_t avail
= stbuf
.available
;
950 uint64_t used
= stbuf
.get_used_raw();
952 // For testing fake statfs values so it doesn't matter if all
953 // OSDs are using the same partition.
954 if (cct
->_conf
->fake_statfs_for_testing
) {
955 uint64_t total_num_bytes
= 0;
959 total_num_bytes
+= p
->get_stats_num_bytes();
961 bytes
= cct
->_conf
->fake_statfs_for_testing
;
962 if (total_num_bytes
< bytes
)
963 avail
= bytes
- total_num_bytes
;
966 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
967 << " adjust available " << avail
969 used
= bytes
- avail
;
972 logger
->set(l_osd_stat_bytes
, bytes
);
973 logger
->set(l_osd_stat_bytes_used
, used
);
974 logger
->set(l_osd_stat_bytes_avail
, avail
);
976 std::lock_guard
l(stat_lock
);
977 osd_stat
.statfs
= stbuf
;
978 osd_stat
.os_alerts
.clear();
979 osd_stat
.os_alerts
[whoami
].swap(alerts
);
980 if (cct
->_conf
->fake_statfs_for_testing
) {
981 osd_stat
.statfs
.total
= bytes
;
982 osd_stat
.statfs
.available
= avail
;
983 // For testing don't want used to go negative, so clear reserved
984 osd_stat
.statfs
.internally_reserved
= 0;
988 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
991 utime_t now
= ceph_clock_now();
992 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
993 std::lock_guard
l(stat_lock
);
994 osd_stat
.hb_peers
.swap(hb_peers
);
995 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
996 osd_stat
.num_pgs
= num_pgs
;
997 // Clean entries that aren't updated
998 // This is called often enough that we can just remove 1 at a time
999 for (auto i
: osd_stat
.hb_pingtime
) {
1000 if (i
.second
.last_update
== 0)
1002 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
1003 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
1004 << " last_update " << i
.second
.last_update
<< dendl
;
1005 osd_stat
.hb_pingtime
.erase(i
.first
);
1012 void OSDService::inc_osd_stat_repaired()
1014 std::lock_guard
l(stat_lock
);
1015 osd_stat
.num_shards_repaired
++;
1019 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
1020 uint64_t adjust_used
)
1023 ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1026 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1027 if (new_stat
.statfs
.available
> adjust_used
)
1028 new_stat
.statfs
.available
-= adjust_used
;
1030 new_stat
.statfs
.available
= 0;
1031 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1034 // Check all pgs and adjust kb_used to include all pending backfill data
1035 int backfill_adjusted
= 0;
1037 osd
->_get_pgs(&pgs
);
1038 for (auto p
: pgs
) {
1039 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1041 if (backfill_adjusted
) {
1042 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1044 return ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1047 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1049 OSDMapRef next_map
= get_nextmap_reserved();
1050 // service map is always newer/newest
1051 ceph_assert(from_epoch
<= next_map
->get_epoch());
1053 if (next_map
->is_down(peer
) ||
1054 next_map
->get_info(peer
).up_from
> from_epoch
) {
1056 release_map(next_map
);
1059 ConnectionRef peer_con
;
1060 if (peer
== whoami
) {
1061 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1063 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1064 next_map
->get_cluster_addrs(peer
), false, true);
1066 maybe_share_map(peer_con
.get(), next_map
);
1067 peer_con
->send_message(m
);
1068 release_map(next_map
);
1071 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1073 OSDMapRef next_map
= get_nextmap_reserved();
1074 // service map is always newer/newest
1075 ceph_assert(from_epoch
<= next_map
->get_epoch());
1077 for (auto& iter
: messages
) {
1078 if (next_map
->is_down(iter
.first
) ||
1079 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1083 ConnectionRef peer_con
;
1084 if (iter
.first
== whoami
) {
1085 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1087 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1088 next_map
->get_cluster_addrs(iter
.first
), false, true);
1090 maybe_share_map(peer_con
.get(), next_map
);
1091 peer_con
->send_message(iter
.second
);
1093 release_map(next_map
);
1095 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1097 OSDMapRef next_map
= get_nextmap_reserved();
1098 // service map is always newer/newest
1099 ceph_assert(from_epoch
<= next_map
->get_epoch());
1101 if (next_map
->is_down(peer
) ||
1102 next_map
->get_info(peer
).up_from
> from_epoch
) {
1103 release_map(next_map
);
1107 if (peer
== whoami
) {
1108 con
= osd
->cluster_messenger
->get_loopback_connection();
1110 con
= osd
->cluster_messenger
->connect_to_osd(
1111 next_map
->get_cluster_addrs(peer
), false, true);
1113 release_map(next_map
);
1117 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1119 OSDMapRef next_map
= get_nextmap_reserved();
1120 // service map is always newer/newest
1121 ceph_assert(from_epoch
<= next_map
->get_epoch());
1123 pair
<ConnectionRef
,ConnectionRef
> ret
;
1124 if (next_map
->is_down(peer
) ||
1125 next_map
->get_info(peer
).up_from
> from_epoch
) {
1126 release_map(next_map
);
1129 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1130 next_map
->get_hb_back_addrs(peer
));
1131 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1132 next_map
->get_hb_front_addrs(peer
));
1133 release_map(next_map
);
1137 entity_name_t
OSDService::get_cluster_msgr_name() const
1139 return cluster_messenger
->get_myname();
1142 void OSDService::queue_want_pg_temp(pg_t pgid
,
1143 const vector
<int>& want
,
1146 std::lock_guard
l(pg_temp_lock
);
1147 auto p
= pg_temp_pending
.find(pgid
);
1148 if (p
== pg_temp_pending
.end() ||
1149 p
->second
.acting
!= want
||
1151 pg_temp_wanted
[pgid
] = {want
, forced
};
1155 void OSDService::remove_want_pg_temp(pg_t pgid
)
1157 std::lock_guard
l(pg_temp_lock
);
1158 pg_temp_wanted
.erase(pgid
);
1159 pg_temp_pending
.erase(pgid
);
1162 void OSDService::_sent_pg_temp()
1164 #ifdef HAVE_STDLIB_MAP_SPLICING
1165 pg_temp_pending
.merge(pg_temp_wanted
);
1167 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1168 make_move_iterator(end(pg_temp_wanted
)));
1170 pg_temp_wanted
.clear();
1173 void OSDService::requeue_pg_temp()
1175 std::lock_guard
l(pg_temp_lock
);
1176 // wanted overrides pending. note that remove_want_pg_temp
1177 // clears the item out of both.
1178 unsigned old_wanted
= pg_temp_wanted
.size();
1179 unsigned old_pending
= pg_temp_pending
.size();
1181 pg_temp_wanted
.swap(pg_temp_pending
);
1182 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1183 << pg_temp_wanted
.size() << dendl
;
1186 std::ostream
& operator<<(std::ostream
& out
,
1187 const OSDService::pg_temp_t
& pg_temp
)
1189 out
<< pg_temp
.acting
;
1190 if (pg_temp
.forced
) {
1196 void OSDService::send_pg_temp()
1198 std::lock_guard
l(pg_temp_lock
);
1199 if (pg_temp_wanted
.empty())
1201 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1202 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1203 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1204 auto& m
= ms
[pg_temp
.forced
];
1206 m
= new MOSDPGTemp(osdmap
->get_epoch());
1207 m
->forced
= pg_temp
.forced
;
1209 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1213 monc
->send_mon_message(m
);
1219 void OSDService::send_pg_created(pg_t pgid
)
1221 std::lock_guard
l(pg_created_lock
);
1222 dout(20) << __func__
<< dendl
;
1223 auto o
= get_osdmap();
1224 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1225 pg_created
.insert(pgid
);
1226 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1230 void OSDService::send_pg_created()
1232 std::lock_guard
l(pg_created_lock
);
1233 dout(20) << __func__
<< dendl
;
1234 auto o
= get_osdmap();
1235 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1236 for (auto pgid
: pg_created
) {
1237 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1242 void OSDService::prune_pg_created()
1244 std::lock_guard
l(pg_created_lock
);
1245 dout(20) << __func__
<< dendl
;
1246 auto o
= get_osdmap();
1247 auto i
= pg_created
.begin();
1248 while (i
!= pg_created
.end()) {
1249 auto p
= o
->get_pg_pool(i
->pool());
1250 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1251 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1252 i
= pg_created
.erase(i
);
1254 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1261 // --------------------------------------
1264 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1265 epoch_t
*_bind_epoch
) const
1267 std::lock_guard
l(epoch_lock
);
1269 *_boot_epoch
= boot_epoch
;
1271 *_up_epoch
= up_epoch
;
1273 *_bind_epoch
= bind_epoch
;
1276 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1277 const epoch_t
*_bind_epoch
)
1279 std::lock_guard
l(epoch_lock
);
1281 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1282 boot_epoch
= *_boot_epoch
;
1285 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1286 up_epoch
= *_up_epoch
;
1289 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1290 bind_epoch
= *_bind_epoch
;
1294 bool OSDService::prepare_to_stop()
1296 std::unique_lock
l(is_stopping_lock
);
1297 if (get_state() != NOT_STOPPING
)
1300 OSDMapRef osdmap
= get_osdmap();
1301 if (osdmap
&& osdmap
->is_up(whoami
)) {
1302 dout(0) << __func__
<< " telling mon we are shutting down and dead " << dendl
;
1303 set_state(PREPARING_TO_STOP
);
1304 monc
->send_mon_message(
1308 osdmap
->get_addrs(whoami
),
1309 osdmap
->get_epoch(),
1310 true, // request ack
1311 true // mark as down and dead
1313 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1314 is_stopping_cond
.wait_for(l
, timeout
,
1315 [this] { return get_state() == STOPPING
; });
1318 dout(0) << __func__
<< " starting shutdown" << dendl
;
1319 set_state(STOPPING
);
1323 void OSDService::got_stop_ack()
1325 std::scoped_lock
l(is_stopping_lock
);
1326 if (get_state() == PREPARING_TO_STOP
) {
1327 dout(0) << __func__
<< " starting shutdown" << dendl
;
1328 set_state(STOPPING
);
1329 is_stopping_cond
.notify_all();
1331 dout(10) << __func__
<< " ignoring msg" << dendl
;
1335 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1336 OSDSuperblock
& sblock
)
1338 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1339 osdmap
->get_encoding_features());
1340 m
->oldest_map
= max_oldest_map
;
1341 m
->newest_map
= sblock
.newest_map
;
1343 int max
= cct
->_conf
->osd_map_message_max
;
1344 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1346 if (since
< m
->oldest_map
) {
1347 // we don't have the next map the target wants, so start with a
1350 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1351 << since
<< ", starting with full map" << dendl
;
1352 since
= m
->oldest_map
;
1353 if (!get_map_bl(since
, bl
)) {
1354 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1358 max_bytes
-= bl
.length();
1359 m
->maps
[since
] = std::move(bl
);
1361 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1363 if (get_inc_map_bl(e
, bl
)) {
1364 m
->incremental_maps
[e
] = std::move(bl
);
1366 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1367 if (!get_map_bl(e
, bl
)) {
1368 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1371 m
->maps
[e
] = std::move(bl
);
1374 max_bytes
-= bl
.length();
1375 if (max
<= 0 || max_bytes
<= 0) {
1382 if (!m
->maps
.empty() ||
1383 !m
->incremental_maps
.empty()) {
1384 // send what we have so far
1389 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1390 m
->incremental_maps
[m
->newest_map
] = std::move(bl
);
1392 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1393 if (!get_map_bl(m
->newest_map
, bl
)) {
1394 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1398 m
->maps
[m
->newest_map
] = std::move(bl
);
1403 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1405 con
->send_message(m
);
1408 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1409 const OSDMapRef
& osdmap
)
1411 epoch_t to
= osdmap
->get_epoch();
1412 dout(10) << "send_incremental_map " << since
<< " -> " << to
1413 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1417 OSDSuperblock
sblock(get_superblock());
1418 if (since
< sblock
.oldest_map
) {
1419 // just send latest full map
1420 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1421 osdmap
->get_encoding_features());
1422 m
->oldest_map
= max_oldest_map
;
1423 m
->newest_map
= sblock
.newest_map
;
1424 get_map_bl(to
, m
->maps
[to
]);
1429 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1430 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1431 << ", only sending most recent" << dendl
;
1432 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1435 m
= build_incremental_map_msg(since
, to
, sblock
);
1440 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1442 bool found
= map_bl_cache
.lookup(e
, &bl
);
1444 logger
->inc(l_osd_map_bl_cache_hit
);
1447 logger
->inc(l_osd_map_bl_cache_miss
);
1448 found
= store
->read(meta_ch
,
1449 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1450 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1457 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1459 std::lock_guard
l(map_cache_lock
);
1460 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1462 logger
->inc(l_osd_map_bl_cache_hit
);
1465 logger
->inc(l_osd_map_bl_cache_miss
);
1466 found
= store
->read(meta_ch
,
1467 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1468 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1470 _add_map_inc_bl(e
, bl
);
1475 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1477 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1478 // cache a contiguous buffer
1479 if (bl
.get_num_buffers() > 1) {
1482 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1483 map_bl_cache
.add(e
, bl
);
1486 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1488 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1489 // cache a contiguous buffer
1490 if (bl
.get_num_buffers() > 1) {
1493 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1494 map_bl_inc_cache
.add(e
, bl
);
1497 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1499 epoch_t e
= o
->get_epoch();
1501 if (cct
->_conf
->osd_map_dedup
) {
1502 // Dedup against an existing map at a nearby epoch
1503 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1505 OSDMap::dedup(for_dedup
.get(), o
);
1509 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1516 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1518 std::lock_guard
l(map_cache_lock
);
1519 OSDMapRef retval
= map_cache
.lookup(epoch
);
1521 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1522 logger
->inc(l_osd_map_cache_hit
);
1526 logger
->inc(l_osd_map_cache_miss
);
1527 epoch_t lb
= map_cache
.cached_key_lower_bound();
1529 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1530 logger
->inc(l_osd_map_cache_miss_low
);
1531 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1535 OSDMap
*map
= new OSDMap
;
1537 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1539 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1540 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1546 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1548 return _add_map(map
);
1554 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1556 reply_op_error(op
, err
, eversion_t(), 0, {});
1559 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1561 vector
<pg_log_op_return_item_t
> op_returns
)
1563 auto m
= op
->get_req
<MOSDOp
>();
1564 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1566 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1568 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1569 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1570 reply
->set_reply_versions(v
, uv
);
1571 reply
->set_op_returns(op_returns
);
1572 m
->get_connection()->send_message(reply
);
1575 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1577 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1581 auto m
= op
->get_req
<MOSDOp
>();
1582 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1584 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1586 if (pg
->is_ec_pg()) {
1588 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1589 * can get this result:
1590 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1591 * [CRUSH_ITEM_NONE, 2, 3]/3
1592 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1594 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1596 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1599 * We can't compute the op target based on the sending map epoch due to
1600 * splitting. The simplest thing is to detect such cases here and drop
1601 * them without an error (the client will resend anyway).
1603 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1604 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1606 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1607 << m
->get_map_epoch() << ", dropping" << dendl
;
1610 pg_t _pgid
= m
->get_raw_pg();
1612 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1613 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1614 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1615 pgid
.shard
!= pg
->pg_id
.shard
) {
1616 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1617 << m
->get_map_epoch() << ", dropping" << dendl
;
1622 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1623 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1624 << " pg " << m
->get_raw_pg()
1625 << " to osd." << whoami
1626 << " not " << pg
->get_acting()
1627 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1630 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1632 osd
->op_shardedwq
.queue(std::move(qi
));
1635 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1637 osd
->op_shardedwq
.queue_front(std::move(qi
));
1640 void OSDService::queue_recovery_context(
1642 GenContext
<ThreadPool::TPHandle
&> *c
)
1644 epoch_t e
= get_osdmap_epoch();
1647 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1648 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1649 cct
->_conf
->osd_recovery_cost
,
1650 cct
->_conf
->osd_recovery_priority
,
1656 void OSDService::queue_for_snap_trim(PG
*pg
)
1658 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1661 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1662 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1663 cct
->_conf
->osd_snap_trim_cost
,
1664 cct
->_conf
->osd_snap_trim_priority
,
1667 pg
->get_osdmap_epoch()));
1670 template <class MSG_TYPE
>
1671 void OSDService::queue_scrub_event_msg(PG
* pg
,
1672 Scrub::scrub_prio_t with_priority
,
1673 unsigned int qu_priority
,
1674 Scrub::act_token_t act_token
)
1676 const auto epoch
= pg
->get_osdmap_epoch();
1677 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
, act_token
);
1678 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
1679 << ". Epoch: " << epoch
<< " token: " << act_token
<< dendl
;
1681 enqueue_back(OpSchedulerItem(
1682 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1683 pg
->scrub_requeue_priority(with_priority
, qu_priority
), ceph_clock_now(), 0, epoch
));
1686 template <class MSG_TYPE
>
1687 void OSDService::queue_scrub_event_msg(PG
* pg
,
1688 Scrub::scrub_prio_t with_priority
)
1690 const auto epoch
= pg
->get_osdmap_epoch();
1691 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1692 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1694 enqueue_back(OpSchedulerItem(
1695 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1696 pg
->scrub_requeue_priority(with_priority
), ceph_clock_now(), 0, epoch
));
1699 void OSDService::queue_for_scrub(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1701 queue_scrub_event_msg
<PGScrub
>(pg
, with_priority
);
1704 void OSDService::queue_scrub_after_repair(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1706 queue_scrub_event_msg
<PGScrubAfterRepair
>(pg
, with_priority
);
1709 void OSDService::queue_for_rep_scrub(PG
* pg
,
1710 Scrub::scrub_prio_t with_priority
,
1711 unsigned int qu_priority
,
1712 Scrub::act_token_t act_token
)
1714 queue_scrub_event_msg
<PGRepScrub
>(pg
, with_priority
, qu_priority
, act_token
);
1717 void OSDService::queue_for_rep_scrub_resched(PG
* pg
,
1718 Scrub::scrub_prio_t with_priority
,
1719 unsigned int qu_priority
,
1720 Scrub::act_token_t act_token
)
1722 // Resulting scrub event: 'SchedReplica'
1723 queue_scrub_event_msg
<PGRepScrubResched
>(pg
, with_priority
, qu_priority
,
1727 void OSDService::queue_for_scrub_granted(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1729 // Resulting scrub event: 'RemotesReserved'
1730 queue_scrub_event_msg
<PGScrubResourcesOK
>(pg
, with_priority
);
1733 void OSDService::queue_for_scrub_denied(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1735 // Resulting scrub event: 'ReservationFailure'
1736 queue_scrub_event_msg
<PGScrubDenied
>(pg
, with_priority
);
1739 void OSDService::queue_for_scrub_resched(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1741 // Resulting scrub event: 'InternalSchedScrub'
1742 queue_scrub_event_msg
<PGScrubResched
>(pg
, with_priority
);
1745 void OSDService::queue_scrub_pushes_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1747 // Resulting scrub event: 'ActivePushesUpd'
1748 queue_scrub_event_msg
<PGScrubPushesUpdate
>(pg
, with_priority
);
1751 void OSDService::queue_scrub_chunk_free(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1753 // Resulting scrub event: 'SelectedChunkFree'
1754 queue_scrub_event_msg
<PGScrubChunkIsFree
>(pg
, with_priority
);
1757 void OSDService::queue_scrub_chunk_busy(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1759 // Resulting scrub event: 'ChunkIsBusy'
1760 queue_scrub_event_msg
<PGScrubChunkIsBusy
>(pg
, with_priority
);
1763 void OSDService::queue_scrub_applied_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1765 queue_scrub_event_msg
<PGScrubAppliedUpdate
>(pg
, with_priority
);
1768 void OSDService::queue_scrub_unblocking(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1770 // Resulting scrub event: 'Unblocked'
1771 queue_scrub_event_msg
<PGScrubUnblocked
>(pg
, with_priority
);
1774 void OSDService::queue_scrub_digest_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1776 // Resulting scrub event: 'DigestUpdate'
1777 queue_scrub_event_msg
<PGScrubDigestUpdate
>(pg
, with_priority
);
1780 void OSDService::queue_scrub_got_local_map(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1782 // Resulting scrub event: 'IntLocalMapDone'
1783 queue_scrub_event_msg
<PGScrubGotLocalMap
>(pg
, with_priority
);
1786 void OSDService::queue_scrub_got_repl_maps(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1788 // Resulting scrub event: 'GotReplicas'
1789 queue_scrub_event_msg
<PGScrubGotReplMaps
>(pg
, with_priority
);
1792 void OSDService::queue_scrub_maps_compared(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1794 // Resulting scrub event: 'MapsCompared'
1795 queue_scrub_event_msg
<PGScrubMapsCompared
>(pg
, with_priority
);
1798 void OSDService::queue_scrub_replica_pushes(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1800 // Resulting scrub event: 'ReplicaPushesUpd'
1801 queue_scrub_event_msg
<PGScrubReplicaPushes
>(pg
, with_priority
);
1804 void OSDService::queue_scrub_is_finished(PG
*pg
)
1806 // Resulting scrub event: 'ScrubFinished'
1807 queue_scrub_event_msg
<PGScrubScrubFinished
>(pg
, Scrub::scrub_prio_t::high_priority
);
1810 void OSDService::queue_scrub_next_chunk(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1812 // Resulting scrub event: 'NextChunk'
1813 queue_scrub_event_msg
<PGScrubGetNextChunk
>(pg
, with_priority
);
1816 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1818 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1821 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1822 new PGDelete(pgid
, e
)),
1823 cct
->_conf
->osd_pg_delete_cost
,
1824 cct
->_conf
->osd_pg_delete_priority
,
1830 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1832 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1837 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1839 std::lock_guard
l(merge_lock
);
1840 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1841 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1842 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1843 _send_ready_to_merge();
1846 void OSDService::set_ready_to_merge_target(PG
*pg
,
1848 epoch_t last_epoch_started
,
1849 epoch_t last_epoch_clean
)
1851 std::lock_guard
l(merge_lock
);
1852 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1853 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1856 last_epoch_clean
)));
1857 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1858 _send_ready_to_merge();
1861 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1863 std::lock_guard
l(merge_lock
);
1864 dout(10) << __func__
<< " " << source
<< dendl
;
1865 not_ready_to_merge_source
.insert(source
);
1866 assert(ready_to_merge_source
.count(source
) == 0);
1867 _send_ready_to_merge();
1870 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1872 std::lock_guard
l(merge_lock
);
1873 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1874 not_ready_to_merge_target
[target
] = source
;
1875 assert(ready_to_merge_target
.count(target
) == 0);
1876 _send_ready_to_merge();
1879 void OSDService::send_ready_to_merge()
1881 std::lock_guard
l(merge_lock
);
1882 _send_ready_to_merge();
1885 void OSDService::_send_ready_to_merge()
1887 dout(20) << __func__
1888 << " ready_to_merge_source " << ready_to_merge_source
1889 << " not_ready_to_merge_source " << not_ready_to_merge_source
1890 << " ready_to_merge_target " << ready_to_merge_target
1891 << " not_ready_to_merge_target " << not_ready_to_merge_target
1892 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1894 for (auto src
: not_ready_to_merge_source
) {
1895 if (sent_ready_to_merge_source
.count(src
) == 0) {
1896 monc
->send_mon_message(new MOSDPGReadyToMerge(
1900 osdmap
->get_epoch()));
1901 sent_ready_to_merge_source
.insert(src
);
1904 for (auto p
: not_ready_to_merge_target
) {
1905 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1906 monc
->send_mon_message(new MOSDPGReadyToMerge(
1910 osdmap
->get_epoch()));
1911 sent_ready_to_merge_source
.insert(p
.second
);
1914 for (auto src
: ready_to_merge_source
) {
1915 if (not_ready_to_merge_source
.count(src
.first
) ||
1916 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1919 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1920 if (p
!= ready_to_merge_target
.end() &&
1921 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1922 monc
->send_mon_message(new MOSDPGReadyToMerge(
1923 src
.first
, // source pgid
1924 src
.second
, // src version
1925 std::get
<0>(p
->second
), // target version
1926 std::get
<1>(p
->second
), // PG's last_epoch_started
1927 std::get
<2>(p
->second
), // PG's last_epoch_clean
1929 osdmap
->get_epoch()));
1930 sent_ready_to_merge_source
.insert(src
.first
);
1935 void OSDService::clear_ready_to_merge(PG
*pg
)
1937 std::lock_guard
l(merge_lock
);
1938 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1939 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1940 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1941 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1942 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1943 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1946 void OSDService::clear_sent_ready_to_merge()
1948 std::lock_guard
l(merge_lock
);
1949 sent_ready_to_merge_source
.clear();
1952 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1954 std::lock_guard
l(merge_lock
);
1955 auto i
= sent_ready_to_merge_source
.begin();
1956 while (i
!= sent_ready_to_merge_source
.end()) {
1957 if (!osdmap
->pg_exists(*i
)) {
1958 dout(10) << __func__
<< " " << *i
<< dendl
;
1959 i
= sent_ready_to_merge_source
.erase(i
);
1968 void OSDService::_queue_for_recovery(
1969 std::pair
<epoch_t
, PGRef
> p
,
1970 uint64_t reserved_pushes
)
1972 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
1975 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1977 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
1978 cct
->_conf
->osd_recovery_cost
,
1979 cct
->_conf
->osd_recovery_priority
,
1985 // ====================================================================
1989 #define dout_prefix *_dout
1991 // Commands shared between OSD's console and admin console:
1992 namespace ceph::osd_cmds
{
1994 int heap(CephContext
& cct
,
1995 const cmdmap_t
& cmdmap
,
1996 std::ostream
& outos
,
1997 std::ostream
& erros
);
1999 } // namespace ceph::osd_cmds
2001 int OSD::mkfs(CephContext
*cct
,
2002 std::unique_ptr
<ObjectStore
> store
,
2005 string osdspec_affinity
)
2011 // if we are fed a uuid for this osd, use it.
2012 store
->set_fsid(cct
->_conf
->osd_uuid
);
2014 ret
= store
->mkfs();
2016 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2017 << cpp_strerror(ret
) << dendl
;
2021 store
->set_cache_shards(1); // doesn't matter for mkfs!
2023 ret
= store
->mount();
2025 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2026 << cpp_strerror(ret
) << dendl
;
2030 auto umount_store
= make_scope_guard([&] {
2034 ObjectStore::CollectionHandle ch
=
2035 store
->open_collection(coll_t::meta());
2037 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2039 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2042 /* if we already have superblock, check content of superblock */
2043 dout(0) << " have superblock" << dendl
;
2044 auto p
= sbbl
.cbegin();
2046 if (whoami
!= sb
.whoami
) {
2047 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2051 if (fsid
!= sb
.cluster_fsid
) {
2052 derr
<< "provided cluster fsid " << fsid
2053 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2057 // create superblock
2058 sb
.cluster_fsid
= fsid
;
2059 sb
.osd_fsid
= store
->get_fsid();
2061 sb
.compat_features
= get_osd_initial_compat_set();
2066 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2068 ObjectStore::Transaction t
;
2069 t
.create_collection(coll_t::meta(), 0);
2070 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2071 ret
= store
->queue_transaction(ch
, std::move(t
));
2073 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2074 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2080 ret
= write_meta(cct
, store
.get(), sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
2082 derr
<< "OSD::mkfs: failed to write fsid file: error "
2083 << cpp_strerror(ret
) << dendl
;
2088 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2093 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2094 r
= store
->write_meta("magic", val
);
2098 snprintf(val
, sizeof(val
), "%d", whoami
);
2099 r
= store
->write_meta("whoami", val
);
2103 cluster_fsid
.print(val
);
2104 r
= store
->write_meta("ceph_fsid", val
);
2108 string key
= cct
->_conf
.get_val
<string
>("key");
2110 r
= store
->write_meta("osd_key", key
);
2114 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2115 if (!keyfile
.empty()) {
2118 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2120 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2121 << err
<< ": " << cpp_strerror(r
) << dendl
;
2124 r
= store
->write_meta("osd_key", keybl
.to_str());
2129 if (!osdspec_affinity
.empty()) {
2130 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2135 r
= store
->write_meta("ready", "ready");
2142 int OSD::peek_meta(ObjectStore
*store
,
2144 uuid_d
*cluster_fsid
,
2147 ceph_release_t
*require_osd_release
)
2151 int r
= store
->read_meta("magic", &val
);
2156 r
= store
->read_meta("whoami", &val
);
2159 *whoami
= atoi(val
.c_str());
2161 r
= store
->read_meta("ceph_fsid", &val
);
2164 r
= cluster_fsid
->parse(val
.c_str());
2168 r
= store
->read_meta("fsid", &val
);
2170 *osd_fsid
= uuid_d();
2172 r
= osd_fsid
->parse(val
.c_str());
2177 r
= store
->read_meta("require_osd_release", &val
);
2179 *require_osd_release
= ceph_release_from_name(val
);
2187 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2191 OSD::OSD(CephContext
*cct_
,
2192 std::unique_ptr
<ObjectStore
> store_
,
2194 Messenger
*internal_messenger
,
2195 Messenger
*external_messenger
,
2196 Messenger
*hb_client_front
,
2197 Messenger
*hb_client_back
,
2198 Messenger
*hb_front_serverm
,
2199 Messenger
*hb_back_serverm
,
2200 Messenger
*osdc_messenger
,
2202 const std::string
&dev
, const std::string
&jdev
,
2203 ceph::async::io_context_pool
& poolctx
) :
2205 tick_timer(cct
, osd_lock
),
2206 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2207 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2208 cluster_messenger(internal_messenger
),
2209 client_messenger(external_messenger
),
2210 objecter_messenger(osdc_messenger
),
2212 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2213 logger(create_logger()),
2214 recoverystate_perf(create_recoverystate_perf()),
2215 store(std::move(store_
)),
2216 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2217 clog(log_client
.create_channel()),
2219 dev_path(dev
), journal_path(jdev
),
2220 store_is_rotational(store
->is_rotational()),
2221 trace_endpoint("0.0.0.0", 0, "osd"),
2223 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2224 "osd_pg_epoch_max_lag_factor")),
2225 osd_compat(get_osd_compat_set()),
2226 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2227 get_num_op_threads()),
2228 heartbeat_stop(false),
2229 heartbeat_need_update(true),
2230 hb_front_client_messenger(hb_client_front
),
2231 hb_back_client_messenger(hb_client_back
),
2232 hb_front_server_messenger(hb_front_serverm
),
2233 hb_back_server_messenger(hb_back_serverm
),
2235 heartbeat_thread(this),
2236 heartbeat_dispatcher(this),
2237 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2238 cct
->_conf
->osd_num_op_tracker_shard
),
2239 test_ops_hook(NULL
),
2242 ceph::make_timespan(cct
->_conf
->osd_op_thread_timeout
),
2243 ceph::make_timespan(cct
->_conf
->osd_op_thread_suicide_timeout
),
2245 last_pg_create_epoch(0),
2248 requested_full_first(0),
2249 requested_full_last(0),
2250 service(this, poolctx
)
2253 if (!gss_ktfile_client
.empty()) {
2254 // Assert we can export environment variable
2256 The default client keytab is used, if it is present and readable,
2257 to automatically obtain initial credentials for GSSAPI client
2258 applications. The principal name of the first entry in the client
2259 keytab is used by default when obtaining initial credentials.
2260 1. The KRB5_CLIENT_KTNAME environment variable.
2261 2. The default_client_keytab_name profile variable in [libdefaults].
2262 3. The hardcoded default, DEFCKTNAME.
2264 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2265 gss_ktfile_client
.c_str(), 1));
2266 ceph_assert(set_result
== 0);
2269 monc
->set_messenger(client_messenger
);
2270 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2271 cct
->_conf
->osd_op_log_threshold
);
2272 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2273 cct
->_conf
->osd_op_history_duration
);
2274 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2275 cct
->_conf
->osd_op_history_slow_op_threshold
);
2276 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2278 std::stringstream ss
;
2279 ss
<< "osd." << whoami
;
2280 trace_endpoint
.copy_name(ss
.str());
2283 // initialize shards
2284 num_shards
= get_num_op_shards();
2285 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2286 OSDShard
*one_shard
= new OSDShard(
2290 shards
.push_back(one_shard
);
2296 while (!shards
.empty()) {
2297 delete shards
.back();
2300 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2301 cct
->get_perfcounters_collection()->remove(logger
);
2302 delete recoverystate_perf
;
2306 double OSD::get_tick_interval() const
2308 // vary +/- 5% to avoid scrub scheduling livelocks
2309 constexpr auto delta
= 0.05;
2310 return (OSD_TICK_INTERVAL
*
2311 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2314 void OSD::handle_signal(int signum
)
2316 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2317 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2323 std::lock_guard
lock(osd_lock
);
2327 if (store
->test_mount_in_use()) {
2328 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2329 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2333 cct
->_conf
.add_observer(this);
2337 int OSD::set_numa_affinity()
2339 // storage numa node
2340 int store_node
= -1;
2341 store
->get_numa_node(&store_node
, nullptr, nullptr);
2342 if (store_node
>= 0) {
2343 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2346 // check network numa node(s)
2347 int front_node
= -1, back_node
= -1;
2348 string front_iface
= pick_iface(
2350 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2351 string back_iface
= pick_iface(
2353 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2354 int r
= get_iface_numa_node(front_iface
, &front_node
);
2355 if (r
>= 0 && front_node
>= 0) {
2356 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2357 << front_node
<< dendl
;
2358 r
= get_iface_numa_node(back_iface
, &back_node
);
2359 if (r
>= 0 && back_node
>= 0) {
2360 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2361 << back_node
<< dendl
;
2362 if (front_node
== back_node
&&
2363 front_node
== store_node
) {
2364 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2365 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2366 numa_node
= front_node
;
2368 } else if (front_node
!= back_node
) {
2369 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2372 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2375 } else if (back_node
== -2) {
2376 dout(1) << __func__
<< " cluster network " << back_iface
2377 << " ports numa nodes do not match" << dendl
;
2379 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2380 << "' numa node: " << cpp_strerror(r
) << dendl
;
2382 } else if (front_node
== -2) {
2383 dout(1) << __func__
<< " public network " << front_iface
2384 << " ports numa nodes do not match" << dendl
;
2386 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2387 << "' numa node: " << cpp_strerror(r
) << dendl
;
2389 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2390 // this takes precedence over the automagic logic above
2393 if (numa_node
>= 0) {
2394 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2396 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2397 << " CPUs" << dendl
;
2400 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2402 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2404 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2407 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2413 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2420 class OSDSocketHook
: public AdminSocketHook
{
2423 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2424 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2427 bufferlist
& out
) override
{
2428 ceph_abort("should use async hook");
2431 std::string_view prefix
,
2432 const cmdmap_t
& cmdmap
,
2434 const bufferlist
& inbl
,
2435 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2437 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2438 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2440 on_finish(-EINVAL
, e
.what(), empty
);
2445 std::set
<int64_t> OSD::get_mapped_pools()
2447 std::set
<int64_t> pools
;
2448 std::vector
<spg_t
> pgids
;
2450 for (const auto &pgid
: pgids
) {
2451 pools
.insert(pgid
.pool());
2456 OSD::PGRefOrError
OSD::locate_asok_target(const cmdmap_t
& cmdmap
,
2461 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2462 ss
<< "no pgid specified";
2463 return OSD::PGRefOrError
{std::nullopt
, -EINVAL
};
2467 if (!pgid
.parse(pgidstr
.c_str())) {
2468 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2469 return OSD::PGRefOrError
{std::nullopt
, -EINVAL
};
2474 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) && (pg
= _lookup_lock_pg(pcand
))) {
2475 if (pg
->is_primary() || !only_primary
) {
2476 return OSD::PGRefOrError
{pg
, 0};
2479 ss
<< "not primary for pgid " << pgid
;
2481 return OSD::PGRefOrError
{std::nullopt
, -EAGAIN
};
2483 ss
<< "i don't have pgid " << pgid
;
2484 return OSD::PGRefOrError
{std::nullopt
, -ENOENT
};
2488 // note that the cmdmap is explicitly copied into asok_route_to_pg()
2489 int OSD::asok_route_to_pg(
2491 std::string_view prefix
,
2495 const bufferlist
& inbl
,
2497 std::function
<void(int, const std::string
&, bufferlist
&)> on_finish
)
2499 auto [target_pg
, ret
] = locate_asok_target(cmdmap
, ss
, only_primary
);
2501 if (!target_pg
.has_value()) {
2502 // 'ss' and 'ret' already contain the error information
2503 on_finish(ret
, ss
.str(), outbl
);
2507 // the PG was locked by locate_asok_target()
2509 (*target_pg
)->do_command(prefix
, cmdmap
, inbl
, on_finish
);
2510 (*target_pg
)->unlock();
2511 return 0; // the pg handler calls on_finish directly
2512 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2513 (*target_pg
)->unlock();
2515 on_finish(ret
, ss
.str(), outbl
);
2520 void OSD::asok_command(
2521 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2523 const bufferlist
& inbl
,
2524 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2527 stringstream ss
; // stderr error message stream
2528 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2530 // --- PG commands are routed here to PG::do_command ---
2531 if (prefix
== "pg" ||
2532 prefix
== "query" ||
2533 prefix
== "mark_unfound_lost" ||
2534 prefix
== "list_unfound" ||
2535 prefix
== "scrub" ||
2536 prefix
== "deep_scrub"
2540 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2541 ss
<< "no pgid specified";
2545 if (!pgid
.parse(pgidstr
.c_str())) {
2546 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2552 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2553 (pg
= _lookup_lock_pg(pcand
))) {
2554 if (pg
->is_primary()) {
2555 cmdmap_t new_cmdmap
= cmdmap
;
2557 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2559 return; // the pg handler calls on_finish directly
2560 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2567 ss
<< "not primary for pgid " << pgid
;
2568 // do not reply; they will get newer maps and realize they
2575 ss
<< "i don't have pgid " << pgid
;
2580 // --- PG commands that will be answered even if !primary ---
2582 else if (prefix
== "scrubdebug") {
2583 asok_route_to_pg(false, prefix
, cmdmap
, f
, ss
, inbl
, outbl
, on_finish
);
2587 // --- OSD commands follow ---
2589 else if (prefix
== "status") {
2590 lock_guard
l(osd_lock
);
2591 f
->open_object_section("status");
2592 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2593 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2594 f
->dump_unsigned("whoami", superblock
.whoami
);
2595 f
->dump_string("state", get_state_name(get_state()));
2596 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2597 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2598 f
->dump_unsigned("num_pgs", num_pgs
);
2600 } else if (prefix
== "flush_journal") {
2601 store
->flush_journal();
2602 } else if (prefix
== "dump_ops_in_flight" ||
2604 prefix
== "dump_blocked_ops" ||
2605 prefix
== "dump_historic_ops" ||
2606 prefix
== "dump_historic_ops_by_duration" ||
2607 prefix
== "dump_historic_slow_ops") {
2609 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2610 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2611 will start to track new ops received afterwards.";
2613 set
<string
> filters
;
2614 vector
<string
> filter_str
;
2615 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2616 copy(filter_str
.begin(), filter_str
.end(),
2617 inserter(filters
, filters
.end()));
2620 if (prefix
== "dump_ops_in_flight" ||
2622 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2628 if (prefix
== "dump_blocked_ops") {
2629 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2635 if (prefix
== "dump_historic_ops") {
2636 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2642 if (prefix
== "dump_historic_ops_by_duration") {
2643 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2649 if (prefix
== "dump_historic_slow_ops") {
2650 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2656 } else if (prefix
== "dump_op_pq_state") {
2657 f
->open_object_section("pq");
2658 op_shardedwq
.dump(f
);
2660 } else if (prefix
== "dump_blocklist") {
2661 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2662 list
<pair
<entity_addr_t
,utime_t
> > rbl
;
2663 OSDMapRef curmap
= service
.get_osdmap();
2664 curmap
->get_blocklist(&bl
, &rbl
);
2666 f
->open_array_section("blocklist");
2667 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2668 it
!= bl
.end(); ++it
) {
2669 f
->open_object_section("entry");
2670 f
->open_object_section("entity_addr_t");
2672 f
->close_section(); //entity_addr_t
2673 it
->second
.localtime(f
->dump_stream("expire_time"));
2674 f
->close_section(); //entry
2676 f
->close_section(); //blocklist
2677 f
->open_array_section("range_blocklist");
2678 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= rbl
.begin();
2679 it
!= rbl
.end(); ++it
) {
2680 f
->open_object_section("entry");
2681 f
->open_object_section("entity_addr_t");
2683 f
->close_section(); //entity_addr_t
2684 it
->second
.localtime(f
->dump_stream("expire_time"));
2685 f
->close_section(); //entry
2687 f
->close_section(); //blocklist
2688 } else if (prefix
== "dump_watchers") {
2689 list
<obj_watch_item_t
> watchers
;
2693 for (auto& pg
: pgs
) {
2694 list
<obj_watch_item_t
> pg_watchers
;
2695 pg
->get_watchers(&pg_watchers
);
2696 watchers
.splice(watchers
.end(), pg_watchers
);
2699 f
->open_array_section("watchers");
2700 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2701 it
!= watchers
.end(); ++it
) {
2703 f
->open_object_section("watch");
2705 f
->dump_string("namespace", it
->obj
.nspace
);
2706 f
->dump_string("object", it
->obj
.oid
.name
);
2708 f
->open_object_section("entity_name");
2709 it
->wi
.name
.dump(f
);
2710 f
->close_section(); //entity_name_t
2712 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2713 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2715 f
->open_object_section("entity_addr_t");
2716 it
->wi
.addr
.dump(f
);
2717 f
->close_section(); //entity_addr_t
2719 f
->close_section(); //watch
2722 f
->close_section(); //watchers
2723 } else if (prefix
== "dump_recovery_reservations") {
2724 f
->open_object_section("reservations");
2725 f
->open_object_section("local_reservations");
2726 service
.local_reserver
.dump(f
);
2728 f
->open_object_section("remote_reservations");
2729 service
.remote_reserver
.dump(f
);
2732 } else if (prefix
== "dump_scrub_reservations") {
2733 f
->open_object_section("scrub_reservations");
2734 service
.get_scrub_services().dump_scrub_reservations(f
);
2736 } else if (prefix
== "get_latest_osdmap") {
2737 get_latest_osdmap();
2738 } else if (prefix
== "set_heap_property") {
2742 bool success
= false;
2743 if (!cmd_getval(cmdmap
, "property", property
)) {
2744 error
= "unable to get property";
2746 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2747 error
= "unable to get value";
2749 } else if (value
< 0) {
2750 error
= "negative value not allowed";
2752 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2753 error
= "invalid property";
2758 f
->open_object_section("result");
2759 f
->dump_string("error", error
);
2760 f
->dump_bool("success", success
);
2762 } else if (prefix
== "get_heap_property") {
2766 bool success
= false;
2767 if (!cmd_getval(cmdmap
, "property", property
)) {
2768 error
= "unable to get property";
2770 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2771 error
= "invalid property";
2776 f
->open_object_section("result");
2777 f
->dump_string("error", error
);
2778 f
->dump_bool("success", success
);
2779 f
->dump_int("value", value
);
2781 } else if (prefix
== "dump_objectstore_kv_stats") {
2782 store
->get_db_statistics(f
);
2783 } else if (prefix
== "dump_scrubs") {
2784 service
.get_scrub_services().dump_scrubs(f
);
2785 } else if (prefix
== "calc_objectstore_db_histogram") {
2786 store
->generate_db_histogram(f
);
2787 } else if (prefix
== "flush_store_cache") {
2788 store
->flush_cache(&ss
);
2789 } else if (prefix
== "dump_pgstate_history") {
2790 f
->open_object_section("pgstate_history");
2791 f
->open_array_section("pgs");
2794 for (auto& pg
: pgs
) {
2795 f
->open_object_section("pg");
2796 f
->dump_stream("pg") << pg
->pg_id
;
2797 f
->dump_string("currently", pg
->get_current_state());
2798 pg
->dump_pgstate_history(f
);
2803 } else if (prefix
== "compact") {
2804 dout(1) << "triggering manual compaction" << dendl
;
2805 auto start
= ceph::coarse_mono_clock::now();
2807 auto end
= ceph::coarse_mono_clock::now();
2808 double duration
= std::chrono::duration
<double>(end
-start
).count();
2809 dout(1) << "finished manual compaction in "
2811 << " seconds" << dendl
;
2812 f
->open_object_section("compact_result");
2813 f
->dump_float("elapsed_time", duration
);
2815 } else if (prefix
== "get_mapped_pools") {
2816 f
->open_array_section("mapped_pools");
2817 set
<int64_t> poollist
= get_mapped_pools();
2818 for (auto pool
: poollist
) {
2819 f
->dump_int("pool_id", pool
);
2822 } else if (prefix
== "smart") {
2824 cmd_getval(cmdmap
, "devid", devid
);
2826 probe_smart(devid
, out
);
2827 outbl
.append(out
.str());
2828 } else if (prefix
== "list_devices") {
2829 set
<string
> devnames
;
2830 store
->get_devices(&devnames
);
2831 f
->open_array_section("list_devices");
2832 for (auto dev
: devnames
) {
2833 if (dev
.find("dm-") == 0) {
2837 f
->open_object_section("device");
2838 f
->dump_string("device", "/dev/" + dev
);
2839 f
->dump_string("device_id", get_device_id(dev
, &err
));
2843 } else if (prefix
== "send_beacon") {
2844 lock_guard
l(osd_lock
);
2846 send_beacon(ceph::coarse_mono_clock::now());
2850 else if (prefix
== "cluster_log") {
2852 cmd_getval(cmdmap
, "message", msg
);
2855 ss
<< "ignoring empty log message";
2858 string message
= msg
.front();
2859 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2860 message
+= " " + *a
;
2862 cmd_getval(cmdmap
, "level", lvl
);
2863 clog_type level
= string_to_clog_type(lvl
);
2866 ss
<< "unknown level '" << lvl
<< "'";
2869 clog
->do_log(level
, message
);
2872 else if (prefix
== "bench") {
2873 // default count 1G, size 4MB
2874 int64_t count
= cmd_getval_or
<int64_t>(cmdmap
, "count", 1LL << 30);
2875 int64_t bsize
= cmd_getval_or
<int64_t>(cmdmap
, "size", 4LL << 20);
2876 int64_t osize
= cmd_getval_or
<int64_t>(cmdmap
, "object_size", 0);
2877 int64_t onum
= cmd_getval_or
<int64_t>(cmdmap
, "object_num", 0);
2878 double elapsed
= 0.0;
2880 ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
2885 double rate
= count
/ elapsed
;
2886 double iops
= rate
/ bsize
;
2887 f
->open_object_section("osd_bench_results");
2888 f
->dump_int("bytes_written", count
);
2889 f
->dump_int("blocksize", bsize
);
2890 f
->dump_float("elapsed_sec", elapsed
);
2891 f
->dump_float("bytes_per_sec", rate
);
2892 f
->dump_float("iops", iops
);
2896 else if (prefix
== "flush_pg_stats") {
2897 mgrc
.send_pgstats();
2898 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2901 else if (prefix
== "heap") {
2902 std::stringstream outss
;
2903 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, outss
, ss
);
2904 outbl
.append(outss
);
2907 else if (prefix
== "debug dump_missing") {
2908 f
->open_array_section("pgs");
2911 for (auto& pg
: pgs
) {
2912 string s
= stringify(pg
->pg_id
);
2913 f
->open_array_section(s
.c_str());
2915 pg
->dump_missing(f
);
2922 else if (prefix
== "debug kick_recovery_wq") {
2924 cmd_getval(cmdmap
, "delay", delay
);
2927 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2929 ss
<< "kick_recovery_wq: error setting "
2930 << "osd_recovery_delay_start to '" << delay
<< "': error "
2934 cct
->_conf
.apply_changes(nullptr);
2935 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2936 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2939 else if (prefix
== "cpu_profiler") {
2942 cmd_getval(cmdmap
, "arg", arg
);
2943 vector
<string
> argvec
;
2944 get_str_vec(arg
, argvec
);
2945 cpu_profiler_handle_command(argvec
, ds
);
2946 outbl
.append(ds
.str());
2949 else if (prefix
== "dump_pg_recovery_stats") {
2950 lock_guard
l(osd_lock
);
2951 pg_recovery_stats
.dump_formatted(f
);
2954 else if (prefix
== "reset_pg_recovery_stats") {
2955 lock_guard
l(osd_lock
);
2956 pg_recovery_stats
.reset();
2959 else if (prefix
== "perf histogram dump") {
2961 std::string counter
;
2962 cmd_getval(cmdmap
, "logger", logger
);
2963 cmd_getval(cmdmap
, "counter", counter
);
2964 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2965 f
, false, logger
, counter
);
2968 else if (prefix
== "cache drop") {
2969 lock_guard
l(osd_lock
);
2970 dout(20) << "clearing all caches" << dendl
;
2971 // Clear the objectstore's cache - onode and buffer for Bluestore,
2972 // system's pagecache for Filestore
2973 ret
= store
->flush_cache(&ss
);
2975 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2978 // Clear the objectcontext cache (per PG)
2981 for (auto& pg
: pgs
) {
2986 else if (prefix
== "cache status") {
2987 lock_guard
l(osd_lock
);
2988 int obj_ctx_count
= 0;
2991 for (auto& pg
: pgs
) {
2992 obj_ctx_count
+= pg
->get_cache_obj_count();
2994 f
->open_object_section("cache_status");
2995 f
->dump_int("object_ctx", obj_ctx_count
);
2996 store
->dump_cache_stats(f
);
3000 else if (prefix
== "scrub_purged_snaps") {
3001 lock_guard
l(osd_lock
);
3002 scrub_purged_snaps();
3005 else if (prefix
== "dump_osd_network") {
3006 lock_guard
l(osd_lock
);
3008 if (!(cmd_getval(cmdmap
, "value", value
))) {
3009 // Convert milliseconds to microseconds
3010 value
= static_cast<double>(g_conf().get_val
<double>(
3011 "mon_warn_on_slow_ping_time")) * 1000;
3013 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
3014 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
3015 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
3018 // Convert user input to microseconds
3021 if (value
< 0) value
= 0;
3023 struct osd_ping_time_t
{
3027 std::array
<uint32_t,3> times
;
3028 std::array
<uint32_t,3> min
;
3029 std::array
<uint32_t,3> max
;
3031 uint32_t last_update
;
3033 bool operator<(const osd_ping_time_t
& rhs
) const {
3034 if (pingtime
< rhs
.pingtime
)
3036 if (pingtime
> rhs
.pingtime
)
3046 set
<osd_ping_time_t
> sorted
;
3047 // Get pingtimes under lock and not on the stack
3048 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3049 service
.get_hb_pingtime(pingtimes
);
3050 for (auto j
: *pingtimes
) {
3051 if (j
.second
.last_update
== 0)
3053 osd_ping_time_t item
;
3054 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3055 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3056 if (item
.pingtime
>= value
) {
3058 item
.times
[0] = j
.second
.back_pingtime
[0];
3059 item
.times
[1] = j
.second
.back_pingtime
[1];
3060 item
.times
[2] = j
.second
.back_pingtime
[2];
3061 item
.min
[0] = j
.second
.back_min
[0];
3062 item
.min
[1] = j
.second
.back_min
[1];
3063 item
.min
[2] = j
.second
.back_min
[2];
3064 item
.max
[0] = j
.second
.back_max
[0];
3065 item
.max
[1] = j
.second
.back_max
[1];
3066 item
.max
[2] = j
.second
.back_max
[2];
3067 item
.last
= j
.second
.back_last
;
3069 item
.last_update
= j
.second
.last_update
;
3070 sorted
.emplace(item
);
3072 if (j
.second
.front_last
== 0)
3074 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3075 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3076 if (item
.pingtime
>= value
) {
3078 item
.times
[0] = j
.second
.front_pingtime
[0];
3079 item
.times
[1] = j
.second
.front_pingtime
[1];
3080 item
.times
[2] = j
.second
.front_pingtime
[2];
3081 item
.min
[0] = j
.second
.front_min
[0];
3082 item
.min
[1] = j
.second
.front_min
[1];
3083 item
.min
[2] = j
.second
.front_min
[2];
3084 item
.max
[0] = j
.second
.front_max
[0];
3085 item
.max
[1] = j
.second
.front_max
[1];
3086 item
.max
[2] = j
.second
.front_max
[2];
3087 item
.last
= j
.second
.front_last
;
3088 item
.last_update
= j
.second
.last_update
;
3090 sorted
.emplace(item
);
3095 // Network ping times (1min 5min 15min)
3096 f
->open_object_section("network_ping_times");
3097 f
->dump_int("threshold", value
/ 1000);
3098 f
->open_array_section("entries");
3099 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3100 ceph_assert(sitem
.pingtime
>= value
);
3101 f
->open_object_section("entry");
3103 const time_t lu(sitem
.last_update
);
3105 string
lustr(ctime_r(&lu
, buffer
));
3106 lustr
.pop_back(); // Remove trailing \n
3107 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3108 f
->dump_string("last update", lustr
);
3109 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3110 f
->dump_int("from osd", whoami
);
3111 f
->dump_int("to osd", sitem
.to
);
3112 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3113 f
->open_object_section("average");
3114 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3115 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3116 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3117 f
->close_section(); // average
3118 f
->open_object_section("min");
3119 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3120 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3121 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3122 f
->close_section(); // min
3123 f
->open_object_section("max");
3124 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3125 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3126 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3127 f
->close_section(); // max
3128 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3129 f
->close_section(); // entry
3131 f
->close_section(); // entries
3132 f
->close_section(); // network_ping_times
3133 } else if (prefix
== "dump_pool_statfs") {
3134 lock_guard
l(osd_lock
);
3137 if (!(cmd_getval(cmdmap
, "poolid", p
))) {
3138 ss
<< "Error dumping pool statfs: no poolid provided";
3144 bool per_pool_omap_stats
= false;
3146 ret
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
3148 ss
<< "Error dumping pool statfs: " << cpp_strerror(ret
);
3151 ss
<< "dumping pool statfs...";
3152 f
->open_object_section("pool_statfs");
3153 f
->dump_int("poolid", p
);
3158 ceph_abort_msg("broken asok registration");
3162 on_finish(ret
, ss
.str(), outbl
);
3165 int OSD::run_osd_bench_test(
3174 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
3176 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
3177 // let us limit the block size because the next checks rely on it
3178 // having a sane value. If we allow any block size to be set things
3179 // can still go sideways.
3180 ss
<< "block 'size' values are capped at "
3181 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
3182 << " a higher value, please adjust 'osd_bench_max_block_size'";
3185 } else if (bsize
< (int64_t) (1 << 20)) {
3186 // entering the realm of small block sizes.
3187 // limit the count to a sane value, assuming a configurable amount of
3188 // IOPS and duration, so that the OSD doesn't get hung up on this,
3189 // preventing timeouts from going off
3191 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
3192 if (count
> max_count
) {
3193 ss
<< "'count' values greater than " << max_count
3194 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3195 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
3196 << " for " << duration
<< " seconds,"
3197 << " can cause ill effects on osd. "
3198 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3199 << " value if you wish to use a higher 'count'.";
3204 // 1MB block sizes are big enough so that we get more stuff done.
3205 // However, to avoid the osd from getting hung on this and having
3206 // timers being triggered, we are going to limit the count assuming
3207 // a configurable throughput and duration.
3208 // NOTE: max_count is the total amount of bytes that we believe we
3209 // will be able to write during 'duration' for the given
3210 // throughput. The block size hardly impacts this unless it's
3211 // way too big. Given we already check how big the block size
3212 // is, it's safe to assume everything will check out.
3214 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
3215 if (count
> max_count
) {
3216 ss
<< "'count' values greater than " << max_count
3217 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3218 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
3219 << " for " << duration
<< " seconds,"
3220 << " can cause ill effects on osd. "
3221 << " Please adjust 'osd_bench_large_size_max_throughput'"
3222 << " with a higher value if you wish to use a higher 'count'.";
3228 if (osize
&& bsize
> osize
) {
3232 dout(1) << " bench count " << count
3233 << " bsize " << byte_u_t(bsize
) << dendl
;
3235 ObjectStore::Transaction cleanupt
;
3237 if (osize
&& onum
) {
3239 bufferptr
bp(osize
);
3240 memset(bp
.c_str(), 'a', bp
.length());
3241 bl
.push_back(std::move(bp
));
3242 bl
.rebuild_page_aligned();
3243 for (int i
=0; i
<onum
; ++i
) {
3245 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
3247 hobject_t
soid(sobject_t(oid
, 0));
3248 ObjectStore::Transaction t
;
3249 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
3250 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3251 cleanupt
.remove(coll_t(), ghobject_t(soid
));
3256 bufferptr
bp(bsize
);
3257 memset(bp
.c_str(), 'a', bp
.length());
3258 bl
.push_back(std::move(bp
));
3259 bl
.rebuild_page_aligned();
3263 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3268 utime_t start
= ceph_clock_now();
3269 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
3271 unsigned offset
= 0;
3272 if (onum
&& osize
) {
3273 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
3274 offset
= rand() % (osize
/ bsize
) * bsize
;
3276 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
3279 hobject_t
soid(sobject_t(oid
, 0));
3280 ObjectStore::Transaction t
;
3281 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
3282 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3283 if (!onum
|| !osize
) {
3284 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
3290 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3294 utime_t end
= ceph_clock_now();
3295 *elapsed
= end
- start
;
3298 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), nullptr);
3301 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3309 class TestOpsSocketHook
: public AdminSocketHook
{
3310 OSDService
*service
;
3313 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3314 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3316 std::ostream
& errss
,
3317 bufferlist
& out
) override
{
3321 test_ops(service
, store
, command
, cmdmap
, outss
);
3323 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3329 void test_ops(OSDService
*service
, ObjectStore
*store
,
3330 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3334 class OSD::C_Tick
: public Context
{
3337 explicit C_Tick(OSD
*o
) : osd(o
) {}
3338 void finish(int r
) override
{
3343 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3346 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3347 void finish(int r
) override
{
3348 osd
->tick_without_osd_lock();
3352 int OSD::enable_disable_fuse(bool stop
)
3356 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3357 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3358 dout(1) << __func__
<< " disabling" << dendl
;
3362 r
= ::rmdir(mntpath
.c_str());
3365 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3366 << cpp_strerror(r
) << dendl
;
3371 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3372 dout(1) << __func__
<< " enabling" << dendl
;
3373 r
= ::mkdir(mntpath
.c_str(), 0700);
3376 if (r
< 0 && r
!= -EEXIST
) {
3377 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3378 << cpp_strerror(r
) << dendl
;
3381 fuse_store
= new FuseStore(store
.get(), mntpath
);
3382 r
= fuse_store
->start();
3384 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3390 #endif // HAVE_LIBFUSE
3394 size_t OSD::get_num_cache_shards()
3396 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3399 int OSD::get_num_op_shards()
3401 if (cct
->_conf
->osd_op_num_shards
)
3402 return cct
->_conf
->osd_op_num_shards
;
3403 if (store_is_rotational
)
3404 return cct
->_conf
->osd_op_num_shards_hdd
;
3406 return cct
->_conf
->osd_op_num_shards_ssd
;
3409 int OSD::get_num_op_threads()
3411 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3412 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3413 if (store_is_rotational
)
3414 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3416 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3419 float OSD::get_osd_recovery_sleep()
3421 if (cct
->_conf
->osd_recovery_sleep
)
3422 return cct
->_conf
->osd_recovery_sleep
;
3423 if (!store_is_rotational
&& !journal_is_rotational
)
3424 return cct
->_conf
->osd_recovery_sleep_ssd
;
3425 else if (store_is_rotational
&& !journal_is_rotational
)
3426 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3428 return cct
->_conf
->osd_recovery_sleep_hdd
;
3431 float OSD::get_osd_delete_sleep()
3433 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3434 if (osd_delete_sleep
> 0)
3435 return osd_delete_sleep
;
3436 if (!store_is_rotational
&& !journal_is_rotational
)
3437 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3438 if (store_is_rotational
&& !journal_is_rotational
)
3439 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3440 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3443 int OSD::get_recovery_max_active()
3445 if (cct
->_conf
->osd_recovery_max_active
)
3446 return cct
->_conf
->osd_recovery_max_active
;
3447 if (store_is_rotational
)
3448 return cct
->_conf
->osd_recovery_max_active_hdd
;
3450 return cct
->_conf
->osd_recovery_max_active_ssd
;
3453 float OSD::get_osd_snap_trim_sleep()
3455 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3456 if (osd_snap_trim_sleep
> 0)
3457 return osd_snap_trim_sleep
;
3458 if (!store_is_rotational
&& !journal_is_rotational
)
3459 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3460 if (store_is_rotational
&& !journal_is_rotational
)
3461 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3462 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3468 CompatSet initial
, diff
;
3469 std::lock_guard
lock(osd_lock
);
3472 tracing::osd::tracer
.init("osd");
3474 tick_timer_without_osd_lock
.init();
3475 service
.recovery_request_timer
.init();
3476 service
.sleep_timer
.init();
3478 boot_finisher
.start();
3482 store
->read_meta("require_osd_release", &val
);
3483 last_require_osd_release
= ceph_release_from_name(val
);
3487 dout(2) << "init " << dev_path
3488 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3490 dout(2) << "journal " << journal_path
<< dendl
;
3491 ceph_assert(store
); // call pre_init() first!
3493 store
->set_cache_shards(get_num_cache_shards());
3495 int rotating_auth_attempts
= 0;
3496 auto rotating_auth_timeout
=
3497 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3499 int r
= store
->mount();
3501 derr
<< "OSD:init: unable to mount object store" << dendl
;
3504 journal_is_rotational
= store
->is_journal_rotational();
3505 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3508 enable_disable_fuse(false);
3510 dout(2) << "boot" << dendl
;
3512 service
.meta_ch
= store
->open_collection(coll_t::meta());
3513 if (!service
.meta_ch
) {
3514 derr
<< "OSD:init: unable to open meta collection"
3519 // initialize the daily loadavg with current 15min loadavg
3521 if (getloadavg(loadavgs
, 3) == 3) {
3522 daily_loadavg
= loadavgs
[2];
3524 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3525 daily_loadavg
= 1.0;
3528 // sanity check long object name handling
3531 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3532 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3533 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3534 r
= store
->validate_hobject_key(l
);
3536 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3537 << "object name[space] len" << dendl
;
3538 derr
<< " osd max object name len = "
3539 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3540 derr
<< " osd max object namespace len = "
3541 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3542 derr
<< cpp_strerror(r
) << dendl
;
3543 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3546 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3549 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3554 r
= read_superblock();
3556 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3561 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3562 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3563 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3564 derr
<< " daemon features " << osd_compat
<< dendl
;
3566 if (osd_compat
.writeable(superblock
.compat_features
)) {
3567 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3568 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3573 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3574 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3580 assert_warn(whoami
== superblock
.whoami
);
3581 if (whoami
!= superblock
.whoami
) {
3582 derr
<< "OSD::init: superblock says osd"
3583 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3588 startup_time
= ceph::mono_clock::now();
3590 // load up "current" osdmap
3591 assert_warn(!get_osdmap());
3593 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3597 osdmap
= get_map(superblock
.current_epoch
);
3600 // make sure we don't have legacy pgs deleting
3603 int r
= store
->list_collections(ls
);
3604 ceph_assert(r
>= 0);
3607 if (c
.is_pg(&pgid
) &&
3608 !osdmap
->have_pg_pool(pgid
.pool())) {
3609 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3610 if (!store
->exists(service
.meta_ch
, oid
)) {
3611 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3612 << pgid
.pool() << " for pg " << pgid
3613 << "; please downgrade to luminous and allow "
3614 << "pg deletion to complete before upgrading" << dendl
;
3621 initial
= get_osd_initial_compat_set();
3622 diff
= superblock
.compat_features
.unsupported(initial
);
3623 if (superblock
.compat_features
.merge(initial
)) {
3624 // Are we adding SNAPMAPPER2?
3625 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3626 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3628 auto ch
= service
.meta_ch
;
3629 auto hoid
= make_snapmapper_oid();
3630 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3631 r
= SnapMapper::convert_legacy(cct
, store
.get(), ch
, hoid
, max
);
3635 // We need to persist the new compat_set before we
3637 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3638 ObjectStore::Transaction t
;
3639 write_superblock(t
);
3640 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3645 // make sure snap mapper object exists
3646 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3647 dout(10) << "init creating/touching snapmapper object" << dendl
;
3648 ObjectStore::Transaction t
;
3649 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3650 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3654 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3655 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3656 ObjectStore::Transaction t
;
3657 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3658 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3663 if (cct
->_conf
->osd_open_classes_on_start
) {
3664 int r
= ClassHandler::get_instance().open_all_classes();
3666 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3669 check_osdmap_features();
3672 epoch_t bind_epoch
= osdmap
->get_epoch();
3673 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3676 clear_temp_objects();
3678 // initialize osdmap references in sharded wq
3679 for (auto& shard
: shards
) {
3680 std::lock_guard
l(shard
->osdmap_lock
);
3681 shard
->shard_osdmap
= osdmap
;
3684 // load up pgs (as they previously existed)
3687 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3689 if (cct
->_conf
.get_val
<bool>("osd_compact_on_start")) {
3690 dout(2) << "compacting object store's omap" << dendl
;
3696 struct store_statfs_t stbuf
;
3697 osd_alert_list_t alerts
;
3698 int r
= store
->statfs(&stbuf
, &alerts
);
3699 ceph_assert(r
== 0);
3700 service
.set_statfs(stbuf
, alerts
);
3703 // client_messenger's auth_client will be set up by monc->init() later.
3704 for (auto m
: { cluster_messenger
,
3706 hb_front_client_messenger
,
3707 hb_back_client_messenger
,
3708 hb_front_server_messenger
,
3709 hb_back_server_messenger
} ) {
3710 m
->set_auth_client(monc
);
3712 for (auto m
: { client_messenger
,
3714 hb_front_server_messenger
,
3715 hb_back_server_messenger
}) {
3716 m
->set_auth_server(monc
);
3718 monc
->set_handle_authentication_dispatcher(this);
3720 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3721 | CEPH_ENTITY_TYPE_MGR
);
3726 mgrc
.set_pgstats_cb([this]() { return collect_pg_stats(); });
3727 mgrc
.set_perf_metric_query_cb(
3728 [this](const ConfigPayload
&config_payload
) {
3729 set_perf_queries(config_payload
);
3732 return get_perf_reports();
3736 // tell monc about log_client so it will know about mon session resets
3737 monc
->set_log_client(&log_client
);
3738 update_log_config();
3741 client_messenger
->add_dispatcher_tail(&mgrc
);
3742 client_messenger
->add_dispatcher_tail(this);
3743 cluster_messenger
->add_dispatcher_head(this);
3745 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3746 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3747 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3748 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3750 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3753 service
.publish_map(osdmap
);
3754 service
.publish_superblock(superblock
);
3755 service
.max_oldest_map
= superblock
.oldest_map
;
3757 for (auto& shard
: shards
) {
3758 // put PGs in a temporary set because we may modify pg_slots
3759 // unordered_map below.
3761 for (auto& i
: shard
->pg_slots
) {
3762 PGRef pg
= i
.second
->pg
;
3768 for (auto pg
: pgs
) {
3769 std::scoped_lock l
{*pg
};
3770 set
<pair
<spg_t
,epoch_t
>> new_children
;
3771 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3772 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3773 &new_children
, &merge_pgs
);
3774 if (!new_children
.empty()) {
3775 for (auto shard
: shards
) {
3776 shard
->prime_splits(osdmap
, &new_children
);
3778 assert(new_children
.empty());
3780 if (!merge_pgs
.empty()) {
3781 for (auto shard
: shards
) {
3782 shard
->prime_merges(osdmap
, &merge_pgs
);
3784 assert(merge_pgs
.empty());
3791 // start the heartbeat
3792 heartbeat_thread
.create("osd_srv_heartbt");
3795 tick_timer
.add_event_after(get_tick_interval(),
3798 std::lock_guard
l(tick_timer_lock
);
3799 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3800 new C_Tick_WithoutOSDLock(this));
3805 r
= monc
->authenticate();
3807 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3812 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3813 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3814 ++rotating_auth_attempts
;
3815 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3816 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3821 r
= update_crush_device_class();
3823 derr
<< __func__
<< " unable to update_crush_device_class: "
3824 << cpp_strerror(r
) << dendl
;
3828 r
= update_crush_location();
3830 derr
<< __func__
<< " unable to update_crush_location: "
3831 << cpp_strerror(r
) << dendl
;
3839 // start objecter *after* we have authenticated, so that we don't ignore
3840 // the OSDMaps it requests.
3841 service
.final_init();
3845 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3848 dout(0) << "done with init, starting boot process" << dendl
;
3850 // subscribe to any pg creations
3851 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3853 // MgrClient needs this (it doesn't have MonClient reference itself)
3854 monc
->sub_want("mgrmap", 0, 0);
3856 // we don't need to ask for an osdmap here; objecter will
3857 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3863 // Override a few options if mclock scheduler is enabled.
3864 maybe_override_max_osd_capacity_for_qos();
3865 maybe_override_options_for_qos();
3870 enable_disable_fuse(true);
3876 void OSD::final_init()
3878 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3879 asok_hook
= new OSDSocketHook(this);
3880 int r
= admin_socket
->register_command("status", asok_hook
,
3881 "high-level status of OSD");
3882 ceph_assert(r
== 0);
3883 r
= admin_socket
->register_command("flush_journal",
3885 "flush the journal to permanent store");
3886 ceph_assert(r
== 0);
3887 r
= admin_socket
->register_command("dump_ops_in_flight " \
3888 "name=filterstr,type=CephString,n=N,req=false",
3890 "show the ops currently in flight");
3891 ceph_assert(r
== 0);
3892 r
= admin_socket
->register_command("ops " \
3893 "name=filterstr,type=CephString,n=N,req=false",
3895 "show the ops currently in flight");
3896 ceph_assert(r
== 0);
3897 r
= admin_socket
->register_command("dump_blocked_ops " \
3898 "name=filterstr,type=CephString,n=N,req=false",
3900 "show the blocked ops currently in flight");
3901 ceph_assert(r
== 0);
3902 r
= admin_socket
->register_command("dump_historic_ops " \
3903 "name=filterstr,type=CephString,n=N,req=false",
3906 ceph_assert(r
== 0);
3907 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3908 "name=filterstr,type=CephString,n=N,req=false",
3910 "show slowest recent ops");
3911 ceph_assert(r
== 0);
3912 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3913 "name=filterstr,type=CephString,n=N,req=false",
3915 "show slowest recent ops, sorted by duration");
3916 ceph_assert(r
== 0);
3917 r
= admin_socket
->register_command("dump_op_pq_state",
3919 "dump op queue state");
3920 ceph_assert(r
== 0);
3921 r
= admin_socket
->register_command("dump_blocklist",
3923 "dump blocklisted clients and times");
3924 ceph_assert(r
== 0);
3925 r
= admin_socket
->register_command("dump_watchers",
3927 "show clients which have active watches,"
3928 " and on which objects");
3929 ceph_assert(r
== 0);
3930 r
= admin_socket
->register_command("dump_recovery_reservations",
3932 "show recovery reservations");
3933 ceph_assert(r
== 0);
3934 r
= admin_socket
->register_command("dump_scrub_reservations",
3936 "show scrub reservations");
3937 ceph_assert(r
== 0);
3938 r
= admin_socket
->register_command("get_latest_osdmap",
3940 "force osd to update the latest map from "
3942 ceph_assert(r
== 0);
3944 r
= admin_socket
->register_command("set_heap_property " \
3945 "name=property,type=CephString " \
3946 "name=value,type=CephInt",
3948 "update malloc extension heap property");
3949 ceph_assert(r
== 0);
3951 r
= admin_socket
->register_command("get_heap_property " \
3952 "name=property,type=CephString",
3954 "get malloc extension heap property");
3955 ceph_assert(r
== 0);
3957 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3959 "print statistics of kvdb which used by bluestore");
3960 ceph_assert(r
== 0);
3962 r
= admin_socket
->register_command("dump_scrubs",
3964 "print scheduled scrubs");
3965 ceph_assert(r
== 0);
3967 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3969 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3970 ceph_assert(r
== 0);
3972 r
= admin_socket
->register_command("flush_store_cache",
3974 "Flush bluestore internal cache");
3975 ceph_assert(r
== 0);
3976 r
= admin_socket
->register_command("dump_pgstate_history",
3978 "show recent state history");
3979 ceph_assert(r
== 0);
3981 r
= admin_socket
->register_command("compact",
3983 "Commpact object store's omap."
3984 " WARNING: Compaction probably slows your requests");
3985 ceph_assert(r
== 0);
3987 r
= admin_socket
->register_command("get_mapped_pools",
3989 "dump pools whose PG(s) are mapped to this OSD.");
3991 ceph_assert(r
== 0);
3993 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3995 "probe OSD devices for SMART data.");
3997 ceph_assert(r
== 0);
3999 r
= admin_socket
->register_command("list_devices",
4001 "list OSD devices.");
4002 r
= admin_socket
->register_command("send_beacon",
4004 "send OSD beacon to mon immediately");
4006 r
= admin_socket
->register_command(
4007 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
4008 "Dump osd heartbeat network ping times");
4009 ceph_assert(r
== 0);
4011 r
= admin_socket
->register_command(
4012 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook
,
4013 "Dump store's statistics for the given pool");
4014 ceph_assert(r
== 0);
4016 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
.get());
4017 // Note: pools are CephString instead of CephPoolname because
4018 // these commands traditionally support both pool names and numbers
4019 r
= admin_socket
->register_command(
4021 "name=pool,type=CephString " \
4022 "name=objname,type=CephObjectname " \
4023 "name=key,type=CephString "\
4024 "name=val,type=CephString",
4027 ceph_assert(r
== 0);
4028 r
= admin_socket
->register_command(
4030 "name=pool,type=CephString " \
4031 "name=objname,type=CephObjectname " \
4032 "name=key,type=CephString",
4035 ceph_assert(r
== 0);
4036 r
= admin_socket
->register_command(
4038 "name=pool,type=CephString " \
4039 "name=objname,type=CephObjectname " \
4040 "name=header,type=CephString",
4043 ceph_assert(r
== 0);
4045 r
= admin_socket
->register_command(
4047 "name=pool,type=CephString " \
4048 "name=objname,type=CephObjectname",
4050 "output entire object map");
4051 ceph_assert(r
== 0);
4053 r
= admin_socket
->register_command(
4055 "name=pool,type=CephString " \
4056 "name=objname,type=CephObjectname " \
4057 "name=len,type=CephInt",
4059 "truncate object to length");
4060 ceph_assert(r
== 0);
4062 r
= admin_socket
->register_command(
4064 "name=pool,type=CephString " \
4065 "name=objname,type=CephObjectname " \
4066 "name=shardid,type=CephInt,req=false,range=0|255",
4068 "inject data error to an object");
4069 ceph_assert(r
== 0);
4071 r
= admin_socket
->register_command(
4073 "name=pool,type=CephString " \
4074 "name=objname,type=CephObjectname " \
4075 "name=shardid,type=CephInt,req=false,range=0|255",
4077 "inject metadata error to an object");
4078 ceph_assert(r
== 0);
4079 r
= admin_socket
->register_command(
4080 "set_recovery_delay " \
4081 "name=utime,type=CephInt,req=false",
4083 "Delay osd recovery by specified seconds");
4084 ceph_assert(r
== 0);
4085 r
= admin_socket
->register_command(
4087 "name=type,type=CephString,req=false " \
4088 "name=count,type=CephInt,req=false ",
4090 "Inject a full disk (optional count times)");
4091 ceph_assert(r
== 0);
4092 r
= admin_socket
->register_command(
4094 "name=count,type=CephInt,req=false " \
4095 "name=size,type=CephInt,req=false " \
4096 "name=object_size,type=CephInt,req=false " \
4097 "name=object_num,type=CephInt,req=false ",
4099 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4100 "(default count=1G default size=4MB). Results in log.");
4101 ceph_assert(r
== 0);
4102 r
= admin_socket
->register_command(
4104 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4105 "name=message,type=CephString,n=N",
4107 "log a message to the cluster log");
4108 ceph_assert(r
== 0);
4109 r
= admin_socket
->register_command(
4113 ceph_assert(r
== 0);
4114 r
= admin_socket
->register_command(
4116 "name=heapcmd,type=CephChoices,strings=" \
4117 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4118 "name=value,type=CephString,req=false",
4120 "show heap usage info (available only if compiled with tcmalloc)");
4121 ceph_assert(r
== 0);
4122 r
= admin_socket
->register_command(
4123 "debug dump_missing " \
4124 "name=filename,type=CephFilepath",
4126 "dump missing objects to a named file");
4127 ceph_assert(r
== 0);
4128 r
= admin_socket
->register_command(
4129 "debug kick_recovery_wq " \
4130 "name=delay,type=CephInt,range=0",
4132 "set osd_recovery_delay_start to <val>");
4133 ceph_assert(r
== 0);
4134 r
= admin_socket
->register_command(
4136 "name=arg,type=CephChoices,strings=status|flush",
4138 "run cpu profiling on daemon");
4139 ceph_assert(r
== 0);
4140 r
= admin_socket
->register_command(
4141 "dump_pg_recovery_stats",
4143 "dump pg recovery statistics");
4144 ceph_assert(r
== 0);
4145 r
= admin_socket
->register_command(
4146 "reset_pg_recovery_stats",
4148 "reset pg recovery statistics");
4149 ceph_assert(r
== 0);
4150 r
= admin_socket
->register_command(
4153 "Drop all OSD caches");
4154 ceph_assert(r
== 0);
4155 r
= admin_socket
->register_command(
4158 "Get OSD caches statistics");
4159 ceph_assert(r
== 0);
4160 r
= admin_socket
->register_command(
4161 "scrub_purged_snaps",
4163 "Scrub purged_snaps vs snapmapper index");
4164 ceph_assert(r
== 0);
4165 r
= admin_socket
->register_command(
4167 "name=pgid,type=CephPgid " \
4168 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4169 "name=value,type=CephString,req=false",
4171 "debug the scrubber");
4172 ceph_assert(r
== 0);
4174 // -- pg commands --
4175 // old form: ceph pg <pgid> command ...
4176 r
= admin_socket
->register_command(
4178 "name=pgid,type=CephPgid " \
4179 "name=cmd,type=CephChoices,strings=query",
4182 ceph_assert(r
== 0);
4183 r
= admin_socket
->register_command(
4185 "name=pgid,type=CephPgid " \
4186 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4187 "name=mulcmd,type=CephChoices,strings=revert|delete",
4190 ceph_assert(r
== 0);
4191 r
= admin_socket
->register_command(
4193 "name=pgid,type=CephPgid " \
4194 "name=cmd,type=CephChoices,strings=list_unfound " \
4195 "name=offset,type=CephString,req=false",
4198 ceph_assert(r
== 0);
4199 r
= admin_socket
->register_command(
4201 "name=pgid,type=CephPgid " \
4202 "name=cmd,type=CephChoices,strings=scrub " \
4203 "name=time,type=CephInt,req=false",
4206 ceph_assert(r
== 0);
4207 r
= admin_socket
->register_command(
4209 "name=pgid,type=CephPgid " \
4210 "name=cmd,type=CephChoices,strings=deep_scrub " \
4211 "name=time,type=CephInt,req=false",
4214 ceph_assert(r
== 0);
4215 // new form: tell <pgid> <cmd> for both cli and rest
4216 r
= admin_socket
->register_command(
4219 "show details of a specific pg");
4220 ceph_assert(r
== 0);
4221 r
= admin_socket
->register_command(
4222 "mark_unfound_lost " \
4223 "name=pgid,type=CephPgid,req=false " \
4224 "name=mulcmd,type=CephChoices,strings=revert|delete",
4226 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4227 ceph_assert(r
== 0);
4228 r
= admin_socket
->register_command(
4230 "name=pgid,type=CephPgid,req=false " \
4231 "name=offset,type=CephString,req=false",
4233 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4234 ceph_assert(r
== 0);
4235 r
= admin_socket
->register_command(
4237 "name=pgid,type=CephPgid,req=false " \
4238 "name=time,type=CephInt,req=false",
4240 "Trigger a scheduled scrub ");
4241 ceph_assert(r
== 0);
4242 r
= admin_socket
->register_command(
4244 "name=pgid,type=CephPgid,req=false " \
4245 "name=time,type=CephInt,req=false",
4247 "Trigger a scheduled deep scrub ");
4248 ceph_assert(r
== 0);
4251 PerfCounters
* OSD::create_logger()
4253 PerfCounters
* logger
= build_osd_logger(cct
);
4254 cct
->get_perfcounters_collection()->add(logger
);
4258 PerfCounters
* OSD::create_recoverystate_perf()
4260 PerfCounters
* recoverystate_perf
= build_recoverystate_perf(cct
);
4261 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4262 return recoverystate_perf
;
4267 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4268 //cct->_conf->osd_fast_shutdown = true;
4270 dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
4271 << cct
->_conf
->osd_fast_shutdown
4272 << ", null-fm = " << store
->has_null_manager() << dendl
;
4274 utime_t start_time_func
= ceph_clock_now();
4276 if (cct
->_conf
->osd_fast_shutdown
) {
4277 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4278 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4279 service
.prepare_to_stop();
4281 // There is no state we need to keep wehn running in NULL-FM moode
4282 if (!store
->has_null_manager()) {
4286 } else if (!service
.prepare_to_stop()) {
4287 return 0; // already shutting down
4291 if (is_stopping()) {
4296 if (!cct
->_conf
->osd_fast_shutdown
) {
4297 dout(0) << "shutdown" << dendl
;
4300 // don't accept new task for this OSD
4301 set_state(STATE_STOPPING
);
4303 // Disabled debugging during fast-shutdown
4304 if (!cct
->_conf
->osd_fast_shutdown
&& cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4305 cct
->_conf
.set_val("debug_osd", "100");
4306 cct
->_conf
.set_val("debug_journal", "100");
4307 cct
->_conf
.set_val("debug_filestore", "100");
4308 cct
->_conf
.set_val("debug_bluestore", "100");
4309 cct
->_conf
.set_val("debug_ms", "100");
4310 cct
->_conf
.apply_changes(nullptr);
4313 if (cct
->_conf
->osd_fast_shutdown
) {
4314 // first, stop new task from being taken from op_shardedwq
4315 // and clear all pending tasks
4316 op_shardedwq
.stop_for_fast_shutdown();
4318 utime_t start_time_timer
= ceph_clock_now();
4319 tick_timer
.shutdown();
4321 std::lock_guard
l(tick_timer_lock
);
4322 tick_timer_without_osd_lock
.shutdown();
4326 utime_t start_time_osd_drain
= ceph_clock_now();
4328 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4332 utime_t start_time_umount
= ceph_clock_now();
4333 store
->prepare_for_fast_shutdown();
4334 std::lock_guard
lock(osd_lock
);
4335 // TBD: assert in allocator that nothing is being add
4338 utime_t end_time
= ceph_clock_now();
4339 if (cct
->_conf
->osd_fast_shutdown_timeout
) {
4340 ceph_assert(end_time
- start_time_func
< cct
->_conf
->osd_fast_shutdown_timeout
);
4342 dout(0) <<"Fast Shutdown duration total :" << end_time
- start_time_func
<< " seconds" << dendl
;
4343 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount
- start_time_osd_drain
<< " seconds" << dendl
;
4344 dout(0) <<"Fast Shutdown duration umount :" << end_time
- start_time_umount
<< " seconds" << dendl
;
4345 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain
- start_time_timer
<< " seconds" << dendl
;
4348 // now it is safe to exit
4352 // stop MgrClient earlier as it's more like an internal consumer of OSD
4355 service
.start_shutdown();
4357 // stop sending work to pgs. this just prevents any new work in _process
4358 // from racing with on_shutdown and potentially entering the pg after.
4359 op_shardedwq
.drain();
4365 for (auto pg
: pgs
) {
4370 // drain op queue again (in case PGs requeued something)
4371 op_shardedwq
.drain();
4373 finished
.clear(); // zap waiters (bleh, this is messy)
4374 waiting_for_osdmap
.clear();
4377 // unregister commands
4378 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4382 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4383 delete test_ops_hook
;
4384 test_ops_hook
= NULL
;
4389 std::lock_guard l
{heartbeat_lock
};
4390 heartbeat_stop
= true;
4391 heartbeat_cond
.notify_all();
4392 heartbeat_peers
.clear();
4394 heartbeat_thread
.join();
4396 hb_back_server_messenger
->mark_down_all();
4397 hb_front_server_messenger
->mark_down_all();
4398 hb_front_client_messenger
->mark_down_all();
4399 hb_back_client_messenger
->mark_down_all();
4403 dout(10) << "op sharded tp stopped" << dendl
;
4405 dout(10) << "stopping agent" << dendl
;
4406 service
.agent_stop();
4408 boot_finisher
.wait_for_empty();
4412 boot_finisher
.stop();
4413 reset_heartbeat_peers(true);
4415 tick_timer
.shutdown();
4418 std::lock_guard
l(tick_timer_lock
);
4419 tick_timer_without_osd_lock
.shutdown();
4422 // note unmount epoch
4423 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4424 superblock
.mounted
= service
.get_boot_epoch();
4425 superblock
.clean_thru
= get_osdmap_epoch();
4426 ObjectStore::Transaction t
;
4427 write_superblock(t
);
4428 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4430 derr
<< "OSD::shutdown: error writing superblock: "
4431 << cpp_strerror(r
) << dendl
;
4435 service
.shutdown_reserver();
4438 #ifdef PG_DEBUG_REFS
4439 service
.dump_live_pgids();
4443 _get_pgs(&pgs
, true);
4447 for (auto& pg
: pgs
) {
4448 if (pg
->is_deleted()) {
4451 dout(20) << " kicking pg " << pg
<< dendl
;
4453 if (pg
->get_num_ref() != 1) {
4454 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4455 << pg
->get_num_ref() << dendl
;
4456 #ifdef PG_DEBUG_REFS
4457 pg
->dump_live_ids();
4459 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4467 #ifdef PG_DEBUG_REFS
4468 service
.dump_live_pgids();
4472 cct
->_conf
.remove_observer(this);
4475 service
.meta_ch
.reset();
4477 dout(10) << "syncing store" << dendl
;
4478 enable_disable_fuse(true);
4480 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4481 dout(10) << "flushing journal" << dendl
;
4482 store
->flush_journal();
4488 std::unique_lock l
{map_lock
};
4489 set_osdmap(OSDMapRef());
4491 for (auto s
: shards
) {
4492 std::lock_guard
l(s
->osdmap_lock
);
4493 s
->shard_osdmap
= OSDMapRef();
4497 std::lock_guard
lock(osd_lock
);
4500 dout(10) << "Store synced" << dendl
;
4502 op_tracker
.on_shutdown();
4504 ClassHandler::get_instance().shutdown();
4505 client_messenger
->shutdown();
4506 cluster_messenger
->shutdown();
4507 hb_front_client_messenger
->shutdown();
4508 hb_back_client_messenger
->shutdown();
4509 objecter_messenger
->shutdown();
4510 hb_front_server_messenger
->shutdown();
4511 hb_back_server_messenger
->shutdown();
4513 utime_t duration
= ceph_clock_now() - start_time_func
;
4514 dout(0) <<"Slow Shutdown duration:" << duration
<< " seconds" << dendl
;
4516 tracing::osd::tracer
.shutdown();
4521 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4523 bool created
= false;
4525 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4526 vector
<string
> vcmd
{cmd
};
4530 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4533 if (r
== -ENOENT
&& !created
) {
4534 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4535 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4536 vector
<string
> vnewcmd
{newcmd
};
4540 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4543 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4544 << cpp_strerror(r
) << dendl
;
4550 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4559 int OSD::update_crush_location()
4561 if (!cct
->_conf
->osd_crush_update_on_start
) {
4562 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4567 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4568 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4570 struct store_statfs_t st
;
4571 osd_alert_list_t alerts
;
4572 int r
= store
->statfs(&st
, &alerts
);
4574 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4577 snprintf(weight
, sizeof(weight
), "%.4lf",
4580 double(1ull << 40 /* TB */)));
4583 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4586 string("{\"prefix\": \"osd crush create-or-move\", ") +
4587 string("\"id\": ") + stringify(whoami
) + ", " +
4588 string("\"weight\":") + weight
+ ", " +
4589 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4590 return mon_cmd_maybe_osd_create(cmd
);
4593 int OSD::update_crush_device_class()
4595 if (!cct
->_conf
->osd_class_update_on_start
) {
4596 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4600 string device_class
;
4601 int r
= store
->read_meta("crush_device_class", &device_class
);
4602 if (r
< 0 || device_class
.empty()) {
4603 device_class
= store
->get_default_device_class();
4606 if (device_class
.empty()) {
4607 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4612 string("{\"prefix\": \"osd crush set-device-class\", ") +
4613 string("\"class\": \"") + device_class
+ string("\", ") +
4614 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4616 r
= mon_cmd_maybe_osd_create(cmd
);
4618 // good, already bound to a device-class
4625 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4627 dout(10) << "write_superblock " << superblock
<< dendl
;
4629 //hack: at minimum it's using the baseline feature set
4630 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4631 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4634 encode(superblock
, bl
);
4635 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4638 int OSD::read_superblock()
4641 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4645 auto p
= bl
.cbegin();
4646 decode(superblock
, p
);
4648 dout(10) << "read_superblock " << superblock
<< dendl
;
4653 void OSD::clear_temp_objects()
4655 dout(10) << __func__
<< dendl
;
4657 store
->list_collections(ls
);
4658 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4660 if (!p
->is_pg(&pgid
))
4663 // list temp objects
4664 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4666 vector
<ghobject_t
> temps
;
4669 vector
<ghobject_t
> objects
;
4670 auto ch
= store
->open_collection(*p
);
4672 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4673 store
->get_ideal_list_max(),
4675 if (objects
.empty())
4677 vector
<ghobject_t
>::iterator q
;
4678 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4679 // Hammer set pool for temps to -1, so check for clean-up
4680 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4681 temps
.push_back(*q
);
4686 // If we saw a non-temp object and hit the break above we can
4687 // break out of the while loop too.
4688 if (q
!= objects
.end())
4691 if (!temps
.empty()) {
4692 ObjectStore::Transaction t
;
4694 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4695 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4697 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4698 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4699 t
= ObjectStore::Transaction();
4704 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4710 void OSD::recursive_remove_collection(CephContext
* cct
,
4711 ObjectStore
*store
, spg_t pgid
,
4717 make_snapmapper_oid());
4719 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4720 ObjectStore::Transaction t
;
4721 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4724 int max
= cct
->_conf
->osd_target_transaction_size
;
4725 vector
<ghobject_t
> objects
;
4726 objects
.reserve(max
);
4729 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4730 max
, &objects
, &next
);
4731 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4732 if (objects
.empty())
4734 for (auto& p
: objects
) {
4735 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4736 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4737 if (r
!= 0 && r
!= -ENOENT
)
4741 int r
= store
->queue_transaction(ch
, std::move(t
));
4742 ceph_assert(r
== 0);
4743 t
= ObjectStore::Transaction();
4745 t
.remove_collection(tmp
);
4746 int r
= store
->queue_transaction(ch
, std::move(t
));
4747 ceph_assert(r
== 0);
4750 if (!ch
->flush_commit(&waiter
)) {
4756 // ======================================================
4760 OSDMapRef createmap
,
4763 dout(10) << __func__
<< " " << pgid
<< dendl
;
4765 map
<string
,string
> ec_profile
;
4767 if (createmap
->have_pg_pool(pgid
.pool())) {
4768 pi
= *createmap
->get_pg_pool(pgid
.pool());
4769 name
= createmap
->get_pool_name(pgid
.pool());
4770 if (pi
.is_erasure()) {
4771 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4774 // pool was deleted; grab final pg_pool_t off disk.
4775 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4777 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4779 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4783 ceph_assert(r
>= 0);
4784 auto p
= bl
.cbegin();
4787 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4788 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4789 << " tombstone" << dendl
;
4792 decode(ec_profile
, p
);
4794 PGPool
pool(createmap
, pgid
.pool(), pi
, name
);
4796 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4797 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4798 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4804 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4807 v
->reserve(get_num_pgs());
4808 for (auto& s
: shards
) {
4809 std::lock_guard
l(s
->shard_lock
);
4810 for (auto& j
: s
->pg_slots
) {
4812 !j
.second
->pg
->is_deleted()) {
4813 v
->push_back(j
.second
->pg
);
4815 s
->_detach_pg(j
.second
.get());
4822 void OSD::_get_pgids(vector
<spg_t
> *v
)
4825 v
->reserve(get_num_pgs());
4826 for (auto& s
: shards
) {
4827 std::lock_guard
l(s
->shard_lock
);
4828 for (auto& j
: s
->pg_slots
) {
4830 !j
.second
->pg
->is_deleted()) {
4831 v
->push_back(j
.first
);
4837 void OSD::register_pg(PGRef pg
)
4839 spg_t pgid
= pg
->get_pgid();
4840 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4841 auto sdata
= shards
[shard_index
];
4842 std::lock_guard
l(sdata
->shard_lock
);
4843 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4844 ceph_assert(r
.second
);
4845 auto *slot
= r
.first
->second
.get();
4846 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4847 sdata
->_attach_pg(slot
, pg
.get());
4850 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4852 auto sdata
= pg
->osd_shard
;
4855 std::lock_guard
l(sdata
->shard_lock
);
4856 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4857 if (p
== sdata
->pg_slots
.end() ||
4859 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4862 if (p
->second
->waiting_for_merge_epoch
) {
4863 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4866 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4867 sdata
->_detach_pg(p
->second
.get());
4870 for (auto shard
: shards
) {
4871 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4874 // update pg count now since we might not get an osdmap any time soon.
4875 if (pg
->is_primary())
4876 service
.logger
->dec(l_osd_pg_primary
);
4877 else if (pg
->is_nonprimary())
4878 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4880 service
.logger
->dec(l_osd_pg_stray
);
4885 PGRef
OSD::_lookup_pg(spg_t pgid
)
4887 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4888 auto sdata
= shards
[shard_index
];
4889 std::lock_guard
l(sdata
->shard_lock
);
4890 auto p
= sdata
->pg_slots
.find(pgid
);
4891 if (p
== sdata
->pg_slots
.end()) {
4894 return p
->second
->pg
;
4897 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4899 PGRef pg
= _lookup_pg(pgid
);
4904 if (!pg
->is_deleted()) {
4911 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4913 return _lookup_lock_pg(pgid
);
4916 void OSD::load_pgs()
4918 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4919 dout(0) << "load_pgs" << dendl
;
4922 auto pghist
= make_pg_num_history_oid();
4924 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4925 if (r
>= 0 && bl
.length() > 0) {
4926 auto p
= bl
.cbegin();
4927 decode(pg_num_history
, p
);
4929 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4933 int r
= store
->list_collections(ls
);
4935 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4939 for (vector
<coll_t
>::iterator it
= ls
.begin();
4943 if (it
->is_temp(&pgid
) ||
4944 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
.get(), pgid
))) {
4945 dout(10) << "load_pgs " << *it
4946 << " removing, legacy or flagged for removal pg" << dendl
;
4947 recursive_remove_collection(cct
, store
.get(), pgid
, *it
);
4951 if (!it
->is_pg(&pgid
)) {
4952 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4956 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4957 epoch_t map_epoch
= 0;
4958 int r
= PG::peek_map_epoch(store
.get(), pgid
, &map_epoch
);
4960 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4966 if (map_epoch
> 0) {
4967 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4969 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4970 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4971 << " on pg " << pgid
<< ", but the pool is not present in the "
4972 << "current map, so this is probably a result of bug 10617. "
4973 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4974 << "to clean it up later." << dendl
;
4977 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4978 << map_epoch
<< ", but missing map. Crashing."
4980 ceph_abort_msg("Missing map in load_pgs");
4983 pg
= _make_pg(pgosdmap
, pgid
);
4985 pg
= _make_pg(get_osdmap(), pgid
);
4988 recursive_remove_collection(cct
, store
.get(), pgid
, *it
);
4992 // there can be no waiters here, so we don't call _wake_pg_slot
4995 pg
->ch
= store
->open_collection(pg
->coll
);
4997 // read pg state, log
4998 pg
->read_state(store
.get());
5001 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
5004 recursive_remove_collection(cct
, store
.get(), pgid
, *it
);
5008 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
5009 assert(NULL
!= shards
[shard_index
]);
5010 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
5013 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
5019 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
5023 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
5024 const PGCreateInfo
*info
)
5026 spg_t pgid
= info
->pgid
;
5028 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
5029 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
5033 OSDMapRef startmap
= get_map(info
->epoch
);
5036 int64_t pool_id
= pgid
.pgid
.pool();
5037 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
5039 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
5042 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
5043 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
5044 // this ensures we do not process old creating messages after the
5045 // pool's initial pgs have been created (and pg are subsequently
5046 // allowed to split or merge).
5047 dout(20) << __func__
<< " dropping " << pgid
5048 << "create, pool does not have CREATING flag set" << dendl
;
5053 int up_primary
, acting_primary
;
5054 vector
<int> up
, acting
;
5055 startmap
->pg_to_up_acting_osds(
5056 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
5058 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
5059 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
5060 store
->get_type() != "bluestore") {
5061 clog
->warn() << "pg " << pgid
5062 << " is at risk of silent data corruption: "
5063 << "the pool allows ec overwrites but is not stored in "
5064 << "bluestore, so deep scrubbing will not detect bitrot";
5067 create_pg_collection(
5068 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
5069 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
5071 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
5073 PGRef pg
= _make_pg(startmap
, pgid
);
5074 pg
->ch
= store
->create_new_collection(pg
->coll
);
5077 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
5078 assert(NULL
!= shards
[shard_index
]);
5079 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
5084 // we are holding the shard lock
5085 ceph_assert(!pg
->is_deleted());
5094 info
->past_intervals
,
5097 pg
->init_collection_pool_opts();
5099 if (pg
->is_primary()) {
5100 std::lock_guard locker
{m_perf_queries_lock
};
5101 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
5104 pg
->handle_initialize(rctx
);
5105 pg
->handle_activate_map(rctx
);
5107 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
5109 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
5113 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
5117 const auto max_pgs_per_osd
=
5118 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
5119 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
5121 if (num_pgs
< max_pgs_per_osd
) {
5125 std::lock_guard
l(pending_creates_lock
);
5126 if (is_mon_create
) {
5127 pending_creates_from_mon
++;
5129 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
5130 pending_creates_from_osd
.emplace(pgid
, is_primary
);
5132 dout(1) << __func__
<< " withhold creation of pg " << pgid
5133 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
//
// For a multi-OSD acting set, shrink it to just the first OSD; for a
// single-OSD (or empty) set, append -1 so the pg_temp is never empty.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  if (acting.size() > 1) {
    return {acting[0]};
  } else {
    std::vector<int32_t> twiddled(acting.begin(), acting.end());
    twiddled.push_back(-1);
    return twiddled;
  }
}
5150 void OSD::resume_creating_pg()
5152 bool do_sub_pg_creates
= false;
5153 bool have_pending_creates
= false;
5155 const auto max_pgs_per_osd
=
5156 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
5157 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
5158 if (max_pgs_per_osd
<= num_pgs
) {
5159 // this could happen if admin decreases this setting before a PG is removed
5162 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
5163 std::lock_guard
l(pending_creates_lock
);
5164 if (pending_creates_from_mon
> 0) {
5165 dout(20) << __func__
<< " pending_creates_from_mon "
5166 << pending_creates_from_mon
<< dendl
;
5167 do_sub_pg_creates
= true;
5168 if (pending_creates_from_mon
>= spare_pgs
) {
5169 spare_pgs
= pending_creates_from_mon
= 0;
5171 spare_pgs
-= pending_creates_from_mon
;
5172 pending_creates_from_mon
= 0;
5175 auto pg
= pending_creates_from_osd
.cbegin();
5176 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
5177 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
5179 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
5180 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
5181 pg
= pending_creates_from_osd
.erase(pg
);
5182 do_sub_pg_creates
= true;
5185 have_pending_creates
= (pending_creates_from_mon
> 0 ||
5186 !pending_creates_from_osd
.empty());
5189 bool do_renew_subs
= false;
5190 if (do_sub_pg_creates
) {
5191 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
5192 dout(4) << __func__
<< ": resolicit pg creates from mon since "
5193 << last_pg_create_epoch
<< dendl
;
5194 do_renew_subs
= true;
5197 version_t start
= get_osdmap_epoch() + 1;
5198 if (have_pending_creates
) {
5199 // don't miss any new osdmap deleting PGs
5200 if (monc
->sub_want("osdmap", start
, 0)) {
5201 dout(4) << __func__
<< ": resolicit osdmap from mon since "
5203 do_renew_subs
= true;
5205 } else if (do_sub_pg_creates
) {
5206 // no need to subscribe the osdmap continuously anymore
5207 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5208 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
5209 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
5211 do_renew_subs
= true;
5215 if (do_renew_subs
) {
5219 service
.send_pg_temp();
5222 void OSD::build_initial_pg_history(
5225 utime_t created_stamp
,
5229 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
5230 *h
= pg_history_t(created
, created_stamp
);
5232 OSDMapRef lastmap
= service
.get_map(created
);
5233 int up_primary
, acting_primary
;
5234 vector
<int> up
, acting
;
5235 lastmap
->pg_to_up_acting_osds(
5236 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
5238 ostringstream debug
;
5239 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
5240 OSDMapRef osdmap
= service
.get_map(e
);
5241 int new_up_primary
, new_acting_primary
;
5242 vector
<int> new_up
, new_acting
;
5243 osdmap
->pg_to_up_acting_osds(
5244 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
5246 // this is a bit imprecise, but sufficient?
5247 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
5248 const pg_pool_t
*pi
;
5249 bool operator()(const set
<pg_shard_t
> &have
) const {
5250 return have
.size() >= pi
->min_size
;
5252 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
5253 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
5255 bool new_interval
= PastIntervals::check_new_interval(
5262 h
->same_interval_since
,
5263 h
->last_epoch_clean
,
5271 h
->same_interval_since
= e
;
5273 h
->same_up_since
= e
;
5275 if (acting_primary
!= new_acting_primary
) {
5276 h
->same_primary_since
= e
;
5278 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
5279 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5281 h
->last_epoch_split
= e
;
5284 acting
= new_acting
;
5285 up_primary
= new_up_primary
;
5286 acting_primary
= new_acting_primary
;
5290 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5291 dout(10) << __func__
<< " " << *h
<< " " << *pi
5292 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5293 pi
->get_bounds()) << ")"
5297 void OSD::_add_heartbeat_peer(int p
)
5303 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5304 if (i
== heartbeat_peers
.end()) {
5305 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5308 assert(cons
.second
);
5310 hi
= &heartbeat_peers
[p
];
5313 auto stamps
= service
.get_hb_stamps(p
);
5315 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5317 sb
->stamps
= stamps
;
5318 hi
->hb_interval_start
= ceph_clock_now();
5319 hi
->con_back
= cons
.first
.get();
5320 hi
->con_back
->set_priv(sb
);
5322 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5324 sf
->stamps
= stamps
;
5325 hi
->con_front
= cons
.second
.get();
5326 hi
->con_front
->set_priv(sf
);
5328 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5329 << " " << hi
->con_back
->get_peer_addr()
5330 << " " << hi
->con_front
->get_peer_addr()
5335 hi
->epoch
= get_osdmap_epoch();
5338 void OSD::_remove_heartbeat_peer(int n
)
5340 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5341 ceph_assert(q
!= heartbeat_peers
.end());
5342 dout(20) << " removing heartbeat peer osd." << n
5343 << " " << q
->second
.con_back
->get_peer_addr()
5344 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5346 q
->second
.clear_mark_down();
5347 heartbeat_peers
.erase(q
);
5350 void OSD::need_heartbeat_peer_update()
5354 dout(20) << "need_heartbeat_peer_update" << dendl
;
5355 heartbeat_set_peers_need_update();
5358 void OSD::maybe_update_heartbeat_peers()
5360 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5362 if (is_waiting_for_healthy() || is_active()) {
5363 utime_t now
= ceph_clock_now();
5364 if (last_heartbeat_resample
== utime_t()) {
5365 last_heartbeat_resample
= now
;
5366 heartbeat_set_peers_need_update();
5367 } else if (!heartbeat_peers_need_update()) {
5368 utime_t dur
= now
- last_heartbeat_resample
;
5369 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5370 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5371 heartbeat_set_peers_need_update();
5372 last_heartbeat_resample
= now
;
5373 // automatically clean up any stale heartbeat peers
5374 // if we are unhealthy, then clean all
5375 reset_heartbeat_peers(is_waiting_for_healthy());
5380 if (!heartbeat_peers_need_update())
5382 heartbeat_clear_peers_need_update();
5384 std::lock_guard
l(heartbeat_lock
);
5386 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5389 // build heartbeat from set
5393 for (auto& pg
: pgs
) {
5394 pg
->with_heartbeat_peers([&](int peer
) {
5395 if (get_osdmap()->is_up(peer
)) {
5396 _add_heartbeat_peer(peer
);
5402 // include next and previous up osds to ensure we have a fully-connected set
5403 set
<int> want
, extras
;
5404 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5407 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5408 if (prev
>= 0 && prev
!= next
)
5411 // make sure we have at least **min_down** osds coming from different
5412 // subtree level (e.g., hosts) for fast failure detection.
5413 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5414 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5415 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5416 get_osdmap()->get_random_up_osds_by_subtree(
5417 whoami
, subtree
, limit
, want
, &want
);
5419 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5420 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5422 _add_heartbeat_peer(*p
);
5425 // remove down peers; enumerate extras
5426 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5427 while (p
!= heartbeat_peers
.end()) {
5428 if (!get_osdmap()->is_up(p
->first
)) {
5431 _remove_heartbeat_peer(o
);
5434 if (p
->second
.epoch
< get_osdmap_epoch()) {
5435 extras
.insert(p
->first
);
5441 for (int n
= next
; n
>= 0; ) {
5442 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5444 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5445 dout(10) << " adding random peer osd." << n
<< dendl
;
5447 _add_heartbeat_peer(n
);
5449 n
= get_osdmap()->get_next_up_osd_after(n
);
5451 break; // came full circle; stop
5455 for (set
<int>::iterator p
= extras
.begin();
5456 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5460 _remove_heartbeat_peer(*p
);
5463 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5465 // clean up stale failure pending
5466 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5467 if (heartbeat_peers
.count(it
->first
) == 0) {
5468 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5469 failure_pending
.erase(it
++);
5476 void OSD::reset_heartbeat_peers(bool all
)
5478 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5479 dout(10) << "reset_heartbeat_peers" << dendl
;
5480 utime_t stale
= ceph_clock_now();
5481 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5482 std::lock_guard
l(heartbeat_lock
);
5483 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5484 auto& [peer
, hi
] = *it
;
5485 if (all
|| hi
.is_stale(stale
)) {
5486 hi
.clear_mark_down();
5487 // stop sending failure_report to mon too
5488 failure_queue
.erase(peer
);
5489 failure_pending
.erase(peer
);
5490 it
= heartbeat_peers
.erase(it
);
5497 void OSD::handle_osd_ping(MOSDPing
*m
)
5499 if (superblock
.cluster_fsid
!= m
->fsid
) {
5500 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5501 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5507 int from
= m
->get_source().num();
5509 heartbeat_lock
.lock();
5510 if (is_stopping()) {
5511 heartbeat_lock
.unlock();
5516 utime_t now
= ceph_clock_now();
5517 auto mnow
= service
.get_mnow();
5518 ConnectionRef
con(m
->get_connection());
5519 OSDMapRef curmap
= service
.get_osdmap();
5521 heartbeat_lock
.unlock();
5526 auto sref
= con
->get_priv();
5527 Session
*s
= static_cast<Session
*>(sref
.get());
5529 heartbeat_lock
.unlock();
5535 s
->stamps
= service
.get_hb_stamps(from
);
5540 case MOSDPing::PING
:
5542 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5543 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5544 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5545 if (heartbeat_drop
->second
== 0) {
5546 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5548 --heartbeat_drop
->second
;
5549 dout(5) << "Dropping heartbeat from " << from
5550 << ", " << heartbeat_drop
->second
5551 << " remaining to drop" << dendl
;
5554 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5555 ((((double)(rand()%100))/100.0))) {
5557 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5558 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5559 dout(5) << "Dropping heartbeat from " << from
5560 << ", " << heartbeat_drop
->second
5561 << " remaining to drop" << dendl
;
5566 ceph::signedspan sender_delta_ub
{};
5567 s
->stamps
->got_ping(
5573 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5575 if (!cct
->get_heartbeat_map()->is_healthy()) {
5576 dout(10) << "internal heartbeat not healthy, dropping ping request"
5581 Message
*r
= new MOSDPing(monc
->get_fsid(),
5582 curmap
->get_epoch(),
5583 MOSDPing::PING_REPLY
,
5587 service
.get_up_epoch(),
5588 cct
->_conf
->osd_heartbeat_min_size
,
5590 con
->send_message(r
);
5592 if (curmap
->is_up(from
)) {
5594 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5595 from
, curmap
->get_epoch());
5597 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5600 } else if (!curmap
->exists(from
) ||
5601 curmap
->get_down_at(from
) > m
->map_epoch
) {
5602 // tell them they have died
5603 Message
*r
= new MOSDPing(monc
->get_fsid(),
5604 curmap
->get_epoch(),
5609 service
.get_up_epoch(),
5610 cct
->_conf
->osd_heartbeat_min_size
);
5611 con
->send_message(r
);
5616 case MOSDPing::PING_REPLY
:
5618 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5619 if (i
!= heartbeat_peers
.end()) {
5620 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5621 if (acked
!= i
->second
.ping_history
.end()) {
5622 int &unacknowledged
= acked
->second
.second
;
5623 if (con
== i
->second
.con_back
) {
5624 dout(25) << "handle_osd_ping got reply from osd." << from
5625 << " first_tx " << i
->second
.first_tx
5626 << " last_tx " << i
->second
.last_tx
5627 << " last_rx_back " << i
->second
.last_rx_back
5629 << " last_rx_front " << i
->second
.last_rx_front
5631 i
->second
.last_rx_back
= now
;
5632 ceph_assert(unacknowledged
> 0);
5634 // if there is no front con, set both stamps.
5635 if (i
->second
.con_front
== NULL
) {
5636 i
->second
.last_rx_front
= now
;
5637 ceph_assert(unacknowledged
> 0);
5640 } else if (con
== i
->second
.con_front
) {
5641 dout(25) << "handle_osd_ping got reply from osd." << from
5642 << " first_tx " << i
->second
.first_tx
5643 << " last_tx " << i
->second
.last_tx
5644 << " last_rx_back " << i
->second
.last_rx_back
5645 << " last_rx_front " << i
->second
.last_rx_front
5648 i
->second
.last_rx_front
= now
;
5649 ceph_assert(unacknowledged
> 0);
5653 if (unacknowledged
== 0) {
5654 // succeeded in getting all replies
5655 dout(25) << "handle_osd_ping got all replies from osd." << from
5656 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5657 << " and older pending ping(s)"
5660 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5661 ++i
->second
.hb_average_count
;
5662 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5663 i
->second
.hb_total_back
+= back_pingtime
;
5664 if (back_pingtime
< i
->second
.hb_min_back
)
5665 i
->second
.hb_min_back
= back_pingtime
;
5666 if (back_pingtime
> i
->second
.hb_max_back
)
5667 i
->second
.hb_max_back
= back_pingtime
;
5668 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5669 i
->second
.hb_total_front
+= front_pingtime
;
5670 if (front_pingtime
< i
->second
.hb_min_front
)
5671 i
->second
.hb_min_front
= front_pingtime
;
5672 if (front_pingtime
> i
->second
.hb_max_front
)
5673 i
->second
.hb_max_front
= front_pingtime
;
5675 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5676 if (i
->second
.hb_interval_start
== utime_t())
5677 i
->second
.hb_interval_start
= now
;
5678 int64_t hb_avg_time_period
= 60;
5679 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5680 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5682 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5683 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5684 uint32_t back_min
= i
->second
.hb_min_back
;
5685 uint32_t back_max
= i
->second
.hb_max_back
;
5686 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5687 uint32_t front_min
= i
->second
.hb_min_front
;
5688 uint32_t front_max
= i
->second
.hb_max_front
;
5690 // Reset for new interval
5691 i
->second
.hb_average_count
= 0;
5692 i
->second
.hb_interval_start
= now
;
5693 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5694 i
->second
.hb_min_back
= UINT_MAX
;
5695 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5696 i
->second
.hb_min_front
= UINT_MAX
;
5698 // Record per osd interace ping times
5699 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5700 if (i
->second
.hb_back_pingtime
.size() == 0) {
5701 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5702 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5703 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5704 i
->second
.hb_back_min
.push_back(back_min
);
5705 i
->second
.hb_back_max
.push_back(back_max
);
5706 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5707 i
->second
.hb_front_min
.push_back(front_min
);
5708 i
->second
.hb_front_max
.push_back(front_max
);
5709 ++i
->second
.hb_index
;
5712 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5713 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5714 i
->second
.hb_back_min
[index
] = back_min
;
5715 i
->second
.hb_back_max
[index
] = back_max
;
5716 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5717 i
->second
.hb_front_min
[index
] = front_min
;
5718 i
->second
.hb_front_max
[index
] = front_max
;
5719 ++i
->second
.hb_index
;
5723 std::lock_guard
l(service
.stat_lock
);
5724 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5725 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5728 uint32_t min
= UINT_MAX
;
5732 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5733 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5735 int index
= (i
->second
.hb_index
+ k
) % size
;
5736 total
+= i
->second
.hb_back_pingtime
[index
];
5737 if (i
->second
.hb_back_min
[index
] < min
)
5738 min
= i
->second
.hb_back_min
[index
];
5739 if (i
->second
.hb_back_max
[index
] > max
)
5740 max
= i
->second
.hb_back_max
[index
];
5741 if (count
== 1 || count
== 5 || count
== 15) {
5742 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5743 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5744 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5751 if (i
->second
.con_front
!= NULL
) {
5752 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5759 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5761 int index
= (i
->second
.hb_index
+ k
) % size
;
5762 total
+= i
->second
.hb_front_pingtime
[index
];
5763 if (i
->second
.hb_front_min
[index
] < min
)
5764 min
= i
->second
.hb_front_min
[index
];
5765 if (i
->second
.hb_front_max
[index
] > max
)
5766 max
= i
->second
.hb_front_max
[index
];
5767 if (count
== 1 || count
== 5 || count
== 15) {
5768 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5769 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5770 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5779 std::lock_guard
l(service
.stat_lock
);
5780 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5781 if (i
->second
.con_front
!= NULL
)
5782 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5784 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5787 if (i
->second
.is_healthy(now
)) {
5788 // Cancel false reports
5789 auto failure_queue_entry
= failure_queue
.find(from
);
5790 if (failure_queue_entry
!= failure_queue
.end()) {
5791 dout(10) << "handle_osd_ping canceling queued "
5792 << "failure report for osd." << from
<< dendl
;
5793 failure_queue
.erase(failure_queue_entry
);
5796 auto failure_pending_entry
= failure_pending
.find(from
);
5797 if (failure_pending_entry
!= failure_pending
.end()) {
5798 dout(10) << "handle_osd_ping canceling in-flight "
5799 << "failure report for osd." << from
<< dendl
;
5800 send_still_alive(curmap
->get_epoch(),
5802 failure_pending_entry
->second
.second
);
5803 failure_pending
.erase(failure_pending_entry
);
5807 // old replies, deprecated by newly sent pings.
5808 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5809 << ") is found, treat as covered by newly sent pings "
5816 curmap
->is_up(from
)) {
5818 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5819 from
, curmap
->get_epoch());
5821 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5826 s
->stamps
->got_ping_reply(
5830 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5834 case MOSDPing::YOU_DIED
:
5835 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5836 << " says i am down in " << m
->map_epoch
<< dendl
;
5837 osdmap_subscribe(curmap
->get_epoch()+1, false);
5841 heartbeat_lock
.unlock();
5845 void OSD::heartbeat_entry()
5847 std::unique_lock
l(heartbeat_lock
);
5850 while (!heartbeat_stop
) {
5854 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5855 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5857 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5859 auto w
= ceph::make_timespan(wait
);
5860 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5861 heartbeat_cond
.wait_for(l
, w
);
5864 dout(30) << "heartbeat_entry woke up" << dendl
;
5868 void OSD::heartbeat_check()
5870 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5871 utime_t now
= ceph_clock_now();
5873 // check for incoming heartbeats (move me elsewhere?)
5874 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5875 p
!= heartbeat_peers
.end();
5878 if (p
->second
.first_tx
== utime_t()) {
5879 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5880 << " yet, skipping" << dendl
;
5884 dout(25) << "heartbeat_check osd." << p
->first
5885 << " first_tx " << p
->second
.first_tx
5886 << " last_tx " << p
->second
.last_tx
5887 << " last_rx_back " << p
->second
.last_rx_back
5888 << " last_rx_front " << p
->second
.last_rx_front
5890 if (p
->second
.is_unhealthy(now
)) {
5891 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5892 if (p
->second
.last_rx_back
== utime_t() ||
5893 p
->second
.last_rx_front
== utime_t()) {
5894 derr
<< "heartbeat_check: no reply from "
5895 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5896 << " osd." << p
->first
5897 << " ever on either front or back, first ping sent "
5898 << p
->second
.first_tx
5899 << " (oldest deadline " << oldest_deadline
<< ")"
5902 failure_queue
[p
->first
] = p
->second
.first_tx
;
5904 derr
<< "heartbeat_check: no reply from "
5905 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5906 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5907 << " front " << p
->second
.last_rx_front
5908 << " (oldest deadline " << oldest_deadline
<< ")"
5911 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5917 void OSD::heartbeat()
5919 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5920 dout(30) << "heartbeat" << dendl
;
5922 auto load_for_logger
= service
.get_scrub_services().update_load_average();
5923 if (load_for_logger
) {
5924 logger
->set(l_osd_loadavg
, load_for_logger
.value());
5926 dout(30) << "heartbeat checking stats" << dendl
;
5928 // refresh peer list and osd stats
5929 vector
<int> hb_peers
;
5930 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5931 p
!= heartbeat_peers
.end();
5933 hb_peers
.push_back(p
->first
);
5935 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5936 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5937 ceph_assert(new_stat
.statfs
.total
);
5940 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5942 service
.check_full_status(ratio
, pratio
);
5944 utime_t now
= ceph_clock_now();
5945 auto mnow
= service
.get_mnow();
5946 utime_t deadline
= now
;
5947 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5950 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5951 i
!= heartbeat_peers
.end();
5953 int peer
= i
->first
;
5954 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5956 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
5959 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5961 i
->second
.last_tx
= now
;
5962 if (i
->second
.first_tx
== utime_t())
5963 i
->second
.first_tx
= now
;
5964 i
->second
.ping_history
[now
] = make_pair(deadline
,
5965 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5966 if (i
->second
.hb_interval_start
== utime_t())
5967 i
->second
.hb_interval_start
= now
;
5969 std::optional
<ceph::signedspan
> delta_ub
;
5970 s
->stamps
->sent_ping(&delta_ub
);
5972 i
->second
.con_back
->send_message(
5973 new MOSDPing(monc
->get_fsid(),
5974 service
.get_osdmap_epoch(),
5979 service
.get_up_epoch(),
5980 cct
->_conf
->osd_heartbeat_min_size
,
5983 if (i
->second
.con_front
)
5984 i
->second
.con_front
->send_message(
5985 new MOSDPing(monc
->get_fsid(),
5986 service
.get_osdmap_epoch(),
5991 service
.get_up_epoch(),
5992 cct
->_conf
->osd_heartbeat_min_size
,
5996 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5998 // hmm.. am i all alone?
5999 dout(30) << "heartbeat lonely?" << dendl
;
6000 if (heartbeat_peers
.empty()) {
6001 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
6002 last_mon_heartbeat
= now
;
6003 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
6004 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6008 dout(30) << "heartbeat done" << dendl
;
6011 bool OSD::heartbeat_reset(Connection
*con
)
6013 std::lock_guard
l(heartbeat_lock
);
6014 auto s
= con
->get_priv();
6015 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
6016 con
->set_priv(nullptr);
6018 if (is_stopping()) {
6021 auto session
= static_cast<Session
*>(s
.get());
6022 auto p
= heartbeat_peers
.find(session
->peer
);
6023 if (p
!= heartbeat_peers
.end() &&
6024 (p
->second
.con_back
== con
||
6025 p
->second
.con_front
== con
)) {
6026 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
6027 << ", reopening" << dendl
;
6028 p
->second
.clear_mark_down(con
);
6029 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
6031 p
->second
.con_back
= newcon
.first
.get();
6032 p
->second
.con_back
->set_priv(s
);
6033 if (newcon
.second
) {
6034 p
->second
.con_front
= newcon
.second
.get();
6035 p
->second
.con_front
->set_priv(s
);
6037 p
->second
.ping_history
.clear();
6039 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
6040 << ", raced with osdmap update, closing out peer" << dendl
;
6041 heartbeat_peers
.erase(p
);
6044 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
6052 // =========================================
6056 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6057 dout(10) << "tick" << dendl
;
6059 utime_t now
= ceph_clock_now();
6060 // throw out any obsolete markdown log
6061 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
6062 while (!osd_markdown_log
.empty() &&
6063 osd_markdown_log
.front() + grace
< now
)
6064 osd_markdown_log
.pop_front();
6066 if (is_active() || is_waiting_for_healthy()) {
6067 maybe_update_heartbeat_peers();
6070 if (is_waiting_for_healthy()) {
6074 if (is_waiting_for_healthy() || is_booting()) {
6075 std::lock_guard
l(heartbeat_lock
);
6076 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
6077 last_mon_heartbeat
= now
;
6078 dout(1) << __func__
<< " checking mon for new map" << dendl
;
6079 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6085 // scrub purged_snaps every deep scrub interval
6087 const utime_t last
= superblock
.last_purged_snaps_scrub
;
6088 utime_t next
= last
;
6089 next
+= cct
->_conf
->osd_scrub_min_interval
;
6091 // use a seed that is stable for each scrub interval, but varies
6092 // by OSD to avoid any herds.
6093 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
6094 double r
= (rng() % 1024) / 1024.0;
6096 cct
->_conf
->osd_scrub_min_interval
*
6097 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
6098 if (next
< ceph_clock_now()) {
6099 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
6100 << " next " << next
<< " ... now" << dendl
;
6101 scrub_purged_snaps();
6103 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
6104 << " next " << next
<< dendl
;
6108 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
6111 void OSD::tick_without_osd_lock()
6113 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
6114 dout(10) << "tick_without_osd_lock" << dendl
;
6116 logger
->set(l_osd_cached_crc
, ceph::buffer::get_cached_crc());
6117 logger
->set(l_osd_cached_crc_adjusted
, ceph::buffer::get_cached_crc_adjusted());
6118 logger
->set(l_osd_missed_crc
, ceph::buffer::get_missed_crc());
6120 // refresh osd stats
6121 struct store_statfs_t stbuf
;
6122 osd_alert_list_t alerts
;
6123 int r
= store
->statfs(&stbuf
, &alerts
);
6124 ceph_assert(r
== 0);
6125 service
.set_statfs(stbuf
, alerts
);
6127 // osd_lock is not being held, which means the OSD state
6128 // might change when doing the monitor report
6129 if (is_active() || is_waiting_for_healthy()) {
6131 std::lock_guard l
{heartbeat_lock
};
6134 map_lock
.lock_shared();
6135 std::lock_guard
l(mon_report_lock
);
6138 utime_t now
= ceph_clock_now();
6139 if (service
.need_fullness_update() ||
6140 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
6141 last_mon_report
= now
;
6145 map_lock
.unlock_shared();
6147 epoch_t max_waiting_epoch
= 0;
6148 for (auto s
: shards
) {
6149 max_waiting_epoch
= std::max(max_waiting_epoch
,
6150 s
->get_max_waiting_epoch());
6152 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
6153 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
6154 << ", requesting new map" << dendl
;
6155 osdmap_subscribe(superblock
.newest_map
+ 1, false);
6160 if (!scrub_random_backoff()) {
6163 service
.promote_throttle_recalibrate();
6164 resume_creating_pg();
6165 bool need_send_beacon
= false;
6166 const auto now
= ceph::coarse_mono_clock::now();
6168 // borrow lec lock to pretect last_sent_beacon from changing
6169 std::lock_guard l
{min_last_epoch_clean_lock
};
6170 const auto elapsed
= now
- last_sent_beacon
;
6171 if (std::chrono::duration_cast
<std::chrono::seconds
>(elapsed
).count() >
6172 cct
->_conf
->osd_beacon_report_interval
) {
6173 need_send_beacon
= true;
6176 if (need_send_beacon
) {
6181 mgrc
.update_daemon_health(get_health_metrics());
6182 service
.kick_recovery_queue();
6183 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
6184 new C_Tick_WithoutOSDLock(this));
6188 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6189 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6190 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6191 // getomap <pool> [namespace/]<obj-name>
6192 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6193 // injectmdataerr [namespace/]<obj-name> [shardid]
6194 // injectdataerr [namespace/]<obj-name> [shardid]
6196 // set_recovery_delay [utime]
6197 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
6198 std::string_view command
,
6199 const cmdmap_t
& cmdmap
, ostream
&ss
)
6202 //Support changing the omap on a single osd by using the Admin Socket to
6203 //directly request the osd make a change.
6204 if (command
== "setomapval" || command
== "rmomapkey" ||
6205 command
== "setomapheader" || command
== "getomap" ||
6206 command
== "truncobj" || command
== "injectmdataerr" ||
6207 command
== "injectdataerr"
6211 OSDMapRef curmap
= service
->get_osdmap();
6216 cmd_getval(cmdmap
, "pool", poolstr
);
6217 pool
= curmap
->lookup_pg_pool_name(poolstr
);
6218 //If we can't find it by name then maybe id specified
6219 if (pool
< 0 && isdigit(poolstr
[0]))
6220 pool
= atoll(poolstr
.c_str());
6222 ss
<< "Invalid pool '" << poolstr
<< "''";
6226 string objname
, nspace
;
6227 cmd_getval(cmdmap
, "objname", objname
);
6228 std::size_t found
= objname
.find_first_of('/');
6229 if (found
!= string::npos
) {
6230 nspace
= objname
.substr(0, found
);
6231 objname
= objname
.substr(found
+1);
6233 object_locator_t
oloc(pool
, nspace
);
6234 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
6237 ss
<< "Invalid namespace/objname";
6241 int64_t shardid
= cmd_getval_or
<int64_t>(cmdmap
, "shardid", shard_id_t::NO_SHARD
);
6242 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
6243 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
6244 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
6245 if (curmap
->pg_is_ec(rawpg
)) {
6246 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
6247 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
6252 ObjectStore::Transaction t
;
6254 if (command
== "setomapval") {
6255 map
<string
, bufferlist
> newattrs
;
6258 cmd_getval(cmdmap
, "key", key
);
6259 cmd_getval(cmdmap
, "val", valstr
);
6262 newattrs
[key
] = val
;
6263 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
6264 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6266 ss
<< "error=" << r
;
6269 } else if (command
== "rmomapkey") {
6271 cmd_getval(cmdmap
, "key", key
);
6273 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6274 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6276 ss
<< "error=" << r
;
6279 } else if (command
== "setomapheader") {
6280 bufferlist newheader
;
6283 cmd_getval(cmdmap
, "header", headerstr
);
6284 newheader
.append(headerstr
);
6285 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6286 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6288 ss
<< "error=" << r
;
6291 } else if (command
== "getomap") {
6292 //Debug: Output entire omap
6294 map
<string
, bufferlist
> keyvals
;
6295 auto ch
= store
->open_collection(coll_t(pgid
));
6297 ss
<< "unable to open collection for " << pgid
;
6300 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6302 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6303 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6304 it
!= keyvals
.end(); ++it
)
6305 ss
<< " key=" << (*it
).first
<< " val="
6306 << string((*it
).second
.c_str(), (*it
).second
.length());
6308 ss
<< "error=" << r
;
6311 } else if (command
== "truncobj") {
6313 cmd_getval(cmdmap
, "len", trunclen
);
6314 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6315 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6317 ss
<< "error=" << r
;
6320 } else if (command
== "injectdataerr") {
6321 store
->inject_data_error(gobj
);
6323 } else if (command
== "injectmdataerr") {
6324 store
->inject_mdata_error(gobj
);
6329 if (command
== "set_recovery_delay") {
6330 int64_t delay
= cmd_getval_or
<int64_t>(cmdmap
, "utime", 0);
6333 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6336 ss
<< "set_recovery_delay: error setting "
6337 << "osd_recovery_delay_start to '" << delay
<< "': error "
6341 service
->cct
->_conf
.apply_changes(nullptr);
6342 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6343 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6346 if (command
== "injectfull") {
6347 int64_t count
= cmd_getval_or
<int64_t>(cmdmap
, "count", -1);
6348 string type
= cmd_getval_or
<string
>(cmdmap
, "type", "full");
6349 OSDService::s_names state
;
6351 if (type
== "none" || count
== 0) {
6355 state
= service
->get_full_state(type
);
6356 if (state
== OSDService::s_names::INVALID
) {
6357 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6360 service
->set_injectfull(state
, count
);
6363 ss
<< "Internal error - command=" << command
;
6366 // =========================================
6368 void OSD::ms_handle_connect(Connection
*con
)
6370 dout(10) << __func__
<< " con " << con
<< dendl
;
6371 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6372 std::lock_guard
l(osd_lock
);
6375 dout(10) << __func__
<< " on mon" << dendl
;
6379 } else if (is_booting()) {
6380 _send_boot(); // resend boot message
6382 map_lock
.lock_shared();
6383 std::lock_guard
l2(mon_report_lock
);
6385 utime_t now
= ceph_clock_now();
6386 last_mon_report
= now
;
6388 // resend everything, it's a new session
6391 service
.requeue_pg_temp();
6392 service
.clear_sent_ready_to_merge();
6393 service
.send_pg_temp();
6394 service
.send_ready_to_merge();
6395 service
.send_pg_created();
6399 map_lock
.unlock_shared();
6401 send_beacon(ceph::coarse_mono_clock::now());
6405 // full map requests may happen while active or pre-boot
6406 if (requested_full_first
) {
6407 rerequest_full_maps();
6412 void OSD::ms_handle_fast_connect(Connection
*con
)
6414 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6415 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6416 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6417 s
= ceph::make_ref
<Session
>(cct
, con
);
6419 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6420 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6421 // we don't connect to clients
6422 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6423 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6428 void OSD::ms_handle_fast_accept(Connection
*con
)
6430 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6431 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6432 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6433 s
= ceph::make_ref
<Session
>(cct
, con
);
6435 dout(10) << "new session (incoming)" << s
<< " con=" << con
6436 << " addr=" << con
->get_peer_addr()
6437 << " must have raced with connect" << dendl
;
6438 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6439 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6444 bool OSD::ms_handle_reset(Connection
*con
)
6446 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6447 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6450 session
->wstate
.reset(con
);
6451 session
->con
->set_priv(nullptr);
6452 session
->con
.reset(); // break con <-> session ref cycle
6453 // note that we break session->con *before* the session_handle_reset
6454 // cleanup below. this avoids a race between us and
6455 // PG::add_backoff, Session::check_backoff, etc.
6456 session_handle_reset(session
);
6460 bool OSD::ms_handle_refused(Connection
*con
)
6462 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6465 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6466 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6469 int type
= con
->get_peer_type();
6470 // handle only OSD failures here
6471 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6472 OSDMapRef osdmap
= get_osdmap();
6474 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6475 if (id
>= 0 && osdmap
->is_up(id
)) {
6476 // I'm cheating mon heartbeat grace logic, because we know it's not going
6477 // to respawn alone. +1 so we won't hit any boundary case.
6478 monc
->send_mon_message(
6482 osdmap
->get_addrs(id
),
6483 cct
->_conf
->osd_heartbeat_grace
+ 1,
6484 osdmap
->get_epoch(),
6485 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6493 struct CB_OSD_GetVersion
{
6495 explicit CB_OSD_GetVersion(OSD
*o
) : osd(o
) {}
6496 void operator ()(boost::system::error_code ec
, version_t newest
,
6499 osd
->_got_mon_epochs(oldest
, newest
);
6503 void OSD::start_boot()
6505 if (!_is_healthy()) {
6506 // if we are not healthy, do not mark ourselves up (yet)
6507 dout(1) << "not healthy; waiting to boot" << dendl
;
6508 if (!is_waiting_for_healthy())
6509 start_waiting_for_healthy();
6510 // send pings sooner rather than later
6514 dout(1) << __func__
<< dendl
;
6515 set_state(STATE_PREBOOT
);
6516 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6517 << ".." << superblock
.newest_map
<< dendl
;
6518 monc
->get_version("osdmap", CB_OSD_GetVersion(this));
6521 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6523 std::lock_guard
l(osd_lock
);
6525 _preboot(oldest
, newest
);
6529 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6531 ceph_assert(is_preboot());
6532 dout(10) << __func__
<< " _preboot mon has osdmaps "
6533 << oldest
<< ".." << newest
<< dendl
;
6535 // ensure our local fullness awareness is accurate
6537 std::lock_guard
l(heartbeat_lock
);
6541 const auto& monmap
= monc
->monmap
;
6542 const auto osdmap
= get_osdmap();
6543 // if our map within recent history, try to add ourselves to the osdmap.
6544 if (osdmap
->get_epoch() == 0) {
6545 derr
<< "waiting for initial osdmap" << dendl
;
6546 } else if (osdmap
->is_destroyed(whoami
)) {
6547 derr
<< "osdmap says I am destroyed" << dendl
;
6548 // provide a small margin so we don't livelock seeing if we
6549 // un-destroyed ourselves.
6550 if (osdmap
->get_epoch() > newest
- 1) {
6553 } else if (osdmap
->is_noup(whoami
)) {
6554 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6555 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6556 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6558 } else if (service
.need_fullness_update()) {
6559 derr
<< "osdmap fullness state needs update" << dendl
;
6561 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6562 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6563 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6564 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6565 _get_purged_snaps();
6566 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6567 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6569 // wait for pgs to fully catch up in a different thread, since
6570 // this thread might be required for splitting and merging PGs to
6572 boot_finisher
.queue(
6575 std::unique_lock
l(osd_lock
);
6577 dout(10) << __func__
<< " waiting for peering work to drain"
6580 for (auto shard
: shards
) {
6581 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6592 // get all the latest maps
6593 if (osdmap
->get_epoch() + 1 >= oldest
)
6594 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6596 osdmap_subscribe(oldest
- 1, true);
6599 void OSD::_get_purged_snaps()
6601 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6602 // overlapping requests to the mon, which will be somewhat inefficient, but
6603 // it should be reliable.
6604 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6605 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6606 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6607 superblock
.purged_snaps_last
+ 1,
6608 superblock
.current_epoch
+ 1);
6609 monc
->send_mon_message(m
);
6612 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6614 dout(10) << __func__
<< " " << *m
<< dendl
;
6615 ObjectStore::Transaction t
;
6616 if (!is_preboot() ||
6617 m
->last
< superblock
.purged_snaps_last
) {
6620 SnapMapper::record_purged_snaps(cct
, store
.get(), service
.meta_ch
,
6621 make_purged_snaps_oid(), &t
,
6623 superblock
.purged_snaps_last
= m
->last
;
6624 write_superblock(t
);
6625 store
->queue_transaction(
6628 service
.publish_superblock(superblock
);
6629 if (m
->last
< superblock
.current_epoch
) {
6630 _get_purged_snaps();
6638 void OSD::send_full_update()
6640 if (!service
.need_fullness_update())
6643 if (service
.is_full()) {
6644 state
= CEPH_OSD_FULL
;
6645 } else if (service
.is_backfillfull()) {
6646 state
= CEPH_OSD_BACKFILLFULL
;
6647 } else if (service
.is_nearfull()) {
6648 state
= CEPH_OSD_NEARFULL
;
6651 OSDMap::calc_state_set(state
, s
);
6652 dout(10) << __func__
<< " want state " << s
<< dendl
;
6653 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6656 void OSD::start_waiting_for_healthy()
6658 dout(1) << "start_waiting_for_healthy" << dendl
;
6659 set_state(STATE_WAITING_FOR_HEALTHY
);
6660 last_heartbeat_resample
= utime_t();
6662 // subscribe to osdmap updates, in case our peers really are known to be dead
6663 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6666 bool OSD::_is_healthy()
6668 if (!cct
->get_heartbeat_map()->is_healthy()) {
6669 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6673 if (is_waiting_for_healthy()) {
6674 utime_t now
= ceph_clock_now();
6675 if (osd_markdown_log
.empty()) {
6676 dout(5) << __func__
<< " force returning true since last markdown"
6677 << " was " << cct
->_conf
->osd_max_markdown_period
6678 << "s ago" << dendl
;
6681 std::lock_guard
l(heartbeat_lock
);
6682 int num
= 0, up
= 0;
6683 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6684 p
!= heartbeat_peers
.end();
6686 if (p
->second
.is_healthy(now
))
6690 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6691 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6692 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6700 void OSD::_send_boot()
6702 dout(10) << "_send_boot" << dendl
;
6703 Connection
*local_connection
=
6704 cluster_messenger
->get_loopback_connection().get();
6705 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6706 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6707 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6708 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6710 dout(20) << " initial client_addrs " << client_addrs
6711 << ", cluster_addrs " << cluster_addrs
6712 << ", hb_back_addrs " << hb_back_addrs
6713 << ", hb_front_addrs " << hb_front_addrs
6715 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6716 dout(10) << " assuming cluster_addrs match client_addrs "
6717 << client_addrs
<< dendl
;
6718 cluster_addrs
= cluster_messenger
->get_myaddrs();
6720 if (auto session
= local_connection
->get_priv(); !session
) {
6721 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6724 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6725 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6726 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6727 << cluster_addrs
<< dendl
;
6728 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6730 if (auto session
= local_connection
->get_priv(); !session
) {
6731 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6734 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6735 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6736 dout(10) << " assuming hb_front_addrs match client_addrs "
6737 << client_addrs
<< dendl
;
6738 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6740 if (auto session
= local_connection
->get_priv(); !session
) {
6741 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6744 // we now know what our front and back addrs will be, and we are
6745 // about to tell the mon what our metadata (including numa bindings)
6746 // are, so now is a good time!
6747 set_numa_affinity();
6749 MOSDBoot
*mboot
= new MOSDBoot(
6750 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6751 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6753 dout(10) << " final client_addrs " << client_addrs
6754 << ", cluster_addrs " << cluster_addrs
6755 << ", hb_back_addrs " << hb_back_addrs
6756 << ", hb_front_addrs " << hb_front_addrs
6758 _collect_metadata(&mboot
->metadata
);
6759 monc
->send_mon_message(mboot
);
6760 set_state(STATE_BOOTING
);
6763 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6766 (*pm
)["osd_data"] = dev_path
;
6767 if (store
->get_type() == "filestore") {
6768 // not applicable for bluestore
6769 (*pm
)["osd_journal"] = journal_path
;
6771 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6772 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6773 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6774 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6777 (*pm
)["osd_objectstore"] = store
->get_type();
6778 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6779 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6780 (*pm
)["default_device_class"] = store
->get_default_device_class();
6781 string osdspec_affinity
;
6782 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6783 if (r
< 0 || osdspec_affinity
.empty()) {
6784 osdspec_affinity
= "";
6786 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6787 store
->collect_metadata(pm
);
6789 collect_sys_info(pm
, cct
);
6791 (*pm
)["front_iface"] = pick_iface(
6793 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6794 (*pm
)["back_iface"] = pick_iface(
6796 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6802 set
<string
> unknown
;
6803 for (auto nm
: { "front_iface", "back_iface" }) {
6804 if (!(*pm
)[nm
].size()) {
6809 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6811 unknown
.insert((*pm
)[nm
]);
6819 if (unknown
.size()) {
6820 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6822 if (!nodes
.empty()) {
6823 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6825 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6826 (*pm
)["network_numa_node"] = stringify(node
);
6830 if (numa_node
>= 0) {
6831 (*pm
)["numa_node"] = stringify(numa_node
);
6832 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6836 set
<string
> devnames
;
6837 store
->get_devices(&devnames
);
6838 map
<string
,string
> errs
;
6839 get_device_metadata(devnames
, pm
, &errs
);
6840 for (auto& i
: errs
) {
6841 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6843 dout(10) << __func__
<< " " << *pm
<< dendl
;
6846 void OSD::queue_want_up_thru(epoch_t want
)
6848 std::shared_lock map_locker
{map_lock
};
6849 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6850 std::lock_guard
report_locker(mon_report_lock
);
6851 if (want
> up_thru_wanted
) {
6852 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6853 << ", currently " << cur
6855 up_thru_wanted
= want
;
6858 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6859 << ", currently " << cur
6864 void OSD::send_alive()
6866 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6867 const auto osdmap
= get_osdmap();
6868 if (!osdmap
->exists(whoami
))
6870 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6871 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6872 if (up_thru_wanted
> up_thru
) {
6873 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6874 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6878 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6880 dout(10) << __func__
<< " " << first
<< ".." << last
6881 << ", previously requested "
6882 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6883 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6884 ceph_assert(first
> 0 && last
> 0);
6885 ceph_assert(first
<= last
);
6886 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6887 if (requested_full_first
== 0) {
6889 requested_full_first
= first
;
6890 requested_full_last
= last
;
6891 } else if (last
<= requested_full_last
) {
6895 // additional request
6896 first
= requested_full_last
+ 1;
6897 requested_full_last
= last
;
6899 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6900 req
->request_full(first
, last
);
6901 monc
->send_mon_message(req
);
6904 void OSD::got_full_map(epoch_t e
)
6906 ceph_assert(requested_full_first
<= requested_full_last
);
6907 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6908 if (requested_full_first
== 0) {
6909 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6912 if (e
< requested_full_first
) {
6913 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6914 << ".." << requested_full_last
6915 << ", ignoring" << dendl
;
6918 if (e
>= requested_full_last
) {
6919 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6920 << ".." << requested_full_last
<< ", resetting" << dendl
;
6921 requested_full_first
= requested_full_last
= 0;
6925 requested_full_first
= e
+ 1;
6927 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6928 << ".." << requested_full_last
6929 << ", still need more" << dendl
;
6932 void OSD::requeue_failures()
6934 std::lock_guard
l(heartbeat_lock
);
6935 unsigned old_queue
= failure_queue
.size();
6936 unsigned old_pending
= failure_pending
.size();
6937 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6938 failure_queue
[p
->first
] = p
->second
.first
;
6939 failure_pending
.erase(p
++);
6941 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6942 << failure_queue
.size() << dendl
;
6945 void OSD::send_failures()
6947 ceph_assert(ceph_mutex_is_locked(map_lock
));
6948 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6949 std::lock_guard
l(heartbeat_lock
);
6950 utime_t now
= ceph_clock_now();
6951 const auto osdmap
= get_osdmap();
6952 while (!failure_queue
.empty()) {
6953 int osd
= failure_queue
.begin()->first
;
6954 if (!failure_pending
.count(osd
)) {
6955 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6956 monc
->send_mon_message(
6960 osdmap
->get_addrs(osd
),
6962 osdmap
->get_epoch()));
6963 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6964 osdmap
->get_addrs(osd
));
6966 failure_queue
.erase(osd
);
6970 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6972 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6973 MOSDFailure::FLAG_ALIVE
);
6974 monc
->send_mon_message(m
);
6977 void OSD::cancel_pending_failures()
6979 std::lock_guard
l(heartbeat_lock
);
6980 auto it
= failure_pending
.begin();
6981 while (it
!= failure_pending
.end()) {
6982 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6983 << it
->first
<< dendl
;
6984 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6985 failure_pending
.erase(it
++);
6989 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6991 const auto& monmap
= monc
->monmap
;
6992 // send beacon to mon even if we are just connected, and the monmap is not
6993 // initialized yet by then.
6994 if (monmap
.epoch
> 0 &&
6995 monmap
.get_required_features().contains_all(
6996 ceph::features::mon::FEATURE_LUMINOUS
)) {
6997 dout(20) << __func__
<< " sending" << dendl
;
6998 MOSDBeacon
* beacon
= nullptr;
7000 std::lock_guard l
{min_last_epoch_clean_lock
};
7001 beacon
= new MOSDBeacon(get_osdmap_epoch(),
7002 min_last_epoch_clean
,
7003 superblock
.last_purged_snaps_scrub
,
7004 cct
->_conf
->osd_beacon_report_interval
);
7005 beacon
->pgs
= min_last_epoch_clean_pgs
;
7006 last_sent_beacon
= now
;
7008 monc
->send_mon_message(beacon
);
7010 dout(20) << __func__
<< " not sending" << dendl
;
7014 void OSD::handle_command(MCommand
*m
)
7016 ConnectionRef con
= m
->get_connection();
7017 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7019 con
->send_message(new MCommandReply(m
, -EACCES
));
7023 if (!session
->caps
.allow_all()) {
7024 con
->send_message(new MCommandReply(m
, -EACCES
));
7028 cct
->get_admin_socket()->queue_tell_command(m
);
7033 class unlock_guard
{
7036 explicit unlock_guard(ceph::mutex
& mutex
)
7041 unlock_guard(unlock_guard
&) = delete;
7048 void OSD::scrub_purged_snaps()
7050 dout(10) << __func__
<< dendl
;
7051 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7052 SnapMapper::Scrubber
s(cct
, store
.get(), service
.meta_ch
,
7053 make_snapmapper_oid(),
7054 make_purged_snaps_oid());
7055 clog
->debug() << "purged_snaps scrub starts";
7058 if (s
.stray
.size()) {
7059 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
7061 clog
->debug() << "purged_snaps scrub ok";
7063 set
<pair
<spg_t
,snapid_t
>> queued
;
7064 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
7065 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
7067 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
7070 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
7071 spg_t
spgid(pgid
, shard
);
7072 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
7073 if (queued
.count(p
)) {
7074 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
7075 << " already queued" << dendl
;
7078 PGRef pg
= lookup_lock_pg(spgid
);
7080 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
7084 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
7086 pg
->queue_snap_retrim(snap
);
7090 if (is_stopping()) {
7093 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
7094 ObjectStore::Transaction t
;
7095 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
7096 write_superblock(t
);
7097 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7098 ceph_assert(tr
== 0);
7100 send_beacon(ceph::coarse_mono_clock::now());
7102 dout(10) << __func__
<< " done" << dendl
;
7105 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
7107 set
<string
> devnames
;
7108 store
->get_devices(&devnames
);
7109 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
7110 "osd_smart_report_timeout");
7112 // == typedef std::map<std::string, mValue> mObject;
7113 json_spirit::mObject json_map
;
7115 for (auto dev
: devnames
) {
7116 // smartctl works only on physical devices; filter out any logical device
7117 if (dev
.find("dm-") == 0) {
7122 string devid
= get_device_id(dev
, &err
);
7123 if (devid
.size() == 0) {
7124 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
7125 << err
<< "), skipping" << dendl
;
7128 if (only_devid
.size() && devid
!= only_devid
) {
7132 json_spirit::mValue smart_json
;
7133 if (block_device_get_metrics(dev
, smart_timeout
,
7135 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7138 json_map
[devid
] = smart_json
;
7140 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7143 bool OSD::heartbeat_dispatch(Message
*m
)
7145 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7146 switch (m
->get_type()) {
7149 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7154 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7158 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7165 bool OSD::ms_dispatch(Message
*m
)
7167 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7168 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7169 service
.got_stop_ack();
7177 if (is_stopping()) {
7191 void OSDService::maybe_share_map(
7193 const OSDMapRef
& osdmap
,
7194 epoch_t peer_epoch_lb
)
7196 // NOTE: we assume caller hold something that keeps the Connection itself
7197 // pinned (e.g., an OpRequest's MessageRef).
7198 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7203 // assume the peer has the newer of the op's sent_epoch and what
7204 // we think we sent them.
7205 session
->sent_epoch_lock
.lock();
7206 if (peer_epoch_lb
> session
->last_sent_epoch
) {
7207 dout(10) << __func__
<< " con " << con
7208 << " " << con
->get_peer_addr()
7209 << " map epoch " << session
->last_sent_epoch
7210 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
7211 session
->last_sent_epoch
= peer_epoch_lb
;
7213 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
7214 session
->sent_epoch_lock
.unlock();
7216 if (osdmap
->get_epoch() <= last_sent_epoch
) {
7220 send_incremental_map(last_sent_epoch
, con
, osdmap
);
7221 last_sent_epoch
= osdmap
->get_epoch();
7223 session
->sent_epoch_lock
.lock();
7224 if (session
->last_sent_epoch
< last_sent_epoch
) {
7225 dout(10) << __func__
<< " con " << con
7226 << " " << con
->get_peer_addr()
7227 << " map epoch " << session
->last_sent_epoch
7228 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
7229 session
->last_sent_epoch
= last_sent_epoch
;
7231 session
->sent_epoch_lock
.unlock();
7234 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
7236 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
7238 auto i
= session
->waiting_on_map
.begin();
7239 while (i
!= session
->waiting_on_map
.end()) {
7240 OpRequestRef op
= &(*i
);
7241 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7242 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
7243 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7246 session
->waiting_on_map
.erase(i
++);
7250 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7251 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7252 static_cast<const MOSDOp
*>(m
)->get_pg());
7253 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7257 pgid
= m
->get_spg();
7259 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7262 if (session
->waiting_on_map
.empty()) {
7263 clear_session_waiting_on_map(session
);
7265 register_session_waiting_on_map(session
);
7269 void OSD::ms_fast_dispatch(Message
*m
)
7271 auto dispatch_span
= tracing::osd::tracer
.start_trace(__func__
);
7273 if (service
.is_stopping()) {
7278 switch (m
->get_type()) {
7280 dout(10) << "ping from " << m
->get_source() << dendl
;
7283 case MSG_OSD_FORCE_RECOVERY
:
7284 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7286 case MSG_OSD_SCRUB2
:
7287 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7289 case MSG_OSD_PG_CREATE2
:
7290 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7291 case MSG_OSD_PG_NOTIFY
:
7292 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7293 case MSG_OSD_PG_INFO
:
7294 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7295 case MSG_OSD_PG_REMOVE
:
7296 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7297 // these are single-pg messages that handle themselves
7298 case MSG_OSD_PG_LOG
:
7299 case MSG_OSD_PG_TRIM
:
7300 case MSG_OSD_PG_NOTIFY2
:
7301 case MSG_OSD_PG_QUERY2
:
7302 case MSG_OSD_PG_INFO2
:
7303 case MSG_OSD_BACKFILL_RESERVE
:
7304 case MSG_OSD_RECOVERY_RESERVE
:
7305 case MSG_OSD_PG_LEASE
:
7306 case MSG_OSD_PG_LEASE_ACK
:
7308 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7309 if (require_osd_peer(pm
)) {
7310 enqueue_peering_evt(
7312 PGPeeringEventRef(pm
->get_event()));
7319 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7322 osd_reqid_t reqid
= op
->get_reqid();
7324 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7325 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7327 op
->osd_parent_span
= tracing::osd::tracer
.add_span("op-request-created", dispatch_span
);
7330 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7332 // note sender epoch, min req's epoch
7333 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7334 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7335 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7337 service
.maybe_inject_dispatch_delay();
7339 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7340 m
->get_type() != CEPH_MSG_OSD_OP
) {
7341 // queue it directly
7343 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7345 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7347 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7348 // message that didn't have an explicit spg_t); we need to map
7349 // them to an spg_t while preserving delivery order.
7350 auto priv
= m
->get_connection()->get_priv();
7351 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7352 std::lock_guard l
{session
->session_dispatch_lock
};
7354 session
->waiting_on_map
.push_back(*op
);
7355 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7356 dispatch_session_waiting(session
, nextmap
);
7357 service
.release_map(nextmap
);
7360 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7363 int OSD::ms_handle_authentication(Connection
*con
)
7366 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7368 s
= ceph::make_ref
<Session
>(cct
, con
);
7370 s
->entity_name
= con
->get_peer_entity_name();
7371 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7372 << " entity " << s
->entity_name
7373 << " addr " << con
->get_peer_addrs() << dendl
;
7375 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7376 << " entity " << s
->entity_name
7377 << " addr " << con
->get_peer_addrs() << dendl
;
7380 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7381 if (caps_info
.allow_all
) {
7382 s
->caps
.set_allow_all();
7383 } else if (caps_info
.caps
.length() > 0) {
7384 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7389 catch (ceph::buffer::error
& e
) {
7390 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7391 << " failed to decode caps string" << dendl
;
7395 bool success
= s
->caps
.parse(str
);
7397 dout(10) << __func__
<< " session " << s
7398 << " " << s
->entity_name
7399 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7402 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7403 << " failed to parse caps '" << str
<< "'" << dendl
;
7411 void OSD::do_waiters()
7413 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7415 dout(10) << "do_waiters -- start" << dendl
;
7416 while (!finished
.empty()) {
7417 OpRequestRef next
= finished
.front();
7418 finished
.pop_front();
7421 dout(10) << "do_waiters -- finish" << dendl
;
7424 void OSD::dispatch_op(OpRequestRef op
)
7426 switch (op
->get_req()->get_type()) {
7428 case MSG_OSD_PG_CREATE
:
7429 handle_pg_create(op
);
7434 void OSD::_dispatch(Message
*m
)
7436 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7437 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7439 switch (m
->get_type()) {
7440 // -- don't need OSDMap --
7442 // map and replication
7443 case CEPH_MSG_OSD_MAP
:
7444 handle_osd_map(static_cast<MOSDMap
*>(m
));
7446 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7447 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7452 handle_scrub(static_cast<MOSDScrub
*>(m
));
7456 handle_command(static_cast<MCommand
*>(m
));
7459 // -- need OSDMap --
7461 case MSG_OSD_PG_CREATE
:
7463 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7465 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7466 // no map? starting up?
7467 if (!get_osdmap()) {
7468 dout(7) << "no OSDMap, not booted" << dendl
;
7469 logger
->inc(l_osd_waiting_for_map
);
7470 waiting_for_osdmap
.push_back(op
);
7471 op
->mark_delayed("no osdmap");
7481 // remove me post-nautilus
7482 void OSD::handle_scrub(MOSDScrub
*m
)
7484 dout(10) << "handle_scrub " << *m
<< dendl
;
7485 if (!require_mon_or_mgr_peer(m
)) {
7489 if (m
->fsid
!= monc
->get_fsid()) {
7490 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7499 if (!m
->scrub_pgs
.empty()) {
7501 for (auto pgid
: m
->scrub_pgs
) {
7503 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7504 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7511 for (auto pgid
: spgs
) {
7512 enqueue_peering_evt(
7515 std::make_shared
<PGPeeringEvent
>(
7518 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7524 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7526 dout(10) << __func__
<< " " << *m
<< dendl
;
7527 if (!require_mon_or_mgr_peer(m
)) {
7531 if (m
->fsid
!= monc
->get_fsid()) {
7532 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7537 for (auto pgid
: m
->scrub_pgs
) {
7538 enqueue_peering_evt(
7541 std::make_shared
<PGPeeringEvent
>(
7544 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7549 bool OSD::scrub_random_backoff()
7551 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7552 cct
->_conf
->osd_scrub_backoff_ratio
);
7554 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7561 void OSD::sched_scrub()
7563 auto& scrub_scheduler
= service
.get_scrub_services();
7565 // fail fast if no resources are available
7566 if (!scrub_scheduler
.can_inc_scrubs()) {
7567 dout(20) << __func__
<< ": OSD cannot inc scrubs" << dendl
;
7571 // if there is a PG that is just now trying to reserve scrub replica resources -
7572 // we should wait and not initiate a new scrub
7573 if (scrub_scheduler
.is_reserving_now()) {
7574 dout(20) << __func__
<< ": scrub resources reservation in progress" << dendl
;
7578 Scrub::ScrubPreconds env_conditions
;
7580 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7581 if (!cct
->_conf
->osd_repair_during_recovery
) {
7582 dout(15) << __func__
<< ": not scheduling scrubs due to active recovery"
7586 dout(10) << __func__
7587 << " will only schedule explicitly requested repair due to active recovery"
7589 env_conditions
.allow_requested_repair_only
= true;
7592 if (g_conf()->subsys
.should_gather
<ceph_subsys_osd
, 20>()) {
7593 dout(20) << __func__
<< " sched_scrub starts" << dendl
;
7594 auto all_jobs
= scrub_scheduler
.list_registered_jobs();
7595 for (const auto& sj
: all_jobs
) {
7596 dout(20) << "sched_scrub scrub-queue jobs: " << *sj
<< dendl
;
7600 auto was_started
= scrub_scheduler
.select_pg_and_scrub(env_conditions
);
7601 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started
)
7605 Scrub::schedule_result_t
OSDService::initiate_a_scrub(spg_t pgid
,
7606 bool allow_requested_repair_only
)
7608 dout(20) << __func__
<< " trying " << pgid
<< dendl
;
7610 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7613 PGRef pg
= osd
->lookup_lock_pg(pgid
);
7615 // the PG was dequeued in the short timespan between creating the candidates list
7616 // (collect_ripe_jobs()) and here
7617 dout(5) << __func__
<< " pg " << pgid
<< " not found" << dendl
;
7618 return Scrub::schedule_result_t::no_such_pg
;
7621 // This has already started, so go on to the next scrub job
7622 if (pg
->is_scrub_queued_or_active()) {
7624 dout(20) << __func__
<< ": already in progress pgid " << pgid
<< dendl
;
7625 return Scrub::schedule_result_t::already_started
;
7627 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7628 if (allow_requested_repair_only
&& !pg
->m_planned_scrub
.must_repair
) {
7630 dout(10) << __func__
<< " skip " << pgid
7631 << " because repairing is not explicitly requested on it" << dendl
;
7632 return Scrub::schedule_result_t::preconditions
;
7635 auto scrub_attempt
= pg
->sched_scrub();
7637 return scrub_attempt
;
7640 void OSD::resched_all_scrubs()
7642 dout(10) << __func__
<< ": start" << dendl
;
7643 auto all_jobs
= service
.get_scrub_services().list_registered_jobs();
7644 for (auto& e
: all_jobs
) {
7647 dout(20) << __func__
<< ": examine " << job
.pgid
<< dendl
;
7649 PGRef pg
= _lookup_lock_pg(job
.pgid
);
7653 if (!pg
->m_planned_scrub
.must_scrub
&& !pg
->m_planned_scrub
.need_auto
) {
7654 dout(15) << __func__
<< ": reschedule " << job
.pgid
<< dendl
;
7655 pg
->reschedule_scrub();
7659 dout(10) << __func__
<< ": done" << dendl
;
7662 MPGStats
* OSD::collect_pg_stats()
7664 dout(15) << __func__
<< dendl
;
7665 // This implementation unconditionally sends every is_primary PG's
7666 // stats every time we're called. This has equivalent cost to the
7667 // previous implementation's worst case where all PGs are busy and
7668 // their stats are always enqueued for sending.
7669 std::shared_lock l
{map_lock
};
7671 osd_stat_t cur_stat
= service
.get_osd_stat();
7672 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7674 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7675 m
->osd_stat
= cur_stat
;
7677 std::lock_guard lec
{min_last_epoch_clean_lock
};
7678 min_last_epoch_clean
= get_osdmap_epoch();
7679 min_last_epoch_clean_pgs
.clear();
7681 std::set
<int64_t> pool_set
;
7684 for (auto& pg
: pgs
) {
7685 auto pool
= pg
->pg_id
.pgid
.pool();
7686 pool_set
.emplace((int64_t)pool
);
7687 if (!pg
->is_primary()) {
7690 pg
->with_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7691 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7692 min_last_epoch_clean
= std::min(min_last_epoch_clean
, lec
);
7693 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7697 bool per_pool_stats
= false;
7698 bool per_pool_omap_stats
= false;
7699 for (auto p
: pool_set
) {
7700 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7701 if (r
== -ENOTSUP
) {
7705 m
->pool_stat
[p
] = st
;
7706 per_pool_stats
= true;
7710 // indicate whether we are reporting per-pool stats
7711 m
->osd_stat
.num_osds
= 1;
7712 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7713 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7718 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7720 vector
<DaemonHealthMetric
> metrics
;
7722 utime_t oldest_secs
;
7723 const utime_t now
= ceph_clock_now();
7725 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7727 TrackedOpRef oldest_op
;
7728 OSDMapRef osdmap
= get_osdmap();
7729 // map of slow op counts by slow op event type for an aggregated logging to
7731 map
<uint8_t, int> slow_op_types
;
7732 // map of slow op counts by pool for reporting a pool name with highest
7734 map
<uint64_t, int> slow_op_pools
;
7735 bool log_aggregated_slow_op
=
7736 cct
->_conf
.get_val
<bool>("osd_aggregated_slow_ops_logging");
7737 auto count_slow_ops
= [&](TrackedOp
& op
) {
7738 if (op
.get_initiated() < too_old
) {
7740 ss
<< "slow request " << op
.get_desc()
7742 << op
.get_initiated()
7744 << op
.state_string();
7745 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7746 if (log_aggregated_slow_op
) {
7747 if (const OpRequest
*req
= dynamic_cast<const OpRequest
*>(&op
)) {
7748 uint8_t op_type
= req
->state_flag();
7749 auto m
= req
->get_req
<MOSDFastDispatchOp
>();
7750 uint64_t poolid
= m
->get_spg().pgid
.m_pool
;
7751 slow_op_types
[op_type
]++;
7752 if (poolid
> 0 && poolid
<= (uint64_t) osdmap
->get_pool_max()) {
7753 slow_op_pools
[poolid
]++;
7757 clog
->warn() << ss
.str();
7760 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7768 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7770 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7771 << oldest_op
->get_desc() << dendl
;
7772 if (log_aggregated_slow_op
&&
7773 slow_op_types
.size() > 0) {
7775 ss
<< slow
<< " slow requests (by type [ ";
7776 for (const auto& [op_type
, count
] : slow_op_types
) {
7777 ss
<< "'" << OpRequest::get_state_string(op_type
)
7781 auto slow_pool_it
= std::max_element(slow_op_pools
.begin(), slow_op_pools
.end(),
7782 [](std::pair
<uint64_t, int> p1
, std::pair
<uint64_t, int> p2
) {
7783 return p1
.second
< p2
.second
;
7785 if (osdmap
->get_pools().find(slow_pool_it
->first
) != osdmap
->get_pools().end()) {
7786 string pool_name
= osdmap
->get_pool_name(slow_pool_it
->first
);
7787 ss
<< "] most affected pool [ '"
7790 << slow_pool_it
->second
7795 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7796 clog
->warn() << ss
.str();
7799 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7801 // no news is not good news.
7802 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7806 std::lock_guard
l(pending_creates_lock
);
7807 auto n_primaries
= pending_creates_from_mon
;
7808 for (const auto& create
: pending_creates_from_osd
) {
7809 if (create
.second
) {
7813 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7818 // =====================================================
7821 void OSD::wait_for_new_map(OpRequestRef op
)
7824 if (waiting_for_osdmap
.empty()) {
7825 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7828 logger
->inc(l_osd_waiting_for_map
);
7829 waiting_for_osdmap
.push_back(op
);
7830 op
->mark_delayed("wait for new map");
7835 * assimilate new OSDMap(s). scan pgs, etc.
7838 void OSD::note_down_osd(int peer
)
7840 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7841 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7843 std::lock_guard l
{heartbeat_lock
};
7844 failure_queue
.erase(peer
);
7845 failure_pending
.erase(peer
);
7846 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7847 if (p
!= heartbeat_peers
.end()) {
7848 p
->second
.clear_mark_down();
7849 heartbeat_peers
.erase(p
);
7853 void OSD::note_up_osd(int peer
)
7855 heartbeat_set_peers_need_update();
7858 struct C_OnMapCommit
: public Context
{
7860 epoch_t first
, last
;
7862 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7863 : osd(o
), first(f
), last(l
), msg(m
) {}
7864 void finish(int r
) override
{
7865 osd
->_committed_osd_maps(first
, last
, msg
);
7870 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7872 std::lock_guard
l(osdmap_subscribe_lock
);
7873 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7876 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7878 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7884 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7886 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7887 if (min
<= superblock
.oldest_map
)
7891 ObjectStore::Transaction t
;
7892 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7893 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7894 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7895 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7896 superblock
.oldest_map
= e
+ 1;
7898 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7899 service
.publish_superblock(superblock
);
7900 write_superblock(t
);
7901 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7902 ceph_assert(tr
== 0);
7905 // skip_maps leaves us with a range of old maps if we fail to remove all
7906 // of them before moving superblock.oldest_map forward to the first map
7907 // in the incoming MOSDMap msg. so we should continue removing them in
7908 // this case, even we could do huge series of delete transactions all at
7915 service
.publish_superblock(superblock
);
7916 write_superblock(t
);
7917 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7918 ceph_assert(tr
== 0);
7920 // we should not remove the cached maps
7921 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7924 void OSD::handle_osd_map(MOSDMap
*m
)
7926 // wait for pgs to catch up
7928 // we extend the map cache pins to accomodate pgs slow to consume maps
7929 // for some period, until we hit the max_lag_factor bound, at which point
7930 // we block here to stop injesting more maps than they are able to keep
7932 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7933 m_osd_pg_epoch_max_lag_factor
;
7934 ceph_assert(max_lag
> 0);
7935 epoch_t osd_min
= 0;
7936 for (auto shard
: shards
) {
7937 epoch_t min
= shard
->get_min_pg_epoch();
7938 if (osd_min
== 0 || min
< osd_min
) {
7942 epoch_t osdmap_epoch
= get_osdmap_epoch();
7944 osdmap_epoch
> max_lag
&&
7945 osdmap_epoch
- max_lag
> osd_min
) {
7946 epoch_t need
= osdmap_epoch
- max_lag
;
7947 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7948 << " max_lag " << max_lag
<< ")" << dendl
;
7949 for (auto shard
: shards
) {
7950 epoch_t min
= shard
->get_min_pg_epoch();
7952 dout(10) << __func__
<< " waiting for pgs to consume " << need
7953 << " (shard " << shard
->shard_id
<< " min " << min
7954 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7955 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7957 unlock_guard unlock
{osd_lock
};
7958 shard
->wait_min_pg_epoch(need
);
7964 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7965 map
<epoch_t
,OSDMapRef
> added_maps
;
7966 map
<epoch_t
,bufferlist
> added_maps_bl
;
7967 if (m
->fsid
!= monc
->get_fsid()) {
7968 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7969 << monc
->get_fsid() << dendl
;
7973 if (is_initializing()) {
7974 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7979 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7980 if (session
&& !(session
->entity_name
.is_mon() ||
7981 session
->entity_name
.is_osd())) {
7983 dout(10) << "got osd map from Session " << session
7984 << " which we can't take maps from (not a mon or osd)" << dendl
;
7989 // share with the objecter
7991 service
.objecter
->handle_osd_map(m
);
7993 epoch_t first
= m
->get_first();
7994 epoch_t last
= m
->get_last();
7995 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7996 << superblock
.newest_map
7997 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
8000 logger
->inc(l_osd_map
);
8001 logger
->inc(l_osd_mape
, last
- first
+ 1);
8002 if (first
<= superblock
.newest_map
)
8003 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
8004 if (service
.max_oldest_map
< m
->oldest_map
) {
8005 service
.max_oldest_map
= m
->oldest_map
;
8006 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
8009 // make sure there is something new, here, before we bother flushing
8010 // the queues and such
8011 if (last
<= superblock
.newest_map
) {
8012 dout(10) << " no new maps here, dropping" << dendl
;
8018 bool skip_maps
= false;
8019 if (first
> superblock
.newest_map
+ 1) {
8020 dout(10) << "handle_osd_map message skips epochs "
8021 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
8022 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
8023 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8027 // always try to get the full range of maps--as many as we can. this
8028 // 1- is good to have
8029 // 2- is at present the only way to ensure that we get a *full* map as
8031 if (m
->oldest_map
< first
) {
8032 osdmap_subscribe(m
->oldest_map
- 1, true);
8039 ObjectStore::Transaction t
;
8040 uint64_t txn_size
= 0;
8042 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
8044 // store new maps: queue for disk and put in the osdmap cache
8045 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8046 for (epoch_t e
= start
; e
<= last
; e
++) {
8047 if (txn_size
>= t
.get_num_bytes()) {
8048 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8049 ceph_assert(txn_size
< t
.get_num_bytes());
8051 txn_size
= t
.get_num_bytes();
8052 map
<epoch_t
,bufferlist
>::iterator p
;
8053 p
= m
->maps
.find(e
);
8054 if (p
!= m
->maps
.end()) {
8055 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8056 OSDMap
*o
= new OSDMap
;
8057 bufferlist
& bl
= p
->second
;
8061 purged_snaps
[e
] = o
->get_new_purged_snaps();
8063 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8064 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8065 added_maps
[e
] = add_map(o
);
8066 added_maps_bl
[e
] = bl
;
8071 p
= m
->incremental_maps
.find(e
);
8072 if (p
!= m
->incremental_maps
.end()) {
8073 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8074 bufferlist
& bl
= p
->second
;
8075 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8076 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8078 OSDMap
*o
= new OSDMap
;
8081 bool got
= get_map_bl(e
- 1, obl
);
8083 auto p
= added_maps_bl
.find(e
- 1);
8084 ceph_assert(p
!= added_maps_bl
.end());
8090 OSDMap::Incremental inc
;
8091 auto p
= bl
.cbegin();
8094 if (o
->apply_incremental(inc
) < 0) {
8095 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8096 ceph_abort_msg("bad fsid");
8100 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8102 bool injected_failure
= false;
8103 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8104 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8105 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8106 injected_failure
= true;
8109 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8110 dout(2) << "got incremental " << e
8111 << " but failed to encode full with correct crc; requesting"
8113 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8114 dout(20) << "my encoded map was:\n";
8115 fbl
.hexdump(*_dout
);
8118 request_full_map(e
, last
);
8121 // don't continue committing if we failed to enc the first inc map
8123 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
8130 purged_snaps
[e
] = o
->get_new_purged_snaps();
8132 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8133 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8134 added_maps
[e
] = add_map(o
);
8135 added_maps_bl
[e
] = fbl
;
8139 ceph_abort_msg("MOSDMap lied about what maps it had?");
8142 // even if this map isn't from a mon, we may have satisfied our subscription
8143 monc
->sub_got("osdmap", last
);
8145 if (!m
->maps
.empty() && requested_full_first
) {
8146 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8147 << ".." << requested_full_last
<< dendl
;
8148 rerequest_full_maps();
8151 if (superblock
.oldest_map
) {
8152 // make sure we at least keep pace with incoming maps
8153 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8154 pg_num_history
.prune(superblock
.oldest_map
);
8157 if (!superblock
.oldest_map
|| skip_maps
)
8158 superblock
.oldest_map
= first
;
8159 superblock
.newest_map
= last
;
8160 superblock
.current_epoch
= last
;
8162 // note in the superblock that we were clean thru the prior epoch
8163 epoch_t boot_epoch
= service
.get_boot_epoch();
8164 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8165 superblock
.mounted
= boot_epoch
;
8166 superblock
.clean_thru
= last
;
8169 // check for pg_num changes and deleted pools
8171 for (auto& i
: added_maps
) {
8173 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8174 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8175 << " probably first start of this osd" << dendl
;
8179 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8180 for (auto& j
: lastmap
->get_pools()) {
8181 if (!i
.second
->have_pg_pool(j
.first
)) {
8182 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8183 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8184 << j
.first
<< dendl
;
8185 // this information is needed by _make_pg() if have to restart before
8186 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8187 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8189 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8190 string name
= lastmap
->get_pool_name(j
.first
);
8192 map
<string
,string
> profile
;
8193 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8194 profile
= lastmap
->get_erasure_code_profile(
8195 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8197 encode(profile
, bl
);
8198 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8199 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8200 new_pg_num
!= j
.second
.get_pg_num()) {
8201 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8202 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8203 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8206 for (auto& j
: i
.second
->get_pools()) {
8207 if (!lastmap
->have_pg_pool(j
.first
)) {
8208 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8209 << j
.second
.get_pg_num() << dendl
;
8210 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8211 j
.second
.get_pg_num());
8216 pg_num_history
.epoch
= last
;
8219 ::encode(pg_num_history
, bl
);
8220 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8221 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8224 // record new purged_snaps
8225 if (superblock
.purged_snaps_last
== start
- 1) {
8226 SnapMapper::record_purged_snaps(cct
, store
.get(), service
.meta_ch
,
8227 make_purged_snaps_oid(), &t
,
8229 superblock
.purged_snaps_last
= last
;
8231 dout(10) << __func__
<< " superblock purged_snaps_last is "
8232 << superblock
.purged_snaps_last
8233 << ", not recording new purged_snaps" << dendl
;
8236 // superblock and commit
8237 write_superblock(t
);
8238 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8239 store
->queue_transaction(
8242 service
.publish_superblock(superblock
);
8245 void OSD::_committed_osd_maps(epoch_t first
, epoch_t last
, MOSDMap
*m
)
8247 dout(10) << __func__
<< " " << first
<< ".." << last
<< dendl
;
8248 if (is_stopping()) {
8249 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8252 std::lock_guard
l(osd_lock
);
8253 if (is_stopping()) {
8254 dout(10) << __func__
<< " bailing, we are shutting down" << dendl
;
8259 ceph_assert(first
<= last
);
8261 bool do_shutdown
= false;
8262 bool do_restart
= false;
8263 bool network_error
= false;
8264 OSDMapRef osdmap
= get_osdmap();
8266 // advance through the new maps
8267 for (epoch_t cur
= first
; cur
<= last
; cur
++) {
8268 dout(10) << " advance to epoch " << cur
8269 << " (<= last " << last
8270 << " <= newest_map " << superblock
.newest_map
8273 OSDMapRef newmap
= get_map(cur
);
8274 ceph_assert(newmap
); // we just cached it above!
8276 // start blocklisting messages sent to peers that go down.
8277 service
.pre_publish_map(newmap
);
8279 // kill connections to newly down osds
8280 bool waited_for_reservations
= false;
8282 osdmap
= get_osdmap();
8283 osdmap
->get_all_osds(old
);
8284 for (set
<int>::iterator p
= old
.begin(); p
!= old
.end(); ++p
) {
8286 osdmap
->is_up(*p
) && // in old map
8287 newmap
->is_down(*p
)) { // but not the new one
8288 if (!waited_for_reservations
) {
8289 service
.await_reserved_maps();
8290 waited_for_reservations
= true;
8293 } else if (*p
!= whoami
&&
8294 osdmap
->is_down(*p
) &&
8295 newmap
->is_up(*p
)) {
8300 if (osdmap
->is_noup(whoami
) != newmap
->is_noup(whoami
)) {
8301 dout(10) << __func__
<< " NOUP flag changed in " << newmap
->get_epoch()
8304 // this captures the case where we sent the boot message while
8305 // NOUP was being set on the mon and our boot request was
8306 // dropped, and then later it is cleared. it imperfectly
8307 // handles the case where our original boot message was not
8308 // dropped and we restart even though we might have booted, but
8309 // that is harmless (boot will just take slightly longer).
8314 osdmap
= std::move(newmap
);
8318 service
.retrieve_epochs(&boot_epoch
, &up_epoch
, NULL
);
8320 osdmap
->is_up(whoami
) &&
8321 osdmap
->get_addrs(whoami
) == client_messenger
->get_myaddrs()) {
8322 up_epoch
= osdmap
->get_epoch();
8323 dout(10) << "up_epoch is " << up_epoch
<< dendl
;
8325 boot_epoch
= osdmap
->get_epoch();
8326 dout(10) << "boot_epoch is " << boot_epoch
<< dendl
;
8328 service
.set_epochs(&boot_epoch
, &up_epoch
, NULL
);
8332 epoch_t _bind_epoch
= service
.get_bind_epoch();
8333 if (osdmap
->is_up(whoami
) &&
8334 osdmap
->get_addrs(whoami
).legacy_equals(
8335 client_messenger
->get_myaddrs()) &&
8336 _bind_epoch
< osdmap
->get_up_from(whoami
)) {
8339 dout(1) << "state: booting -> active" << dendl
;
8340 set_state(STATE_ACTIVE
);
8343 // set incarnation so that osd_reqid_t's we generate for our
8344 // objecter requests are unique across restarts.
8345 service
.objecter
->set_client_incarnation(osdmap
->get_epoch());
8346 cancel_pending_failures();
8350 if (osdmap
->get_epoch() > 0 &&
8352 if (!osdmap
->exists(whoami
)) {
8353 derr
<< "map says i do not exist. shutting down." << dendl
;
8354 do_shutdown
= true; // don't call shutdown() while we have
8355 // everything paused
8356 } else if (osdmap
->is_stop(whoami
)) {
8357 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8359 } else if (!osdmap
->is_up(whoami
) ||
8360 !osdmap
->get_addrs(whoami
).legacy_equals(
8361 client_messenger
->get_myaddrs()) ||
8362 !osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8363 cluster_messenger
->get_myaddrs()) ||
8364 !osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8365 hb_back_server_messenger
->get_myaddrs()) ||
8366 !osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8367 hb_front_server_messenger
->get_myaddrs())) {
8368 if (!osdmap
->is_up(whoami
)) {
8369 if (service
.is_preparing_to_stop() || service
.is_stopping()) {
8370 service
.got_stop_ack();
8372 clog
->warn() << "Monitor daemon marked osd." << whoami
<< " down, "
8373 "but it is still running";
8374 clog
->debug() << "map e" << osdmap
->get_epoch()
8375 << " wrongly marked me down at e"
8376 << osdmap
->get_down_at(whoami
);
8378 if (monc
->monmap
.min_mon_release
>= ceph_release_t::octopus
) {
8379 // note that this is best-effort...
8380 monc
->send_mon_message(
8384 osdmap
->get_epoch()));
8386 } else if (!osdmap
->get_addrs(whoami
).legacy_equals(
8387 client_messenger
->get_myaddrs())) {
8388 clog
->error() << "map e" << osdmap
->get_epoch()
8389 << " had wrong client addr (" << osdmap
->get_addrs(whoami
)
8390 << " != my " << client_messenger
->get_myaddrs() << ")";
8391 } else if (!osdmap
->get_cluster_addrs(whoami
).legacy_equals(
8392 cluster_messenger
->get_myaddrs())) {
8393 clog
->error() << "map e" << osdmap
->get_epoch()
8394 << " had wrong cluster addr ("
8395 << osdmap
->get_cluster_addrs(whoami
)
8396 << " != my " << cluster_messenger
->get_myaddrs() << ")";
8397 } else if (!osdmap
->get_hb_back_addrs(whoami
).legacy_equals(
8398 hb_back_server_messenger
->get_myaddrs())) {
8399 clog
->error() << "map e" << osdmap
->get_epoch()
8400 << " had wrong heartbeat back addr ("
8401 << osdmap
->get_hb_back_addrs(whoami
)
8402 << " != my " << hb_back_server_messenger
->get_myaddrs()
8404 } else if (!osdmap
->get_hb_front_addrs(whoami
).legacy_equals(
8405 hb_front_server_messenger
->get_myaddrs())) {
8406 clog
->error() << "map e" << osdmap
->get_epoch()
8407 << " had wrong heartbeat front addr ("
8408 << osdmap
->get_hb_front_addrs(whoami
)
8409 << " != my " << hb_front_server_messenger
->get_myaddrs()
8413 if (!service
.is_stopping()) {
8414 epoch_t up_epoch
= 0;
8415 epoch_t bind_epoch
= osdmap
->get_epoch();
8416 service
.set_epochs(NULL
,&up_epoch
, &bind_epoch
);
8420 utime_t now
= ceph_clock_now();
8421 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
8422 osd_markdown_log
.push_back(now
);
8423 if ((int)osd_markdown_log
.size() > cct
->_conf
->osd_max_markdown_count
) {
8424 derr
<< __func__
<< " marked down "
8425 << osd_markdown_log
.size()
8426 << " > osd_max_markdown_count "
8427 << cct
->_conf
->osd_max_markdown_count
8428 << " in last " << grace
<< " seconds, shutting down"
8434 start_waiting_for_healthy();
8436 set
<int> avoid_ports
;
8437 #if defined(__FreeBSD__)
8438 // prevent FreeBSD from grabbing the client_messenger port during
8439 // rebinding. In which case a cluster_meesneger will connect also
8441 client_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8443 cluster_messenger
->get_myaddrs().get_ports(&avoid_ports
);
8445 int r
= cluster_messenger
->rebind(avoid_ports
);
8447 do_shutdown
= true; // FIXME: do_restart?
8448 network_error
= true;
8449 derr
<< __func__
<< " marked down:"
8450 << " rebind cluster_messenger failed" << dendl
;
8453 hb_back_server_messenger
->mark_down_all();
8454 hb_front_server_messenger
->mark_down_all();
8455 hb_front_client_messenger
->mark_down_all();
8456 hb_back_client_messenger
->mark_down_all();
8458 reset_heartbeat_peers(true);
8461 } else if (osdmap
->get_epoch() > 0 && osdmap
->is_stop(whoami
)) {
8462 derr
<< "map says i am stopped by admin. shutting down." << dendl
;
8468 check_osdmap_features();
8473 if (is_active() || is_waiting_for_healthy())
8474 maybe_update_heartbeat_peers();
8481 if (network_error
) {
8482 cancel_pending_failures();
8484 // trigger shutdown in a different thread
8485 dout(0) << __func__
<< " shutdown OSD via async signal" << dendl
;
8486 queue_async_signal(SIGINT
);
8488 else if (m
->newest_map
&& m
->newest_map
> last
) {
8489 dout(10) << " msg say newest map is " << m
->newest_map
8490 << ", requesting more" << dendl
;
8491 osdmap_subscribe(osdmap
->get_epoch()+1, false);
8493 else if (is_preboot()) {
8494 if (m
->get_source().is_mon())
8495 _preboot(m
->oldest_map
, m
->newest_map
);
8499 else if (do_restart
)
8504 void OSD::check_osdmap_features()
8506 // adjust required feature bits?
8508 // we have to be a bit careful here, because we are accessing the
8509 // Policy structures without taking any lock. in particular, only
8510 // modify integer values that can safely be read by a racing CPU.
8511 // since we are only accessing existing Policy structures a their
8512 // current memory location, and setting or clearing bits in integer
8513 // fields, and we are the only writer, this is not a problem.
8515 const auto osdmap
= get_osdmap();
8517 Messenger::Policy p
= client_messenger
->get_default_policy();
8519 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_CLIENT
, &mask
);
8520 if ((p
.features_required
& mask
) != features
) {
8521 dout(0) << "crush map has features " << features
8522 << ", adjusting msgr requires for clients" << dendl
;
8523 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8524 client_messenger
->set_default_policy(p
);
8528 Messenger::Policy p
= client_messenger
->get_policy(entity_name_t::TYPE_MON
);
8530 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_MON
, &mask
);
8531 if ((p
.features_required
& mask
) != features
) {
8532 dout(0) << "crush map has features " << features
8533 << " was " << p
.features_required
8534 << ", adjusting msgr requires for mons" << dendl
;
8535 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8536 client_messenger
->set_policy(entity_name_t::TYPE_MON
, p
);
8540 Messenger::Policy p
= cluster_messenger
->get_policy(entity_name_t::TYPE_OSD
);
8542 uint64_t features
= osdmap
->get_features(entity_name_t::TYPE_OSD
, &mask
);
8544 if ((p
.features_required
& mask
) != features
) {
8545 dout(0) << "crush map has features " << features
8546 << ", adjusting msgr requires for osds" << dendl
;
8547 p
.features_required
= (p
.features_required
& ~mask
) | features
;
8548 cluster_messenger
->set_policy(entity_name_t::TYPE_OSD
, p
);
8551 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
)) {
8552 dout(0) << __func__
<< " enabling on-disk ERASURE CODES compat feature" << dendl
;
8553 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
8554 ObjectStore::Transaction t
;
8555 write_superblock(t
);
8556 int err
= store
->queue_transaction(service
.meta_ch
, std::move(t
), NULL
);
8557 ceph_assert(err
== 0);
8561 if (osdmap
->require_osd_release
< ceph_release_t::nautilus
) {
8562 hb_front_server_messenger
->set_require_authorizer(false);
8563 hb_back_server_messenger
->set_require_authorizer(false);
8565 hb_front_server_messenger
->set_require_authorizer(true);
8566 hb_back_server_messenger
->set_require_authorizer(true);
8569 if (osdmap
->require_osd_release
!= last_require_osd_release
) {
8570 dout(1) << __func__
<< " require_osd_release " << last_require_osd_release
8571 << " -> " << to_string(osdmap
->require_osd_release
) << dendl
;
8572 store
->write_meta("require_osd_release",
8573 stringify((int)osdmap
->require_osd_release
));
8574 last_require_osd_release
= osdmap
->require_osd_release
;
8578 struct C_FinishSplits
: public Context
{
8581 C_FinishSplits(OSD
*osd
, const set
<PGRef
> &in
)
8582 : osd(osd
), pgs(in
) {}
8583 void finish(int r
) override
{
8584 osd
->_finish_splits(pgs
);
8588 void OSD::_finish_splits(set
<PGRef
>& pgs
)
8590 dout(10) << __func__
<< " " << pgs
<< dendl
;
8593 for (set
<PGRef
>::iterator i
= pgs
.begin();
8600 dout(10) << __func__
<< " " << *pg
<< dendl
;
8601 epoch_t e
= pg
->get_osdmap_epoch();
8602 pg
->handle_initialize(rctx
);
8603 pg
->queue_null(e
, e
);
8604 dispatch_context(rctx
, pg
, service
.get_osdmap());
8607 unsigned shard_index
= pg
->pg_id
.hash_to_shard(num_shards
);
8608 shards
[shard_index
]->register_and_wake_split_child(pg
);
8612 bool OSD::add_merge_waiter(OSDMapRef nextmap
, spg_t target
, PGRef src
,
8615 std::lock_guard
l(merge_lock
);
8616 auto& p
= merge_waiters
[nextmap
->get_epoch()][target
];
8617 p
[src
->pg_id
] = src
;
8618 dout(10) << __func__
<< " added merge_waiter " << src
->pg_id
8619 << " for " << target
<< ", have " << p
.size() << "/" << need
8621 return p
.size() == need
;
8624 bool OSD::advance_pg(
8627 ThreadPool::TPHandle
&handle
,
8630 if (osd_epoch
<= pg
->get_osdmap_epoch()) {
8633 ceph_assert(pg
->is_locked());
8634 OSDMapRef lastmap
= pg
->get_osdmap();
8635 set
<PGRef
> new_pgs
; // any split children
8638 unsigned old_pg_num
= lastmap
->have_pg_pool(pg
->pg_id
.pool()) ?
8639 lastmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8640 for (epoch_t next_epoch
= pg
->get_osdmap_epoch() + 1;
8641 next_epoch
<= osd_epoch
;
8643 OSDMapRef nextmap
= service
.try_get_map(next_epoch
);
8645 dout(20) << __func__
<< " missing map " << next_epoch
<< dendl
;
8649 unsigned new_pg_num
=
8650 (old_pg_num
&& nextmap
->have_pg_pool(pg
->pg_id
.pool())) ?
8651 nextmap
->get_pg_num(pg
->pg_id
.pool()) : 0;
8652 if (old_pg_num
&& new_pg_num
&& old_pg_num
!= new_pg_num
) {
8654 if (nextmap
->have_pg_pool(pg
->pg_id
.pool())) {
8656 if (pg
->pg_id
.is_merge_source(
8660 // we are merge source
8661 PGRef spg
= pg
; // carry a ref
8662 dout(1) << __func__
<< " " << pg
->pg_id
8663 << " is merge source, target is " << parent
8665 pg
->write_if_dirty(rctx
);
8666 if (!new_pgs
.empty()) {
8667 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8671 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8673 // release backoffs explicitly, since the on_shutdown path
8674 // aggressively tears down backoff state.
8675 if (pg
->is_primary()) {
8676 pg
->release_pg_backoffs();
8679 OSDShard
*sdata
= pg
->osd_shard
;
8681 std::lock_guard
l(sdata
->shard_lock
);
8683 sdata
->_detach_pg(pg
->pg_slot
);
8684 // update pg count now since we might not get an osdmap
8686 if (pg
->is_primary())
8687 logger
->dec(l_osd_pg_primary
);
8688 else if (pg
->is_nonprimary())
8689 logger
->dec(l_osd_pg_replica
); // misnomer
8691 logger
->dec(l_osd_pg_stray
);
8696 set
<spg_t
> children
;
8697 parent
.is_split(new_pg_num
, old_pg_num
, &children
);
8698 if (add_merge_waiter(nextmap
, parent
, pg
, children
.size())) {
8699 enqueue_peering_evt(
8702 std::make_shared
<PGPeeringEvent
>(
8703 nextmap
->get_epoch(),
8704 nextmap
->get_epoch(),
8709 } else if (pg
->pg_id
.is_merge_target(old_pg_num
, new_pg_num
)) {
8710 // we are merge target
8711 set
<spg_t
> children
;
8712 pg
->pg_id
.is_split(new_pg_num
, old_pg_num
, &children
);
8713 dout(20) << __func__
<< " " << pg
->pg_id
8714 << " is merge target, sources are " << children
8716 map
<spg_t
,PGRef
> sources
;
8718 std::lock_guard
l(merge_lock
);
8719 auto& s
= merge_waiters
[nextmap
->get_epoch()][pg
->pg_id
];
8720 unsigned need
= children
.size();
8721 dout(20) << __func__
<< " have " << s
.size() << "/"
8723 if (s
.size() == need
) {
8725 merge_waiters
[nextmap
->get_epoch()].erase(pg
->pg_id
);
8726 if (merge_waiters
[nextmap
->get_epoch()].empty()) {
8727 merge_waiters
.erase(nextmap
->get_epoch());
8731 if (!sources
.empty()) {
8732 unsigned new_pg_num
= nextmap
->get_pg_num(pg
->pg_id
.pool());
8733 unsigned split_bits
= pg
->pg_id
.get_split_bits(new_pg_num
);
8734 dout(1) << __func__
<< " merging " << pg
->pg_id
<< dendl
;
8736 sources
, rctx
, split_bits
,
8737 nextmap
->get_pg_pool(
8738 pg
->pg_id
.pool())->last_pg_merge_meta
);
8739 pg
->pg_slot
->waiting_for_merge_epoch
= 0;
8741 dout(20) << __func__
<< " not ready to merge yet" << dendl
;
8742 pg
->write_if_dirty(rctx
);
8743 if (!new_pgs
.empty()) {
8744 rctx
.transaction
.register_on_applied(new C_FinishSplits(this,
8748 dispatch_context(rctx
, pg
, pg
->get_osdmap(), &handle
);
8750 // kick source(s) to get them ready
8751 for (auto& i
: children
) {
8752 dout(20) << __func__
<< " kicking source " << i
<< dendl
;
8753 enqueue_peering_evt(
8756 std::make_shared
<PGPeeringEvent
>(
8757 nextmap
->get_epoch(),
8758 nextmap
->get_epoch(),
8768 vector
<int> newup
, newacting
;
8769 int up_primary
, acting_primary
;
8770 nextmap
->pg_to_up_acting_osds(
8772 &newup
, &up_primary
,
8773 &newacting
, &acting_primary
);
8774 pg
->handle_advance_map(
8775 nextmap
, lastmap
, newup
, up_primary
,
8776 newacting
, acting_primary
, rctx
);
8778 auto oldpool
= lastmap
->get_pools().find(pg
->pg_id
.pool());
8779 auto newpool
= nextmap
->get_pools().find(pg
->pg_id
.pool());
8780 if (oldpool
!= lastmap
->get_pools().end()
8781 && newpool
!= nextmap
->get_pools().end()) {
8782 dout(20) << __func__
8783 << " new pool opts " << newpool
->second
.opts
8784 << " old pool opts " << oldpool
->second
.opts
8787 double old_min_interval
= 0, new_min_interval
= 0;
8788 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &old_min_interval
);
8789 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &new_min_interval
);
8791 double old_max_interval
= 0, new_max_interval
= 0;
8792 oldpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &old_max_interval
);
8793 newpool
->second
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &new_max_interval
);
8795 // Assume if an interval is change from set to unset or vice versa the actual config
8796 // is different. Keep it simple even if it is possible to call resched_all_scrub()
8798 if (old_min_interval
!= new_min_interval
|| old_max_interval
!= new_max_interval
) {
8799 pg
->on_info_history_change();
8803 if (new_pg_num
&& old_pg_num
!= new_pg_num
) {
8805 set
<spg_t
> children
;
8806 if (pg
->pg_id
.is_split(
8811 pg
, children
, &new_pgs
, lastmap
, nextmap
,
8817 old_pg_num
= new_pg_num
;
8818 handle
.reset_tp_timeout();
8820 pg
->handle_activate_map(rctx
);
8824 if (!new_pgs
.empty()) {
8825 rctx
.transaction
.register_on_applied(new C_FinishSplits(this, new_pgs
));
8830 void OSD::consume_map()
8832 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8833 auto osdmap
= get_osdmap();
8834 dout(7) << "consume_map version " << osdmap
->get_epoch() << dendl
;
8836 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8837 * speak the older sorting version any more. Be careful not to force
8838 * a shutdown if we are merely processing old maps, though.
8840 if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
) && is_active()) {
8841 derr
<< __func__
<< " SORTBITWISE flag is not set" << dendl
;
8845 service
.pre_publish_map(osdmap
);
8846 service
.await_reserved_maps();
8847 service
.publish_map(osdmap
);
8849 // prime splits and merges
8850 set
<pair
<spg_t
,epoch_t
>> newly_split
; // splits, and when
8851 set
<pair
<spg_t
,epoch_t
>> merge_pgs
; // merge participants, and when
8852 for (auto& shard
: shards
) {
8853 shard
->identify_splits_and_merges(osdmap
, &newly_split
, &merge_pgs
);
8855 if (!newly_split
.empty()) {
8856 for (auto& shard
: shards
) {
8857 shard
->prime_splits(osdmap
, &newly_split
);
8859 ceph_assert(newly_split
.empty());
8862 // prune sent_ready_to_merge
8863 service
.prune_sent_ready_to_merge(osdmap
);
8865 // FIXME, maybe: We could race against an incoming peering message
8866 // that instantiates a merge PG after identify_merges() below and
8867 // never set up its peer to complete the merge. An OSD restart
8868 // would clear it up. This is a hard race to resolve,
8869 // extraordinarily rare (we only merge PGs that are stable and
8870 // clean, so it'd have to be an imported PG to an OSD with a
8871 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8872 // replace all of this with a seastar-based code soon anyway.
8873 if (!merge_pgs
.empty()) {
8874 // mark the pgs we already have, or create new and empty merge
8875 // participants for those we are missing. do this all under the
8876 // shard lock so we don't have to worry about racing pg creates
8878 for (auto& shard
: shards
) {
8879 shard
->prime_merges(osdmap
, &merge_pgs
);
8881 ceph_assert(merge_pgs
.empty());
8884 service
.prune_pg_created();
8886 unsigned pushes_to_free
= 0;
8887 for (auto& shard
: shards
) {
8888 shard
->consume_map(osdmap
, &pushes_to_free
);
8891 vector
<spg_t
> pgids
;
8894 // count (FIXME, probably during seastar rewrite)
8895 int num_pg_primary
= 0, num_pg_replica
= 0, num_pg_stray
= 0;
8898 for (auto& pg
: pgs
) {
8899 // FIXME (probably during seastar rewrite): this is lockless and
8900 // racy, but we don't want to take pg lock here.
8901 if (pg
->is_primary())
8903 else if (pg
->is_nonprimary())
8904 num_pg_replica
++; // misnomer
8910 // FIXME (as part of seastar rewrite): move to OSDShard
8911 std::lock_guard
l(pending_creates_lock
);
8912 for (auto pg
= pending_creates_from_osd
.begin();
8913 pg
!= pending_creates_from_osd
.end();) {
8914 if (osdmap
->get_pg_acting_role(pg
->first
, whoami
) < 0) {
8915 dout(10) << __func__
<< " pg " << pg
->first
<< " doesn't map here, "
8916 << "discarding pending_create_from_osd" << dendl
;
8917 pg
= pending_creates_from_osd
.erase(pg
);
8924 service
.maybe_inject_dispatch_delay();
8926 dispatch_sessions_waiting_on_map();
8928 service
.maybe_inject_dispatch_delay();
8930 service
.release_reserved_pushes(pushes_to_free
);
8932 // queue null events to push maps down to individual PGs
8933 for (auto pgid
: pgids
) {
8934 enqueue_peering_evt(
8937 std::make_shared
<PGPeeringEvent
>(
8938 osdmap
->get_epoch(),
8939 osdmap
->get_epoch(),
8942 logger
->set(l_osd_pg
, pgids
.size());
8943 logger
->set(l_osd_pg_primary
, num_pg_primary
);
8944 logger
->set(l_osd_pg_replica
, num_pg_replica
);
8945 logger
->set(l_osd_pg_stray
, num_pg_stray
);
8948 void OSD::activate_map()
8950 ceph_assert(ceph_mutex_is_locked(osd_lock
));
8951 auto osdmap
= get_osdmap();
8953 dout(7) << "activate_map version " << osdmap
->get_epoch() << dendl
;
8956 if (osdmap
->test_flag(CEPH_OSDMAP_NORECOVER
)) {
8957 if (!service
.recovery_is_paused()) {
8958 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl
;
8959 service
.pause_recovery();
8962 if (service
.recovery_is_paused()) {
8963 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl
;
8964 service
.unpause_recovery();
8968 service
.activate_map();
8971 take_waiters(waiting_for_osdmap
);
8974 bool OSD::require_mon_peer(const Message
*m
)
8976 if (!m
->get_connection()->peer_is_mon()) {
8977 dout(0) << "require_mon_peer received from non-mon "
8978 << m
->get_connection()->get_peer_addr()
8979 << " " << *m
<< dendl
;
8985 bool OSD::require_mon_or_mgr_peer(const Message
*m
)
8987 if (!m
->get_connection()->peer_is_mon() &&
8988 !m
->get_connection()->peer_is_mgr()) {
8989 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8990 << m
->get_connection()->get_peer_addr()
8991 << " " << *m
<< dendl
;
8997 bool OSD::require_osd_peer(const Message
*m
)
8999 if (!m
->get_connection()->peer_is_osd()) {
9000 dout(0) << "require_osd_peer received from non-osd "
9001 << m
->get_connection()->get_peer_addr()
9002 << " " << *m
<< dendl
;
9008 bool OSD::require_self_aliveness(const Message
*m
, epoch_t epoch
)
9010 epoch_t up_epoch
= service
.get_up_epoch();
9011 if (epoch
< up_epoch
) {
9012 dout(7) << "from pre-up epoch " << epoch
<< " < " << up_epoch
<< dendl
;
9017 dout(7) << "still in boot state, dropping message " << *m
<< dendl
;
9024 bool OSD::require_same_peer_instance(const Message
*m
, const OSDMapRef
& map
,
9025 bool is_fast_dispatch
)
9027 int from
= m
->get_source().num();
9029 if (map
->is_down(from
) ||
9030 (map
->get_cluster_addrs(from
) != m
->get_source_addrs())) {
9031 dout(5) << "from dead osd." << from
<< ", marking down, "
9032 << " msg was " << m
->get_source_inst().addr
9034 << (map
->is_up(from
) ?
9035 map
->get_cluster_addrs(from
) : entity_addrvec_t())
9037 ConnectionRef con
= m
->get_connection();
9039 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); s
) {
9040 if (!is_fast_dispatch
)
9041 s
->session_dispatch_lock
.lock();
9042 clear_session_waiting_on_map(s
);
9043 con
->set_priv(nullptr); // break ref <-> session cycle, if any
9045 if (!is_fast_dispatch
)
9046 s
->session_dispatch_lock
.unlock();
9055 * require that we have same (or newer) map, and that
9056 * the source is the pg primary.
9058 bool OSD::require_same_or_newer_map(OpRequestRef
& op
, epoch_t epoch
,
9059 bool is_fast_dispatch
)
9061 const Message
*m
= op
->get_req();
9062 const auto osdmap
= get_osdmap();
9063 dout(15) << "require_same_or_newer_map " << epoch
9064 << " (i am " << osdmap
->get_epoch() << ") " << m
<< dendl
;
9066 ceph_assert(ceph_mutex_is_locked(osd_lock
));
9068 // do they have a newer map?
9069 if (epoch
> osdmap
->get_epoch()) {
9070 dout(7) << "waiting for newer map epoch " << epoch
9071 << " > my " << osdmap
->get_epoch() << " with " << m
<< dendl
;
9072 wait_for_new_map(op
);
9076 if (!require_self_aliveness(op
->get_req(), epoch
)) {
9080 // ok, our map is same or newer.. do they still exist?
9081 if (m
->get_connection()->get_messenger() == cluster_messenger
&&
9082 !require_same_peer_instance(op
->get_req(), osdmap
, is_fast_dispatch
)) {
9093 // ----------------------------------------
9096 void OSD::split_pgs(
9098 const set
<spg_t
> &childpgids
, set
<PGRef
> *out_pgs
,
9103 unsigned pg_num
= nextmap
->get_pg_num(parent
->pg_id
.pool());
9104 parent
->update_snap_mapper_bits(parent
->get_pgid().get_split_bits(pg_num
));
9106 vector
<object_stat_sum_t
> updated_stats
;
9107 parent
->start_split_stats(childpgids
, &updated_stats
);
9109 vector
<object_stat_sum_t
>::iterator stat_iter
= updated_stats
.begin();
9110 for (set
<spg_t
>::const_iterator i
= childpgids
.begin();
9111 i
!= childpgids
.end();
9113 ceph_assert(stat_iter
!= updated_stats
.end());
9114 dout(10) << __func__
<< " splitting " << *parent
<< " into " << *i
<< dendl
;
9115 PG
* child
= _make_pg(nextmap
, *i
);
9117 out_pgs
->insert(child
);
9118 child
->ch
= store
->create_new_collection(child
->coll
);
9121 uint32_t shard_index
= i
->hash_to_shard(shards
.size());
9122 assert(NULL
!= shards
[shard_index
]);
9123 store
->set_collection_commit_queue(child
->coll
, &(shards
[shard_index
]->context_queue
));
9126 unsigned split_bits
= i
->get_split_bits(pg_num
);
9127 dout(10) << " pg_num is " << pg_num
9128 << ", m_seed " << i
->ps()
9129 << ", split_bits is " << split_bits
<< dendl
;
9130 parent
->split_colls(
9134 &child
->get_pool().info
,
9141 child
->init_collection_pool_opts();
9143 child
->finish_split_stats(*stat_iter
, rctx
.transaction
);
9146 ceph_assert(stat_iter
!= updated_stats
.end());
9147 parent
->finish_split_stats(*stat_iter
, rctx
.transaction
);
9153 void OSD::handle_pg_create(OpRequestRef op
)
9155 // NOTE: this can be removed in P release (mimic is the last version to
9156 // send MOSDPGCreate messages).
9158 auto m
= op
->get_req
<MOSDPGCreate
>();
9159 ceph_assert(m
->get_type() == MSG_OSD_PG_CREATE
);
9161 dout(10) << "handle_pg_create " << *m
<< dendl
;
9163 if (!require_mon_peer(op
->get_req())) {
9167 if (!require_same_or_newer_map(op
, m
->epoch
, false))
9172 const auto osdmap
= get_osdmap();
9173 map
<pg_t
,utime_t
>::const_iterator ci
= m
->ctimes
.begin();
9174 for (map
<pg_t
,pg_create_t
>::const_iterator p
= m
->mkpg
.begin();
9177 ceph_assert(ci
!= m
->ctimes
.end() && ci
->first
== p
->first
);
9178 epoch_t created
= p
->second
.created
;
9179 if (p
->second
.split_bits
) // Skip split pgs
9183 if (!osdmap
->have_pg_pool(on
.pool())) {
9184 dout(20) << "ignoring pg on deleted pool " << on
<< dendl
;
9188 dout(20) << "mkpg " << on
<< " e" << created
<< "@" << ci
->second
<< dendl
;
9191 bool mapped
= osdmap
->get_primary_shard(on
, &pgid
);
9192 ceph_assert(mapped
);
9194 // is it still ours?
9195 vector
<int> up
, acting
;
9196 int up_primary
= -1;
9197 int acting_primary
= -1;
9198 osdmap
->pg_to_up_acting_osds(on
, &up
, &up_primary
, &acting
, &acting_primary
);
9199 int role
= osdmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
9201 if (acting_primary
!= whoami
) {
9202 dout(10) << "mkpg " << on
<< " not acting_primary (" << acting_primary
9203 << "), my role=" << role
<< ", skipping" << dendl
;
9209 pg_history_t history
;
9210 build_initial_pg_history(pgid
, created
, ci
->second
, &history
, &pi
);
9212 // The mon won't resend unless the primary changed, so we ignore
9213 // same_interval_since. We'll pass this history with the current
9214 // epoch as the event.
9215 if (history
.same_primary_since
> m
->epoch
) {
9216 dout(10) << __func__
<< ": got obsolete pg create on pgid "
9217 << pgid
<< " from epoch " << m
->epoch
9218 << ", primary changed in " << history
.same_primary_since
9222 enqueue_peering_evt(
9225 std::make_shared
<PGPeeringEvent
>(
9226 osdmap
->get_epoch(),
9227 osdmap
->get_epoch(),
9232 osdmap
->get_epoch(),
9240 std::lock_guard
l(pending_creates_lock
);
9241 if (pending_creates_from_mon
== 0) {
9242 last_pg_create_epoch
= m
->epoch
;
9246 maybe_update_heartbeat_peers();
9250 // ----------------------------------------
9251 // peering and recovery
9253 void OSD::dispatch_context(PeeringCtx
&ctx
, PG
*pg
, OSDMapRef curmap
,
9254 ThreadPool::TPHandle
*handle
)
9256 if (!service
.get_osdmap()->is_up(whoami
)) {
9257 dout(20) << __func__
<< " not up in osdmap" << dendl
;
9258 } else if (!is_active()) {
9259 dout(20) << __func__
<< " not active" << dendl
;
9261 for (auto& [osd
, ls
] : ctx
.message_map
) {
9262 if (!curmap
->is_up(osd
)) {
9263 dout(20) << __func__
<< " skipping down osd." << osd
<< dendl
;
9266 ConnectionRef con
= service
.get_con_osd_cluster(
9267 osd
, curmap
->get_epoch());
9269 dout(20) << __func__
<< " skipping osd." << osd
<< " (NULL con)"
9273 service
.maybe_share_map(con
.get(), curmap
);
9275 con
->send_message2(m
);
9280 if ((!ctx
.transaction
.empty() || ctx
.transaction
.has_contexts()) && pg
) {
9281 int tr
= store
->queue_transaction(
9283 std::move(ctx
.transaction
), TrackedOpRef(),
9285 ceph_assert(tr
== 0);
9289 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9291 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9292 if (!require_mon_peer(m
)) {
9296 for (auto& p
: m
->pgs
) {
9297 spg_t pgid
= p
.first
;
9298 epoch_t created
= p
.second
.first
;
9299 utime_t created_stamp
= p
.second
.second
;
9300 auto q
= m
->pg_extra
.find(pgid
);
9301 if (q
== m
->pg_extra
.end()) {
9302 dout(20) << __func__
<< " " << pgid
<< " e" << created
9303 << "@" << created_stamp
9304 << " (no history or past_intervals)" << dendl
;
9305 // pre-octopus ... no pg history. this can be removed in Q release.
9306 enqueue_peering_evt(
9309 std::make_shared
<PGPeeringEvent
>(
9317 pg_history_t(created
, created_stamp
),
9322 dout(20) << __func__
<< " " << pgid
<< " e" << created
9323 << "@" << created_stamp
9324 << " history " << q
->second
.first
9325 << " pi " << q
->second
.second
<< dendl
;
9326 if (!q
->second
.second
.empty() &&
9327 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9328 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9329 << " and unmatched past_intervals " << q
->second
.second
9330 << " (history " << q
->second
.first
<< ")";
9332 enqueue_peering_evt(
9335 std::make_shared
<PGPeeringEvent
>(
9352 std::lock_guard
l(pending_creates_lock
);
9353 if (pending_creates_from_mon
== 0) {
9354 last_pg_create_epoch
= m
->epoch
;
9361 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9363 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9364 if (!require_osd_peer(m
)) {
9368 int from
= m
->get_source().num();
9369 for (auto& p
: m
->get_pg_list()) {
9370 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9371 enqueue_peering_evt(
9374 std::make_shared
<PGPeeringEvent
>(
9378 pgid
, pg_shard_t(from
, p
.from
),
9380 m
->get_connection()->get_features()),
9393 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9395 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9396 if (!require_osd_peer(m
)) {
9400 int from
= m
->get_source().num();
9401 for (auto& p
: m
->pg_list
) {
9402 enqueue_peering_evt(
9403 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9405 std::make_shared
<PGPeeringEvent
>(
9406 p
.epoch_sent
, p
.query_epoch
,
9408 pg_shard_t(from
, p
.from
),
9416 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9418 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9419 if (!require_osd_peer(m
)) {
9423 for (auto& pgid
: m
->pg_list
) {
9424 enqueue_peering_evt(
9427 std::make_shared
<PGPeeringEvent
>(
9428 m
->get_epoch(), m
->get_epoch(),
9429 PeeringState::DeleteStart())));
9434 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9436 dout(10) << __func__
<< " " << *m
<< dendl
;
9437 if (!require_mon_or_mgr_peer(m
)) {
9441 epoch_t epoch
= get_osdmap_epoch();
9442 for (auto pgid
: m
->forced_pgs
) {
9443 if (m
->options
& OFR_BACKFILL
) {
9444 if (m
->options
& OFR_CANCEL
) {
9445 enqueue_peering_evt(
9448 std::make_shared
<PGPeeringEvent
>(
9450 PeeringState::UnsetForceBackfill())));
9452 enqueue_peering_evt(
9455 std::make_shared
<PGPeeringEvent
>(
9457 PeeringState::SetForceBackfill())));
9459 } else if (m
->options
& OFR_RECOVERY
) {
9460 if (m
->options
& OFR_CANCEL
) {
9461 enqueue_peering_evt(
9464 std::make_shared
<PGPeeringEvent
>(
9466 PeeringState::UnsetForceRecovery())));
9468 enqueue_peering_evt(
9471 std::make_shared
<PGPeeringEvent
>(
9473 PeeringState::SetForceRecovery())));
9480 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9482 spg_t pgid
= q
.pgid
;
9483 dout(10) << __func__
<< " " << pgid
<< dendl
;
9485 OSDMapRef osdmap
= get_osdmap();
9486 if (!osdmap
->have_pg_pool(pgid
.pool()))
9489 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9490 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9491 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9494 if (q
.query
.type
== pg_query_t::LOG
||
9495 q
.query
.type
== pg_query_t::FULLLOG
) {
9497 q
.query
.from
, q
.query
.to
,
9498 osdmap
->get_epoch(), empty
,
9499 q
.query
.epoch_sent
);
9501 pg_notify_t notify
{q
.query
.from
, q
.query
.to
,
9503 osdmap
->get_epoch(),
9506 m
= new MOSDPGNotify2(spg_t
{pgid
.pgid
, q
.query
.from
},
9509 service
.maybe_share_map(con
.get(), osdmap
);
9510 con
->send_message(m
);
9514 void OSDService::queue_check_readable(spg_t spgid
,
9516 ceph::signedspan delay
)
9518 if (delay
== ceph::signedspan::zero()) {
9519 osd
->enqueue_peering_evt(
9522 std::make_shared
<PGPeeringEvent
>(
9524 PeeringState::CheckReadable())));
9526 mono_timer
.add_event(
9528 [this, spgid
, lpr
]() {
9529 queue_check_readable(spgid
, lpr
);
9535 // =========================================================
9538 void OSDService::_maybe_queue_recovery() {
9539 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9540 uint64_t available_pushes
;
9541 while (!awaiting_throttle
.empty() &&
9542 _recover_now(&available_pushes
)) {
9543 uint64_t to_start
= std::min(
9545 cct
->_conf
->osd_recovery_max_single_start
);
9546 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9547 awaiting_throttle
.pop_front();
9548 dout(10) << __func__
<< " starting " << to_start
9549 << ", recovery_ops_reserved " << recovery_ops_reserved
9550 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9551 recovery_ops_reserved
+= to_start
;
9555 bool OSDService::_recover_now(uint64_t *available_pushes
)
9557 if (available_pushes
)
9558 *available_pushes
= 0;
9560 if (ceph_clock_now() < defer_recovery_until
) {
9561 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9565 if (recovery_paused
) {
9566 dout(15) << __func__
<< " paused" << dendl
;
9570 uint64_t max
= osd
->get_recovery_max_active();
9571 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9572 dout(15) << __func__
<< " active " << recovery_ops_active
9573 << " + reserved " << recovery_ops_reserved
9574 << " >= max " << max
<< dendl
;
9578 if (available_pushes
)
9579 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9584 unsigned OSDService::get_target_pg_log_entries() const
9586 auto num_pgs
= osd
->get_num_pgs();
9587 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9588 if (num_pgs
> 0 && target
> 0) {
9589 // target an even spread of our budgeted log entries across all
9590 // PGs. note that while we only get to control the entry count
9591 // for primary PGs, we'll normally be responsible for a mix of
9592 // primary and replica PGs (for the same pool(s) even), so this
9594 return std::max
<unsigned>(
9595 std::min
<unsigned>(target
/ num_pgs
,
9596 cct
->_conf
->osd_max_pg_log_entries
),
9597 cct
->_conf
->osd_min_pg_log_entries
);
9599 // fall back to a per-pg value.
9600 return cct
->_conf
->osd_min_pg_log_entries
;
9604 void OSD::do_recovery(
9605 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9606 ThreadPool::TPHandle
&handle
)
9608 uint64_t started
= 0;
9611 * When the value of osd_recovery_sleep is set greater than zero, recovery
9612 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9613 * recovery event's schedule time. This is done by adding a
9614 * recovery_requeue_callback event, which re-queues the recovery op using
9615 * queue_recovery_after_sleep.
9617 float recovery_sleep
= get_osd_recovery_sleep();
9619 std::lock_guard
l(service
.sleep_lock
);
9620 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9622 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9623 dout(20) << "do_recovery wake up at "
9625 << ", re-queuing recovery" << dendl
;
9626 std::lock_guard
l(service
.sleep_lock
);
9627 service
.recovery_needs_sleep
= false;
9628 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9631 // This is true for the first recovery op and when the previous recovery op
9632 // has been scheduled in the past. The next recovery op is scheduled after
9633 // completing the sleep from now.
9635 if (auto now
= ceph::real_clock::now();
9636 service
.recovery_schedule_time
< now
) {
9637 service
.recovery_schedule_time
= now
;
9639 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9640 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9641 recovery_requeue_callback
);
9642 dout(20) << "Recovery event scheduled at "
9643 << service
.recovery_schedule_time
<< dendl
;
9650 std::lock_guard
l(service
.sleep_lock
);
9651 service
.recovery_needs_sleep
= true;
9654 if (pg
->pg_has_reset_since(queued
)) {
9658 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9659 #ifdef DEBUG_RECOVERY_OIDS
9660 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9663 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9664 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9665 << " on " << *pg
<< dendl
;
9669 rctx
.handle
= &handle
;
9670 pg
->find_unfound(queued
, rctx
);
9671 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9676 ceph_assert(started
<= reserved_pushes
);
9677 service
.release_reserved_pushes(reserved_pushes
);
9680 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9682 std::lock_guard
l(recovery_lock
);
9683 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9684 << " (" << recovery_ops_active
<< "/"
9685 << osd
->get_recovery_max_active() << " rops)"
9687 recovery_ops_active
++;
9689 #ifdef DEBUG_RECOVERY_OIDS
9690 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9691 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9692 recovery_oids
[pg
->pg_id
].insert(soid
);
9696 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9698 std::lock_guard
l(recovery_lock
);
9699 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9700 << " dequeue=" << dequeue
9701 << " (" << recovery_ops_active
<< "/"
9702 << osd
->get_recovery_max_active() << " rops)"
9706 ceph_assert(recovery_ops_active
> 0);
9707 recovery_ops_active
--;
9709 #ifdef DEBUG_RECOVERY_OIDS
9710 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9711 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9712 recovery_oids
[pg
->pg_id
].erase(soid
);
9715 _maybe_queue_recovery();
9718 bool OSDService::is_recovery_active()
9720 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9723 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9726 void OSDService::release_reserved_pushes(uint64_t pushes
)
9728 std::lock_guard
l(recovery_lock
);
9729 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9730 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9732 ceph_assert(recovery_ops_reserved
>= pushes
);
9733 recovery_ops_reserved
-= pushes
;
9734 _maybe_queue_recovery();
9737 // =========================================================
9740 bool OSD::op_is_discardable(const MOSDOp
*op
)
9742 // drop client request if they are not connected and can't get the
9744 if (!op
->get_connection()->is_connected()) {
9750 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9752 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9753 const utime_t latency
= ceph_clock_now() - stamp
;
9754 const unsigned priority
= op
->get_req()->get_priority();
9755 const int cost
= op
->get_req()->get_cost();
9756 const uint64_t owner
= op
->get_req()->get_source().num();
9757 const int type
= op
->get_req()->get_type();
9759 dout(15) << "enqueue_op " << op
<< " prio " << priority
9762 << " latency " << latency
9763 << " epoch " << epoch
9764 << " " << *(op
->get_req()) << dendl
;
9765 op
->osd_trace
.event("enqueue op");
9766 op
->osd_trace
.keyval("priority", priority
);
9767 op
->osd_trace
.keyval("cost", cost
);
9769 auto enqueue_span
= tracing::osd::tracer
.add_span(__func__
, op
->osd_parent_span
);
9770 enqueue_span
->AddEvent(__func__
, {
9771 {"priority", priority
},
9778 op
->mark_queued_for_pg();
9779 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9780 if (type
== MSG_OSD_PG_PUSH
||
9781 type
== MSG_OSD_PG_PUSH_REPLY
) {
9784 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGRecoveryMsg(pg
, std::move(op
))),
9785 cost
, priority
, stamp
, owner
, epoch
));
9789 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9790 cost
, priority
, stamp
, owner
, epoch
));
9794 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9796 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9799 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9801 cct
->_conf
->osd_peering_op_priority
,
9804 evt
->get_epoch_sent()));
9808 * NOTE: dequeue called in worker thread, with pg lock
9810 void OSD::dequeue_op(
9811 PGRef pg
, OpRequestRef op
,
9812 ThreadPool::TPHandle
&handle
)
9814 const Message
*m
= op
->get_req();
9817 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9819 utime_t now
= ceph_clock_now();
9820 op
->set_dequeued_time(now
);
9822 utime_t latency
= now
- m
->get_recv_stamp();
9823 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9824 << " cost " << m
->get_cost()
9825 << " latency " << latency
9827 << " pg " << *pg
<< dendl
;
9829 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9831 service
.maybe_share_map(m
->get_connection().get(),
9835 if (pg
->is_deleting())
9838 op
->mark_reached_pg();
9839 op
->osd_trace
.event("dequeue_op");
9841 pg
->do_request(op
, handle
);
9844 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9845 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9849 void OSD::dequeue_peering_evt(
9852 PGPeeringEventRef evt
,
9853 ThreadPool::TPHandle
& handle
)
9855 auto curmap
= sdata
->get_osdmap();
9856 bool need_up_thru
= false;
9857 epoch_t same_interval_since
= 0;
9859 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9860 handle_pg_query_nopg(*q
);
9862 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9865 } else if (PeeringCtx rctx
;
9866 advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9867 pg
->do_peering_event(evt
, rctx
);
9868 if (pg
->is_deleted()) {
9872 dispatch_context(rctx
, pg
, curmap
, &handle
);
9873 need_up_thru
= pg
->get_need_up_thru();
9874 same_interval_since
= pg
->get_same_interval_since();
9879 queue_want_up_thru(same_interval_since
);
9882 service
.send_pg_temp();
9885 void OSD::dequeue_delete(
9889 ThreadPool::TPHandle
& handle
)
9891 dequeue_peering_evt(
9895 std::make_shared
<PGPeeringEvent
>(
9897 PeeringState::DeleteSome())),
9903 // --------------------------------
9905 const char** OSD::get_tracked_conf_keys() const
9907 static const char* KEYS
[] = {
9908 "osd_max_backfills",
9909 "osd_min_recovery_priority",
9910 "osd_max_trimming_pgs",
9911 "osd_op_complaint_time",
9912 "osd_op_log_threshold",
9913 "osd_op_history_size",
9914 "osd_op_history_duration",
9915 "osd_op_history_slow_op_size",
9916 "osd_op_history_slow_op_threshold",
9917 "osd_enable_op_tracker",
9918 "osd_map_cache_size",
9919 "osd_pg_epoch_max_lag_factor",
9920 "osd_pg_epoch_persisted_max_stale",
9921 "osd_recovery_sleep",
9922 "osd_recovery_sleep_hdd",
9923 "osd_recovery_sleep_ssd",
9924 "osd_recovery_sleep_hybrid",
9926 "osd_delete_sleep_hdd",
9927 "osd_delete_sleep_ssd",
9928 "osd_delete_sleep_hybrid",
9929 "osd_snap_trim_sleep",
9930 "osd_snap_trim_sleep_hdd",
9931 "osd_snap_trim_sleep_ssd",
9932 "osd_snap_trim_sleep_hybrid",
9934 "osd_recovery_max_active",
9935 "osd_recovery_max_active_hdd",
9936 "osd_recovery_max_active_ssd",
9937 // clog & admin clog
9940 "clog_to_syslog_facility",
9941 "clog_to_syslog_level",
9942 "osd_objectstore_fuse",
9944 "clog_to_graylog_host",
9945 "clog_to_graylog_port",
9948 "osd_recovery_delay_start",
9949 "osd_client_message_size_cap",
9950 "osd_client_message_cap",
9951 "osd_heartbeat_min_size",
9952 "osd_heartbeat_interval",
9953 "osd_object_clean_region_max_num_intervals",
9954 "osd_scrub_min_interval",
9955 "osd_scrub_max_interval",
9961 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9962 const std::set
<std::string
> &changed
)
9964 std::lock_guard l
{osd_lock
};
9966 if (changed
.count("osd_max_backfills") ||
9967 changed
.count("osd_delete_sleep") ||
9968 changed
.count("osd_delete_sleep_hdd") ||
9969 changed
.count("osd_delete_sleep_ssd") ||
9970 changed
.count("osd_delete_sleep_hybrid") ||
9971 changed
.count("osd_snap_trim_sleep") ||
9972 changed
.count("osd_snap_trim_sleep_hdd") ||
9973 changed
.count("osd_snap_trim_sleep_ssd") ||
9974 changed
.count("osd_snap_trim_sleep_hybrid") ||
9975 changed
.count("osd_scrub_sleep") ||
9976 changed
.count("osd_recovery_sleep") ||
9977 changed
.count("osd_recovery_sleep_hdd") ||
9978 changed
.count("osd_recovery_sleep_ssd") ||
9979 changed
.count("osd_recovery_sleep_hybrid") ||
9980 changed
.count("osd_recovery_max_active") ||
9981 changed
.count("osd_recovery_max_active_hdd") ||
9982 changed
.count("osd_recovery_max_active_ssd")) {
9983 if (!maybe_override_options_for_qos() &&
9984 changed
.count("osd_max_backfills")) {
9985 // Scheduler is not "mclock". Fallback to earlier behavior
9986 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9987 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
9990 if (changed
.count("osd_min_recovery_priority")) {
9991 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9992 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
9994 if (changed
.count("osd_max_trimming_pgs")) {
9995 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
9997 if (changed
.count("osd_op_complaint_time") ||
9998 changed
.count("osd_op_log_threshold")) {
9999 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10000 cct
->_conf
->osd_op_log_threshold
);
10002 if (changed
.count("osd_op_history_size") ||
10003 changed
.count("osd_op_history_duration")) {
10004 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10005 cct
->_conf
->osd_op_history_duration
);
10007 if (changed
.count("osd_op_history_slow_op_size") ||
10008 changed
.count("osd_op_history_slow_op_threshold")) {
10009 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10010 cct
->_conf
->osd_op_history_slow_op_threshold
);
10012 if (changed
.count("osd_enable_op_tracker")) {
10013 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
10015 if (changed
.count("osd_map_cache_size")) {
10016 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10017 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10018 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10020 if (changed
.count("clog_to_monitors") ||
10021 changed
.count("clog_to_syslog") ||
10022 changed
.count("clog_to_syslog_level") ||
10023 changed
.count("clog_to_syslog_facility") ||
10024 changed
.count("clog_to_graylog") ||
10025 changed
.count("clog_to_graylog_host") ||
10026 changed
.count("clog_to_graylog_port") ||
10027 changed
.count("host") ||
10028 changed
.count("fsid")) {
10029 update_log_config();
10031 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10032 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10033 "osd_pg_epoch_max_lag_factor");
10036 #ifdef HAVE_LIBFUSE
10037 if (changed
.count("osd_objectstore_fuse")) {
10039 enable_disable_fuse(false);
10044 if (changed
.count("osd_recovery_delay_start")) {
10045 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10046 service
.kick_recovery_queue();
10049 if (changed
.count("osd_client_message_cap")) {
10050 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10051 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10052 if (pol
.throttler_messages
) {
10053 pol
.throttler_messages
->reset_max(newval
);
10056 if (changed
.count("osd_client_message_size_cap")) {
10057 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10058 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10059 if (pol
.throttler_bytes
) {
10060 pol
.throttler_bytes
->reset_max(newval
);
10063 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
10064 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
10067 if (changed
.count("osd_scrub_min_interval") ||
10068 changed
.count("osd_scrub_max_interval")) {
10069 resched_all_scrubs();
10070 dout(0) << __func__
<< ": scrub interval change" << dendl
;
10073 if (changed
.count("osd_asio_thread_count")) {
10074 service
.poolctx
.stop();
10075 service
.poolctx
.start(conf
.get_val
<std::uint64_t>("osd_asio_thread_count"));
10079 void OSD::maybe_override_max_osd_capacity_for_qos()
10081 // If the scheduler enabled is mclock, override the default
10082 // osd capacity with the value obtained from running the
10083 // osd bench test. This is later used to setup mclock.
10084 if ((cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") &&
10085 (cct
->_conf
.get_val
<bool>("osd_mclock_skip_benchmark") == false) &&
10086 (!unsupported_objstore_for_qos())) {
10087 std::string max_capacity_iops_config
;
10088 bool force_run_benchmark
=
10089 cct
->_conf
.get_val
<bool>("osd_mclock_force_run_benchmark_on_init");
10091 if (store_is_rotational
) {
10092 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_hdd";
10094 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_ssd";
10097 if (!force_run_benchmark
) {
10098 double default_iops
= 0.0;
10100 // Get the current osd iops capacity
10101 double cur_iops
= cct
->_conf
.get_val
<double>(max_capacity_iops_config
);
10103 // Get the default max iops capacity
10104 auto val
= cct
->_conf
.get_val_default(max_capacity_iops_config
);
10105 if (!val
.has_value()) {
10106 derr
<< __func__
<< " Unable to determine default value of "
10107 << max_capacity_iops_config
<< dendl
;
10108 // Cannot determine default iops. Force a run of the OSD benchmark.
10109 force_run_benchmark
= true;
10112 default_iops
= std::stod(val
.value());
10115 // Determine if we really need to run the osd benchmark
10116 if (!force_run_benchmark
&& (default_iops
!= cur_iops
)) {
10117 dout(1) << __func__
<< std::fixed
<< std::setprecision(2)
10118 << " default_iops: " << default_iops
10119 << " cur_iops: " << cur_iops
10120 << ". Skip OSD benchmark test." << dendl
;
10125 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
10126 int64_t count
= 12288000; // Count of bytes to write
10127 int64_t bsize
= 4096; // Block size
10128 int64_t osize
= 4194304; // Object size
10129 int64_t onum
= 100; // Count of objects to write
10130 double elapsed
= 0.0; // Time taken to complete the test
10133 int ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
10136 << " osd bench err: " << ret
10137 << " osd bench errstr: " << ss
.str()
10142 double rate
= count
/ elapsed
;
10143 iops
= rate
/ bsize
;
10144 dout(1) << __func__
10145 << " osd bench result -"
10146 << std::fixed
<< std::setprecision(3)
10147 << " bandwidth (MiB/sec): " << rate
/ (1024 * 1024)
10148 << " iops: " << iops
10149 << " elapsed_sec: " << elapsed
10152 // Persist iops to the MON store
10153 ret
= mon_cmd_set_config(max_capacity_iops_config
, std::to_string(iops
));
10155 // Fallback to setting the config within the in-memory "values" map.
10156 cct
->_conf
.set_val(max_capacity_iops_config
, std::to_string(iops
));
10159 // Override the max osd capacity for all shards
10160 for (auto& shard
: shards
) {
10161 shard
->update_scheduler_config();
10166 bool OSD::maybe_override_options_for_qos()
10168 // If the scheduler enabled is mclock, override the recovery, backfill
10169 // and sleep options so that mclock can meet the QoS goals.
10170 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler" &&
10171 !unsupported_objstore_for_qos()) {
10172 dout(1) << __func__
10173 << ": Changing recovery/backfill/sleep settings for QoS" << dendl
;
10175 // Set high value for recovery max active
10176 uint32_t rec_max_active
= 1000;
10177 cct
->_conf
.set_val(
10178 "osd_recovery_max_active", std::to_string(rec_max_active
));
10179 cct
->_conf
.set_val(
10180 "osd_recovery_max_active_hdd", std::to_string(rec_max_active
));
10181 cct
->_conf
.set_val(
10182 "osd_recovery_max_active_ssd", std::to_string(rec_max_active
));
10184 // Set high value for osd_max_backfill
10185 uint32_t max_backfills
= 1000;
10186 cct
->_conf
.set_val("osd_max_backfills", std::to_string(max_backfills
));
10187 service
.local_reserver
.set_max(max_backfills
);
10188 service
.remote_reserver
.set_max(max_backfills
);
10190 // Disable recovery sleep
10191 cct
->_conf
.set_val("osd_recovery_sleep", std::to_string(0));
10192 cct
->_conf
.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10193 cct
->_conf
.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10194 cct
->_conf
.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10196 // Disable delete sleep
10197 cct
->_conf
.set_val("osd_delete_sleep", std::to_string(0));
10198 cct
->_conf
.set_val("osd_delete_sleep_hdd", std::to_string(0));
10199 cct
->_conf
.set_val("osd_delete_sleep_ssd", std::to_string(0));
10200 cct
->_conf
.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10202 // Disable snap trim sleep
10203 cct
->_conf
.set_val("osd_snap_trim_sleep", std::to_string(0));
10204 cct
->_conf
.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10205 cct
->_conf
.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10206 cct
->_conf
.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10208 // Disable scrub sleep
10209 cct
->_conf
.set_val("osd_scrub_sleep", std::to_string(0));
10215 int OSD::mon_cmd_set_config(const std::string
&key
, const std::string
&val
)
10219 "\"prefix\": \"config set\", "
10220 "\"who\": \"osd." + std::to_string(whoami
) + "\", "
10221 "\"name\": \"" + key
+ "\", "
10222 "\"value\": \"" + val
+ "\""
10225 vector
<std::string
> vcmd
{cmd
};
10229 monc
->start_mon_command(vcmd
, inbl
, nullptr, &outs
, &cond
);
10230 int r
= cond
.wait();
10232 derr
<< __func__
<< " Failed to set config key " << key
10233 << " err: " << cpp_strerror(r
)
10234 << " errstr: " << outs
<< dendl
;
10241 bool OSD::unsupported_objstore_for_qos()
10243 static const std::vector
<std::string
> unsupported_objstores
= { "filestore" };
10244 return std::find(unsupported_objstores
.begin(),
10245 unsupported_objstores
.end(),
10246 store
->get_type()) != unsupported_objstores
.end();
10249 void OSD::update_log_config()
10251 auto parsed_options
= clog
->parse_client_options(cct
);
10252 derr
<< "log_to_monitors " << parsed_options
.log_to_monitors
<< dendl
;
10255 void OSD::check_config()
10257 // some sanity checks
10258 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10259 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10260 << " is not > osd_pg_epoch_persisted_max_stale ("
10261 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10263 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
10264 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
10265 << cct
->_conf
->osd_object_clean_region_max_num_intervals
10270 // --------------------------------
10272 void OSD::get_latest_osdmap()
10274 dout(10) << __func__
<< " -- start" << dendl
;
10276 boost::system::error_code ec
;
10277 service
.objecter
->wait_for_latest_osdmap(ceph::async::use_blocked
[ec
]);
10279 dout(10) << __func__
<< " -- finish" << dendl
;
10282 // --------------------------------
10284 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
10285 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
10286 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
10287 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10289 std::list
<OSDPerfMetricQuery
> supported_queries
;
10290 for (auto &it
: queries
) {
10291 auto &query
= it
.first
;
10292 if (!query
.key_descriptor
.empty()) {
10293 supported_queries
.push_back(query
);
10296 if (supported_queries
.size() < queries
.size()) {
10297 dout(1) << queries
.size() - supported_queries
.size()
10298 << " unsupported queries" << dendl
;
10301 std::lock_guard locker
{m_perf_queries_lock
};
10302 m_perf_queries
= supported_queries
;
10303 m_perf_limits
= queries
;
10305 std::vector
<PGRef
> pgs
;
10307 for (auto& pg
: pgs
) {
10308 std::scoped_lock l
{*pg
};
10309 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10313 MetricPayload
OSD::get_perf_reports() {
10314 OSDMetricPayload payload
;
10315 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
10317 std::vector
<PGRef
> pgs
;
10319 DynamicPerfStats dps
;
10320 for (auto& pg
: pgs
) {
10321 // m_perf_queries can be modified only in set_perf_queries by mgr client
10322 // request, and it is protected by by mgr client's lock, which is held
10323 // when set_perf_queries/get_perf_reports are called, so we may not hold
10324 // m_perf_queries_lock here.
10325 DynamicPerfStats
pg_dps(m_perf_queries
);
10327 pg
->get_dynamic_perf_stats(&pg_dps
);
10331 dps
.add_to_reports(m_perf_limits
, &reports
);
10332 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
10337 // =============================================================
10339 #undef dout_context
10340 #define dout_context cct
10342 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10344 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10346 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10348 pg
->osd_shard
= this;
10349 pg
->pg_slot
= slot
;
10350 osd
->inc_num_pgs();
10352 slot
->epoch
= pg
->get_osdmap_epoch();
10353 pg_slots_by_epoch
.insert(*slot
);
10356 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10358 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10359 slot
->pg
->osd_shard
= nullptr;
10360 slot
->pg
->pg_slot
= nullptr;
10361 slot
->pg
= nullptr;
10362 osd
->dec_num_pgs();
10364 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10366 if (waiting_for_min_pg_epoch
) {
10367 min_pg_epoch_cond
.notify_all();
// Re-index `slot` in pg_slots_by_epoch after its PG advanced to epoch
// `e`: remove, update, re-insert, then notify min-epoch waiters since
// the set's minimum may have moved forward.
// NOTE(review): extraction dropped some lines (the `slot->epoch = e;`
// assignment between erase and insert, braces); verify upstream.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  // intrusive set: must erase before mutating the key and re-inserting
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
// Return the smallest osdmap epoch of any PG attached to this shard
// (begin() of the epoch-ordered intrusive set).
// NOTE(review): extraction dropped the return statements (empty-set and
// normal cases); only the lookup is visible here — verify upstream.
epoch_t OSDShard::get_min_pg_epoch()
  std::lock_guard l(shard_lock);
  auto p = pg_slots_by_epoch.begin();
  if (p == pg_slots_by_epoch.end()) {
// Block the caller until every PG on this shard has caught up to map
// epoch `need` (or the shard has no PGs). waiting_for_min_pg_epoch is
// a counter so _detach_pg/update_pg_epoch know to notify.
// NOTE(review): extraction dropped the predicate's return statements;
// verify against upstream before relying on this text.
void OSDShard::wait_min_pg_epoch(epoch_t need)
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // still behind: log which epoch we are stuck on
      dout(10) << need << " waiting on "
               << pg_slots_by_epoch.begin()->epoch << dendl;
  --waiting_for_min_pg_epoch;
// Return the largest epoch any queued peering item on this shard is
// waiting for (max over the last key of each slot's waiting_peering
// map, which is keyed by epoch).
// NOTE(review): extraction dropped the `r` initialization and the final
// return; verify against upstream.
epoch_t OSDShard::get_max_waiting_epoch()
  std::lock_guard l(shard_lock);
  for (auto& i : pg_slots) {
    if (!i.second->waiting_peering.empty()) {
      // rbegin()->first is the highest epoch keyed in this slot
      r = std::max(r, i.second->waiting_peering.rbegin()->first);
// Swap in a new osdmap for this shard and reconcile every pg slot with
// it: requeue peering items that are now runnable, drop waiting items
// for PGs that no longer map to this OSD (crediting their reserved
// pushes to *pushes_to_free), and prune empty slots. Finally wake
// worker threads if anything was requeued.
// NOTE(review): this extract is missing a number of original lines
// (several `continue`s, braces, a dout tail, the queued counter init);
// comments describe only the visible code — verify upstream.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
    // publish the new map under osdmap_lock so readers see a consistent ref
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
  // walk every slot and reconcile it with the new map
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // slot is parked until its split completes; leave it alone
      dout(20) << __func__ << " " << pgid
               << " waiting for split " << slot->waiting_for_split << dendl;
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge target epoch not reached yet; leave it alone
      dout(20) << __func__ << " " << pgid
               << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
        // earliest waiting peering item is now runnable; requeue the slot
        dout(20) << __func__ << " " << pgid
                 << " pending_peering first epoch " << first
                 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
        queued += _wake_pg_slot(pgid, slot);
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
        // pg still maps here: keep its waiters
        dout(20) << __func__ << " " << pgid << " maps to us, keeping"
        // pg no longer maps here: drop waiters whose epoch the new map covers
        while (!slot->waiting.empty() &&
               slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
          auto& qi = slot->waiting.front();
          dout(20) << __func__ << " " << pgid
                   << " waiting item " << qi
                   << " epoch " << qi.get_map_epoch()
                   << " <= " << new_osdmap->get_epoch()
                   << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
                   << ", dropping" << dendl;
          // return the dropped item's recovery-push reservation to the caller
          *pushes_to_free += qi.get_reserved_pushes();
          slot->waiting.pop_front();
    if (slot->waiting.empty() &&
        slot->num_running == 0 &&
        slot->waiting_for_split.empty() &&
      // nothing queued, running, or pending for this slot: prune it
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
    // wake worker(s) for any requeued work
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
    sdata_cond.notify_all();
// Requeue everything parked on a pg slot back onto the scheduler:
// to_process, waiting, and all waiting_peering lists are pushed to the
// front in reverse order (so their relative order is preserved), the
// lists are cleared, and requeue_seq is bumped so racing _process
// invocations can detect that the slot was shuffled under them.
// NOTE(review): extraction dropped some lines (the pgid parameter, the
// queued counter and its returns, loop increments, braces) — verify
// against upstream.
int OSDShard::_wake_pg_slot(
  OSDShardPGSlot *slot)
  dout(20) << __func__ << " " << pgid
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;
  // reverse iteration + enqueue_front preserves original FIFO order
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
    scheduler->enqueue_front(std::move(*i));
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
    scheduler->enqueue_front(std::move(*i));
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      scheduler->enqueue_front(std::move(*j));
  slot->waiting_peering.clear();
  // let racing _process calls know the slot contents were requeued
  ++slot->requeue_seq;
// For every pg slot on this shard, ask the OSD service which splits
// and merges occur between the shard's current map and as_of_osdmap.
// Slots with a live PG contribute to both split_pgs and merge_pgs;
// PG-less slots that are waiting for a split contribute splits only.
// NOTE(review): extraction dropped the `if (slot->pg)` guard and some
// braces around the branches below — verify against upstream.
void OSDShard::identify_splits_and_merges(
  const OSDMapRef& as_of_osdmap,
  set<pair<spg_t,epoch_t>> *split_pgs,
  set<pair<spg_t,epoch_t>> *merge_pgs)
  std::lock_guard l(shard_lock);
  if (shard_osdmap) {
    for (auto& i : pg_slots) {
      const spg_t& pgid = i.first;
      auto *slot = i.second.get();
        // slot has a pg: look for both splits and merges
        osd->service.identify_splits_and_merges(
          shard_osdmap, as_of_osdmap, pgid,
          split_pgs, merge_pgs);
      } else if (!slot->waiting_for_split.empty()) {
        // pg-less slot already waiting on a split: splits only
        osd->service.identify_splits_and_merges(
          shard_osdmap, as_of_osdmap, pgid,
          split_pgs, nullptr);
        dout(20) << __func__ << " slot " << pgid
                 << " has no pg and waiting_for_split " << dendl;
// Prime this shard's slots for the given split children. If the shard
// already has a newer osdmap than as_of_osdmap, also compute the
// grandchildren produced between the two maps and prime those too.
// Consumed entries are erased from *pgids by _prime_splits; leftovers
// belong to other shards.
// NOTE(review): extraction dropped some braces and a dout tail —
// verify against upstream.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
                            set<pair<spg_t,epoch_t>> *pgids)
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    // shard map is ahead: children may have split again in the interim
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
        as_of_osdmap, shard_osdmap, i.first,
        &newer_children, nullptr);
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
             << shard_osdmap->get_epoch() << ", new children " << newer_children
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
// Create (or update) pg slots for split children that hash to THIS
// shard, recording the split epoch in waiting_for_split so work is
// parked until the child PG is registered. Entries handled here are
// erased from *pgids; entries for other shards are left for the caller.
// NOTE(review): extraction dropped the emplace-success check, the
// existing-slot `find` into `q`, the `else ++p;` advance, and braces —
// verify against upstream.
void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
  dout(10) << *pgids << dendl;
  auto p = pgids->begin();
  while (p != pgids->end()) {
    unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
    if (shard_index == shard_id) {
      auto r = pg_slots.emplace(p->first, nullptr);
        // new slot: allocate it and park on the split epoch
        dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
        r.first->second = make_unique<OSDShardPGSlot>();
        r.first->second->waiting_for_split.insert(p->second);
        // slot already exists: just record the additional split epoch
        ceph_assert(q != pg_slots.end());
        dout(10) << "priming (existing) slot " << p->first << " e" << p->second
        q->second->waiting_for_split.insert(p->second);
      p = pgids->erase(p);
// Prepare this shard's slots for PGs participating in a merge at
// `epoch`. For participants without a live PG (and no conflicting
// pending split), instantiate an empty placeholder PG one epoch before
// the merge so PG::merge_from() has something to merge into; mark each
// slot with waiting_for_merge_epoch. Entries handled here are erased
// from *merge_pgs; others belong to different shards.
// NOTE(review): extraction dropped several lines (the `++p; continue;`
// for foreign shards, the emplace-success branch, `if (slot->pg)`
// guards, braces) — verify against upstream.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
                            set<pair<spg_t,epoch_t>> *merge_pgs)
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
           << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
      r.first->second = make_unique<OSDShardPGSlot>();
    slot = r.first->second.get();
      // participant already has a pg attached; nothing to fabricate
      dout(20) << __func__ << " have merge participant pg " << pgid
               << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
               *slot->waiting_for_split.begin() < epoch) {
      // a split must complete before this merge; leave the slot parked
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
               << " " << slot->waiting_for_split << dendl;
      dout(20) << __func__ << " creating empty merge participant " << pgid
               << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      // create the placeholder at epoch-1 so it predates the merge
      PGCreateInfo cinfo(pgid, epoch - 1,
                         history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
// Attach a freshly created split-child PG to its (pre-primed) slot,
// clear the split epoch it satisfies, and — once no further splits are
// pending — requeue the slot's parked work, enqueue a null peering
// event so the child advances to the latest osdmap, and wake a worker.
// NOTE(review): extraction dropped some lines (scope braces, the
// `epoch_t epoch` declaration, the NullEvt payload of the peering
// event) — verify against upstream.
void OSDShard::register_and_wake_split_child(PG *pg)
  dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
    std::lock_guard l(shard_lock);
    dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
             << slot->waiting_for_split << dendl;
    // the slot must have been primed but not yet populated
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);
    // this child satisfies the split recorded at its creation epoch
    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
      dout(10) << __func__ << " still waiting for split on "
               << slot->waiting_for_split << dendl;
  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
      std::make_shared<PGPeeringEvent>(
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
// Undo split priming: for every slot that is a child of `parent` under
// the old pg_num, requeue its parked work and delete the slot (the
// split will no longer happen on this OSD).
// NOTE(review): extraction dropped a dout tail and some braces —
// verify against upstream.
void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
  std::lock_guard l(shard_lock);
  vector<spg_t> to_delete;
  for (auto& i : pg_slots) {
    // a child maps back to `parent` via get_ancestor at the old pg_num
    if (i.first != parent &&
        i.first.get_ancestor(old_pg_num) == parent) {
      dout(10) << __func__ << " parent " << parent << " clearing " << i.first
      _wake_pg_slot(i.first, i.second.get());
      to_delete.push_back(i.first);
  // erase outside the loop so iteration above stays valid
  for (auto pgid : to_delete) {
    pg_slots.erase(pgid);
// Push updated configuration into the op scheduler, under shard_lock.
// NOTE(review): extraction dropped the surrounding braces.
void OSDShard::update_scheduler_config()
  std::lock_guard l(shard_lock);
  scheduler->update_configuration();
// Return the active op scheduler's printable type name (the scheduler
// streams its identity, e.g. used by _enqueue to detect mClock).
// NOTE(review): extraction dropped the surrounding braces.
std::string OSDShard::get_scheduler_type()
  std::ostringstream scheduler_type;
  scheduler_type << *scheduler;
  return scheduler_type.str();
// OSDShard constructor: derive lock names from the shard id, build the
// per-shard mutexes, instantiate the op scheduler based on the store's
// rotational property and type, and wire the context queue to the
// shard's wait lock/cond.
// NOTE(review): extraction dropped the parameter list (lines between
// the open paren and the initializer list) and member initializers for
// cct/osd/shard_id — verify against upstream.
OSDShard::OSDShard(
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(
      cct, osd->num_shards, osd->store->is_rotational(),
      osd->store->get_type())),
    context_queue(sdata_wait_lock, sdata_cond)
  dout(0) << "using op scheduler " << *scheduler << dendl;
10775 // =============================================================
10777 #undef dout_context
10778 #define dout_context osd->cct
10780 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
// Park a scheduler item on its pg slot: peering items go into
// waiting_peering keyed by the map epoch they need; everything else
// goes onto the plain waiting list. consume_map/_wake_pg_slot requeue
// them later.
// NOTE(review): extraction dropped the pgid parameter line and some
// braces — verify against upstream.
void OSD::ShardedOpWQ::_add_slot_waiter(
  OSDShardPGSlot *slot,
  OpSchedulerItem&& qi)
  if (qi.is_peering()) {
    dout(20) << __func__ << " " << pgid
             << " peering, item epoch is "
             << qi.get_map_epoch()
             << ", will wait on " << qi << dendl;
    // keyed by required epoch so consume_map can release in order
    slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
    dout(20) << __func__ << " " << pgid
             << " item epoch is "
             << qi.get_map_epoch()
             << ", will wait on " << qi << dendl;
    slot->waiting.push_back(std::move(qi));
10803 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread main loop body: pick the shard for this thread, wait
// for work (handling the empty-queue and future-scheduled cases),
// dequeue one OpSchedulerItem, stage it on the pg slot's to_process
// list, re-validate the slot after re-locking (pg removal / requeue
// races), decide whether to run, park, or drop the item based on the
// shard osdmap, then run it outside shard_lock and flush oncommit
// contexts. Only the lowest-indexed thread per shard drains the
// context_queue, to keep oncommit ordering.
// NOTE(review): this extract is missing many original lines (returns,
// braces, `goto`-style early exits, the tp_handle second interval, the
// pg lock/unlock calls, reqid declaration); comments describe only the
// visible code — verify against upstream before relying on this text.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  bool is_smallest_thread_index = thread_index < osd->num_shards;
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do: sleep on the shard's wait cond
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while idle
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
          !(is_smallest_thread_index && !sdata->context_queue.empty())) {
        // spurious wakeup with nothing to do
        sdata->shard_lock.unlock();
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
      // stop_waiting set: bail out without sleeping
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    // only the designated thread drains oncommit contexts
    sdata->context_queue.move_to(oncommits);
  WorkItem work_item;
  // loop until the scheduler hands us a real OpSchedulerItem
  while (!std::get_if<OpSchedulerItem>(&work_item)) {
    if (sdata->scheduler->empty()) {
      if (osd->is_stopping()) {
        sdata->shard_lock.unlock();
        for (auto c : oncommits) {
          dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
        return;    // OSD shutdown, discard.
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
    work_item = sdata->scheduler->dequeue();
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
        dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      return;    // OSD shutdown, discard.
    // If the work item is scheduled in the future, wait until
    // the time returned in the dequeue response before retrying.
    if (auto when_ready = std::get_if<double>(&work_item)) {
      if (is_smallest_thread_index) {
        // flush oncommits before sleeping so they are not delayed
        sdata->shard_lock.unlock();
        handle_oncommits(oncommits);
        sdata->shard_lock.lock();
      std::unique_lock wait_lock{sdata->sdata_wait_lock};
      auto future_time = ceph::real_clock::from_double(*when_ready);
      dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
      // Disable heartbeat timeout until we find a non-future work item to process.
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      ++sdata->waiting_threads;
      sdata->sdata_cond.wait_until(wait_lock, future_time);
      --sdata->waiting_threads;
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // Reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
      // Populate the oncommits list if there were any additions
      // to the context_queue while we were waiting
      if (is_smallest_thread_index) {
        sdata->context_queue.move_to(oncommits);
  // Access the stored item
  auto item = std::move(std::get<OpSchedulerItem>(work_item));
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
    return;    // OSD shutdown, discard.
  // stage the item on its pg slot (created on demand)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
    r.first->second = make_unique<OSDShardPGSlot>();
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
           << (r.second ? " (new)" : "")
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
           << " queued" << dendl;
  PGRef pg = slot->pg;
  // lock pg (if we have it)
  // note the requeue seq now...
  uint64_t requeue_seq = slot->requeue_seq;
  ++slot->num_running;
  sdata->shard_lock.unlock();
  osd->service.maybe_inject_dispatch_delay();
  osd->service.maybe_inject_dispatch_delay();
  sdata->shard_lock.lock();
  // re-validate the slot: it may have been pruned or shuffled while
  // we dropped shard_lock to take the pg lock
  auto q = sdata->pg_slots.find(token);
  if (q == sdata->pg_slots.end()) {
    // this can happen if we race with pg removal.
    dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
  slot = q->second.get();
  --slot->num_running;
  if (slot->to_process.empty()) {
    // raced with _wake_pg_slot or consume_map
    dout(20) << __func__ << " " << token
             << " nothing queued" << dendl;
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
  if (requeue_seq != slot->requeue_seq) {
    dout(20) << __func__ << " " << token
             << " requeue_seq " << slot->requeue_seq << " > our "
             << requeue_seq << ", we raced with _wake_pg_slot"
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
  if (slot->pg != pg) {
    // this can happen if we race with pg removal.
    dout(20) << __func__ << " slot " << token << " no longer attached to "
  dout(20) << __func__ << " " << token
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;
  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  // no pg attached: decide whether to run, park, create, or drop
  // should this pg shard exist on this osd in this (or a later) epoch?
  osdmap = sdata->shard_osdmap;
  const PGCreateInfo *create_info = qi.creates_pg();
  if (!slot->waiting_for_split.empty()) {
    dout(20) << __func__ << " " << token
             << " splitting " << slot->waiting_for_split << dendl;
    _add_slot_waiter(token, slot, std::move(qi));
  } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
    dout(20) << __func__ << " " << token
             << " map " << qi.get_map_epoch() << " > "
             << osdmap->get_epoch() << dendl;
    _add_slot_waiter(token, slot, std::move(qi));
  } else if (qi.is_peering()) {
    if (!qi.peering_requires_pg()) {
      // for pg-less events, we run them under the ordering lock, since
      // we don't have the pg lock to keep them ordered.
      qi.run(osd, sdata, pg, tp_handle);
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      if (create_info->by_mon &&
          osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
        dout(20) << __func__ << " " << token
                 << " no pg, no longer primary, ignoring mon create on "
        dout(20) << __func__ << " " << token
                 << " no pg, should create on " << qi << dendl;
        pg = osd->handle_pg_create_info(osdmap, create_info);
          // we created the pg! drop out and continue "normally"!
          sdata->_attach_pg(slot, pg.get());
          sdata->_wake_pg_slot(token, slot);
          // identify split children between create epoch and shard epoch.
          osd->service.identify_splits_and_merges(
            pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
          sdata->_prime_splits(&new_children);
          // distribute remaining split children to other shards below!
          dout(20) << __func__ << " ignored create on " << qi << dendl;
      dout(20) << __func__ << " " << token
               << " no pg, peering, !create, discarding " << qi << dendl;
      dout(20) << __func__ << " " << token
               << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
               << ", discarding " << qi
  } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
    dout(20) << __func__ << " " << token
             << " no pg, should exist e" << osdmap->get_epoch()
             << ", will wait on " << qi << dendl;
    _add_slot_waiter(token, slot, std::move(qi));
    dout(20) << __func__ << " " << token
             << " no pg, shouldn't exist e" << osdmap->get_epoch()
             << ", dropping " << qi << dendl;
    // share map with client?
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
                                   sdata->shard_osdmap,
                                   (*_op)->sent_epoch);
    // hand back any recovery-push reservations held by the dropped item
    unsigned pushes_to_free = qi.get_reserved_pushes();
    if (pushes_to_free > 0) {
      sdata->shard_lock.unlock();
      osd->service.release_reserved_pushes(pushes_to_free);
      handle_oncommits(oncommits);
  sdata->shard_lock.unlock();
  handle_oncommits(oncommits);
  if (qi.is_peering()) {
    // pg attached, but a peering item may still need a newer map
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
  sdata->shard_lock.unlock();
  if (!new_children.empty()) {
    // distribute split children created above to all shards
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    ceph_assert(new_children.empty());
  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
    reqid = (*_op)->get_reqid();
  tracepoint(osd, opwq_process_start, reqid.name._type,
             reqid.name._num, reqid.tid, reqid.inc);
  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  f->close_section();
  // run the item (outside shard_lock)
  qi.run(osd, sdata, pg, tp_handle);
  if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
    reqid = (*_op)->get_reqid();
  tracepoint(osd, opwq_process_finish, reqid.name._type,
             reqid.name._num, reqid.tid, reqid.inc);
  handle_oncommits(oncommits);
// Queue an item onto the scheduler of the shard its ordering token
// hashes to, then wake worker(s): notify_all if the queue was empty
// (any thread may be sleeping), notify_one if threads are parked
// waiting on future-scheduled items.
// NOTE(review): extraction dropped some lines (early return on fast
// shutdown, `bool empty` declaration, `if (empty)` guard, braces) —
// verify against upstream.
void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
  if (unlikely(m_fast_shutdown) ) {
    // stop enqueing when we are in the middle of a fast shutdown
  uint32_t shard_index =
    item.get_ordering_token().hash_to_shard(osd->shards.size());
  OSDShard* sdata = osd->shards[shard_index];
  assert (NULL != sdata);
  if (sdata->get_scheduler_type() == "mClockScheduler") {
    // tag the item so mClock can classify it for QoS
    item.maybe_set_is_qos_item();
  dout(20) << __func__ << " " << item << dendl;
    std::lock_guard l{sdata->shard_lock};
    empty = sdata->scheduler->empty();
    sdata->scheduler->enqueue(std::move(item));
    std::lock_guard l{sdata->sdata_wait_lock};
    sdata->sdata_cond.notify_all();
  } else if (sdata->waiting_threads) {
    sdata->sdata_cond.notify_one();
// Requeue an item at the FRONT of its shard's scheduler, preserving
// ordering against items a racing _process may have staged on the
// slot's to_process list: if such items exist, swap this item to the
// head of to_process and requeue the list's tail instead.
// NOTE(review): extraction dropped some lines (early return on fast
// shutdown, braces) — verify against upstream.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
  if (unlikely(m_fast_shutdown) ) {
    // stop enqueing when we are in the middle of a fast shutdown
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
             << " " << p->second->to_process.front()
             << " shuffled w/ " << item << dendl;
    dout(20) << __func__ << " " << item << dendl;
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
// Fast-shutdown path: set the flag that makes _enqueue/_enqueue_front
// no-ops, then drain every shard's scheduler, discarding the dequeued
// items.
// NOTE(review): extraction dropped some lines (work_count use/log and
// braces) — verify against upstream.
void OSD::ShardedOpWQ::stop_for_fast_shutdown()
  uint32_t shard_index = 0;
  m_fast_shutdown = true;
  for (; shard_index < osd->num_shards; shard_index++) {
    auto& sdata = osd->shards[shard_index];
    ceph_assert(sdata);
    sdata->shard_lock.lock();
    int work_count = 0;
    // drain and discard everything still queued on this shard
    while(! sdata->scheduler->empty() ) {
      auto work_item = sdata->scheduler->dequeue();
    sdata->shard_lock.unlock();
namespace ceph::osd_cmds {

// Handle the admin-socket "heap" command: validate that tcmalloc is in
// use, extract the sub-command (and optional value) from cmdmap, and
// forward it to the tcmalloc heap profiler. Errors are written to
// `erros`, profiler output to `outos`.
// NOTE(review): extraction dropped some lines (the `std::string cmd`
// and `val` declarations, an error return, the final return, braces) —
// verify against upstream.
int heap(CephContext& cct,
         const cmdmap_t& cmdmap,
         std::ostream& outos,
         std::ostream& erros)
  if (!ceph_using_tcmalloc()) {
    // heap profiling is a tcmalloc feature; bail out otherwise
    erros << "could not issue heap profiler command -- not using tcmalloc!";
    return -EOPNOTSUPP;
  if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
    erros << "unable to get value for command \"" << cmd << "\"";
  std::vector<std::string> cmd_vec;
  get_str_vec(cmd, cmd_vec);
  // optional extra argument for the heap sub-command
  if (cmd_getval(cmdmap, "value", val)) {
    cmd_vec.push_back(val);
  ceph_heap_profiler_handle_command(cmd_vec, outos);

} // namespace ceph::osd_cmds