1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
39 #include "osd/scrub_machine.h"
40 #include "osd/pg_scrubber.h"
42 #include "include/types.h"
43 #include "include/compat.h"
44 #include "include/random.h"
49 #include "osdc/Objecter.h"
51 #include "common/errno.h"
52 #include "common/ceph_argparse.h"
53 #include "common/ceph_releases.h"
54 #include "common/ceph_time.h"
55 #include "common/version.h"
56 #include "common/async/blocked_completion.h"
57 #include "common/pick_address.h"
58 #include "common/blkdev.h"
59 #include "common/numa.h"
61 #include "os/ObjectStore.h"
63 #include "os/FuseStore.h"
66 #include "PrimaryLogPG.h"
68 #include "msg/Messenger.h"
69 #include "msg/Message.h"
71 #include "mon/MonClient.h"
73 #include "messages/MLog.h"
75 #include "messages/MGenericMessage.h"
76 #include "messages/MOSDPing.h"
77 #include "messages/MOSDFailure.h"
78 #include "messages/MOSDMarkMeDown.h"
79 #include "messages/MOSDMarkMeDead.h"
80 #include "messages/MOSDFull.h"
81 #include "messages/MOSDOp.h"
82 #include "messages/MOSDOpReply.h"
83 #include "messages/MOSDBackoff.h"
84 #include "messages/MOSDBeacon.h"
85 #include "messages/MOSDRepOp.h"
86 #include "messages/MOSDRepOpReply.h"
87 #include "messages/MOSDBoot.h"
88 #include "messages/MOSDPGTemp.h"
89 #include "messages/MOSDPGReadyToMerge.h"
91 #include "messages/MOSDMap.h"
92 #include "messages/MMonGetOSDMap.h"
93 #include "messages/MOSDPGNotify.h"
94 #include "messages/MOSDPGNotify2.h"
95 #include "messages/MOSDPGQuery.h"
96 #include "messages/MOSDPGQuery2.h"
97 #include "messages/MOSDPGLog.h"
98 #include "messages/MOSDPGRemove.h"
99 #include "messages/MOSDPGInfo.h"
100 #include "messages/MOSDPGInfo2.h"
101 #include "messages/MOSDPGCreate.h"
102 #include "messages/MOSDPGCreate2.h"
103 #include "messages/MBackfillReserve.h"
104 #include "messages/MRecoveryReserve.h"
105 #include "messages/MOSDForceRecovery.h"
106 #include "messages/MOSDECSubOpWrite.h"
107 #include "messages/MOSDECSubOpWriteReply.h"
108 #include "messages/MOSDECSubOpRead.h"
109 #include "messages/MOSDECSubOpReadReply.h"
110 #include "messages/MOSDPGCreated.h"
111 #include "messages/MOSDPGUpdateLogMissing.h"
112 #include "messages/MOSDPGUpdateLogMissingReply.h"
114 #include "messages/MOSDPeeringOp.h"
116 #include "messages/MOSDAlive.h"
118 #include "messages/MOSDScrub.h"
119 #include "messages/MOSDScrub2.h"
120 #include "messages/MOSDRepScrub.h"
122 #include "messages/MCommand.h"
123 #include "messages/MCommandReply.h"
125 #include "messages/MPGStats.h"
127 #include "messages/MWatchNotify.h"
128 #include "messages/MOSDPGPush.h"
129 #include "messages/MOSDPGPushReply.h"
130 #include "messages/MOSDPGPull.h"
132 #include "messages/MMonGetPurgedSnaps.h"
133 #include "messages/MMonGetPurgedSnapsReply.h"
135 #include "common/perf_counters.h"
136 #include "common/Timer.h"
137 #include "common/LogClient.h"
138 #include "common/AsyncReserver.h"
139 #include "common/HeartbeatMap.h"
140 #include "common/admin_socket.h"
141 #include "common/ceph_context.h"
143 #include "global/signal_handler.h"
144 #include "global/pidfile.h"
146 #include "include/color.h"
147 #include "perfglue/cpu_profiler.h"
148 #include "perfglue/heap_profiler.h"
150 #include "osd/ClassHandler.h"
151 #include "osd/OpRequest.h"
153 #include "auth/AuthAuthorizeHandler.h"
154 #include "auth/RotatingKeyRing.h"
156 #include "objclass/objclass.h"
158 #include "common/cmdparse.h"
159 #include "include/str_list.h"
160 #include "include/util.h"
162 #include "include/ceph_assert.h"
163 #include "common/config.h"
164 #include "common/EventTrace.h"
166 #include "json_spirit/json_spirit_reader.h"
167 #include "json_spirit/json_spirit_writer.h"
170 #define TRACEPOINT_DEFINE
171 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #include "tracing/osd.h"
173 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
174 #undef TRACEPOINT_DEFINE
176 #define tracepoint(...)
179 #include "common/tracer.h"
182 #define dout_context cct
183 #define dout_subsys ceph_subsys_osd
185 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
189 using std::lock_guard
;
190 using std::make_pair
;
191 using std::make_tuple
;
192 using std::make_unique
;
195 using std::ostringstream
;
199 using std::stringstream
;
200 using std::to_string
;
201 using std::unique_ptr
;
204 using ceph::bufferlist
;
205 using ceph::bufferptr
;
208 using ceph::fixed_u_to_string
;
209 using ceph::Formatter
;
210 using ceph::heartbeat_handle_d
;
211 using ceph::make_mutex
;
213 using namespace ceph::osd::scheduler
;
214 using TOPNSPC::common::cmd_getval
;
// Build the prefix used by the dout_prefix macro above: every debug
// line from this file is tagged "osd.<whoami> <epoch> ".
216 static ostream
& _prefix(std::ostream
* _dout
, int whoami
, epoch_t epoch
) {
217 return *_dout
<< "osd." << whoami
<< " " << epoch
<< " ";
220 //Initial features in new superblock.
221 //Features here are also automatically upgraded
// Returns the CompatSet written into a freshly created OSD superblock.
// Only the "incompat" feature set is populated here; compat/ro_compat
// start empty. Compare with get_osd_compat_set() below, which adds
// features that may be enabled at runtime but not at mkfs time.
222 CompatSet
OSD::get_osd_initial_compat_set() {
223 CompatSet::FeatureSet ceph_osd_feature_compat
;
224 CompatSet::FeatureSet ceph_osd_feature_ro_compat
;
225 CompatSet::FeatureSet ceph_osd_feature_incompat
;
226 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
227 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO
);
228 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC
);
229 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC
);
230 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES
);
231 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL
);
232 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO
);
233 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO
);
234 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG
);
235 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER
);
236 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS
);
237 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA
);
238 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING
);
239 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO
);
240 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES
);
241 ceph_osd_feature_incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
);
242 return CompatSet(ceph_osd_feature_compat
, ceph_osd_feature_ro_compat
,
243 ceph_osd_feature_incompat
);
246 //Features are added here that this OSD supports.
// Returns the full set of features this OSD build supports: the
// initial superblock set plus features that can be turned on in code
// after mkfs (currently SHARDS).
247 CompatSet
OSD::get_osd_compat_set() {
248 CompatSet compat
= get_osd_initial_compat_set();
249 //Any features here can be set in code, but not in initial superblock
250 compat
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS
);
254 OSDService::OSDService(OSD
*osd
, ceph::async::io_context_pool
& poolctx
) :
257 whoami(osd
->whoami
), store(osd
->store
),
258 log_client(osd
->log_client
), clog(osd
->clog
),
259 pg_recovery_stats(osd
->pg_recovery_stats
),
260 cluster_messenger(osd
->cluster_messenger
),
261 client_messenger(osd
->client_messenger
),
263 recoverystate_perf(osd
->recoverystate_perf
),
265 osd_max_object_size(cct
->_conf
, "osd_max_object_size"),
266 osd_skip_data_digest(cct
->_conf
, "osd_skip_data_digest"),
267 publish_lock
{ceph::make_mutex("OSDService::publish_lock")},
268 pre_publish_lock
{ceph::make_mutex("OSDService::pre_publish_lock")},
272 agent_valid_iterator(false),
274 flush_mode_high_count(0),
277 agent_stop_flag(false),
278 agent_timer(osd
->client_messenger
->cct
, agent_timer_lock
),
279 last_recalibrate(ceph_clock_now()),
280 promote_max_objects(0),
281 promote_max_bytes(0),
283 objecter(make_unique
<Objecter
>(osd
->client_messenger
->cct
,
284 osd
->objecter_messenger
,
285 osd
->monc
, poolctx
)),
286 m_objecter_finishers(cct
->_conf
->osd_objecter_finishers
),
287 watch_timer(osd
->client_messenger
->cct
, watch_lock
),
289 recovery_request_timer(cct
, recovery_request_lock
, false),
290 sleep_timer(cct
, sleep_lock
, false),
291 reserver_finisher(cct
),
292 local_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
293 cct
->_conf
->osd_min_recovery_priority
),
294 remote_reserver(cct
, &reserver_finisher
, cct
->_conf
->osd_max_backfills
,
295 cct
->_conf
->osd_min_recovery_priority
),
296 snap_reserver(cct
, &reserver_finisher
,
297 cct
->_conf
->osd_max_trimming_pgs
),
298 recovery_ops_active(0),
299 recovery_ops_reserved(0),
300 recovery_paused(false),
301 map_cache(cct
, cct
->_conf
->osd_map_cache_size
),
302 map_bl_cache(cct
->_conf
->osd_map_cache_size
),
303 map_bl_inc_cache(cct
->_conf
->osd_map_cache_size
),
305 cur_ratio(0), physical_ratio(0),
306 boot_epoch(0), up_epoch(0), bind_epoch(0)
310 for (int i
= 0; i
< m_objecter_finishers
; i
++) {
312 str
<< "objecter-finisher-" << i
;
313 auto fin
= make_unique
<Finisher
>(osd
->client_messenger
->cct
, str
.str(), "finisher");
314 objecter_finishers
.push_back(std::move(fin
));
// Debug-tracking helper: bump the per-pgid reference count in
// pgid_tracker under pgid_lock. Paired with remove_pgid() below.
319 void OSDService::add_pgid(spg_t pgid
, PG
*pg
) {
320 std::lock_guard
l(pgid_lock
);
321 if (!pgid_tracker
.count(pgid
)) {
324 pgid_tracker
[pgid
]++;
// Drop one reference for pgid from the debug tracker; when the count
// reaches zero, forget the pgid entirely (both pgid_tracker and
// live_pgs entries are erased). Asserts the pgid is actually tracked.
326 void OSDService::remove_pgid(spg_t pgid
, PG
*pg
)
328 std::lock_guard
l(pgid_lock
);
329 ceph_assert(pgid_tracker
.count(pgid
));
330 ceph_assert(pgid_tracker
[pgid
] > 0);
331 pgid_tracker
[pgid
]--;
332 if (pgid_tracker
[pgid
] == 0) {
333 pgid_tracker
.erase(pgid
);
334 live_pgs
.erase(pgid
);
// Debugging aid: print every tracked pgid (with its refcount entry) to
// the error log, and ask each corresponding live PG to dump its own
// live reference ids. Takes pgid_lock for the duration.
337 void OSDService::dump_live_pgids()
339 std::lock_guard
l(pgid_lock
);
340 derr
<< "live pgids:" << dendl
;
341 for (map
<spg_t
, int>::const_iterator i
= pgid_tracker
.cbegin();
342 i
!= pgid_tracker
.cend();
344 derr
<< "\t" << *i
<< dendl
;
345 live_pgs
[i
->first
]->dump_live_ids();
// Monotonic "now" expressed as time elapsed since this OSD process
// started (mono clock minus the recorded startup_time).
351 ceph::signedspan
OSDService::get_mnow()
353 return ceph::mono_clock::now() - osd
->startup_time
;
356 void OSDService::identify_splits_and_merges(
360 set
<pair
<spg_t
,epoch_t
>> *split_children
,
361 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
363 if (!old_map
->have_pg_pool(pgid
.pool())) {
366 int old_pgnum
= old_map
->get_pg_num(pgid
.pool());
367 auto p
= osd
->pg_num_history
.pg_nums
.find(pgid
.pool());
368 if (p
== osd
->pg_num_history
.pg_nums
.end()) {
371 dout(20) << __func__
<< " " << pgid
<< " e" << old_map
->get_epoch()
372 << " to e" << new_map
->get_epoch()
373 << " pg_nums " << p
->second
<< dendl
;
375 queue
.push_back(pgid
);
377 while (!queue
.empty()) {
378 auto cur
= queue
.front();
381 unsigned pgnum
= old_pgnum
;
382 for (auto q
= p
->second
.lower_bound(old_map
->get_epoch());
383 q
!= p
->second
.end() &&
384 q
->first
<= new_map
->get_epoch();
386 if (pgnum
< q
->second
) {
388 if (cur
.ps() < pgnum
) {
390 if (cur
.is_split(pgnum
, q
->second
, &children
)) {
391 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
392 << " pg_num " << pgnum
<< " -> " << q
->second
393 << " children " << children
<< dendl
;
394 for (auto i
: children
) {
395 split_children
->insert(make_pair(i
, q
->first
));
400 } else if (cur
.ps() < q
->second
) {
401 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
402 << " pg_num " << pgnum
<< " -> " << q
->second
403 << " is a child" << dendl
;
404 // normally we'd capture this from the parent, but it's
405 // possible the parent doesn't exist yet (it will be
406 // fabricated to allow an intervening merge). note this PG
407 // as a split child here to be sure we catch it.
408 split_children
->insert(make_pair(cur
, q
->first
));
410 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
411 << " pg_num " << pgnum
<< " -> " << q
->second
412 << " is post-split, skipping" << dendl
;
414 } else if (merge_pgs
) {
416 if (cur
.ps() >= q
->second
) {
417 if (cur
.ps() < pgnum
) {
419 if (cur
.is_merge_source(pgnum
, q
->second
, &parent
)) {
421 parent
.is_split(q
->second
, pgnum
, &children
);
422 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
423 << " pg_num " << pgnum
<< " -> " << q
->second
424 << " is merge source, target " << parent
425 << ", source(s) " << children
<< dendl
;
426 merge_pgs
->insert(make_pair(parent
, q
->first
));
427 if (!did
.count(parent
)) {
428 // queue (and re-scan) parent in case it might not exist yet
429 // and there are some future splits pending on it
430 queue
.push_back(parent
);
432 for (auto c
: children
) {
433 merge_pgs
->insert(make_pair(c
, q
->first
));
439 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
440 << " pg_num " << pgnum
<< " -> " << q
->second
441 << " is beyond old pgnum, skipping" << dendl
;
445 if (cur
.is_split(q
->second
, pgnum
, &children
)) {
446 dout(20) << __func__
<< " " << cur
<< " e" << q
->first
447 << " pg_num " << pgnum
<< " -> " << q
->second
448 << " is merge target, source " << children
<< dendl
;
449 for (auto c
: children
) {
450 merge_pgs
->insert(make_pair(c
, q
->first
));
454 merge_pgs
->insert(make_pair(cur
, q
->first
));
// Thin forwarder: delegate heartbeat-peer recalculation to the OSD.
463 void OSDService::need_heartbeat_peer_update()
465 osd
->need_heartbeat_peer_update();
// Return the HeartbeatStamps object for the given peer osd id,
// creating (and growing the hb_stamps vector) on first use.
// Guarded by hb_stamp_lock.
468 HeartbeatStampsRef
OSDService::get_hb_stamps(unsigned peer
)
470 std::lock_guard
l(hb_stamp_lock
);
471 if (peer
>= hb_stamps
.size()) {
472 hb_stamps
.resize(peer
+ 1);
474 if (!hb_stamps
[peer
]) {
475 hb_stamps
[peer
] = ceph::make_ref
<HeartbeatStamps
>(peer
);
477 return hb_stamps
[peer
];
// Enqueue a peering event for spgid to renew its lease at the given
// epoch. NOTE(review): the PGPeeringEvent constructor arguments appear
// truncated in this extraction — confirm against the full source.
480 void OSDService::queue_renew_lease(epoch_t epoch
, spg_t spgid
)
482 osd
->enqueue_peering_evt(
485 std::make_shared
<PGPeeringEvent
>(
// First phase of shutdown: stop the timers (agent, sleep, recovery
// request), each under its own lock, so no new delayed work is queued
// while the rest of shutdown proceeds.
490 void OSDService::start_shutdown()
493 std::lock_guard
l(agent_timer_lock
);
494 agent_timer
.shutdown();
498 std::lock_guard
l(sleep_lock
);
499 sleep_timer
.shutdown();
503 std::lock_guard
l(recovery_request_lock
);
504 recovery_request_timer
.shutdown();
// Drain and stop the finisher used by the AsyncReservers; called
// during shutdown after reservations can no longer be granted.
508 void OSDService::shutdown_reserver()
510 reserver_finisher
.wait_for_empty();
511 reserver_finisher
.stop();
// Main OSDService teardown: suspend the mono timer, shut down the
// watch timer (under watch_lock), shut down the objecter and its
// finishers, and drop the published/next osdmap references.
514 void OSDService::shutdown()
516 mono_timer
.suspend();
519 std::lock_guard
l(watch_lock
);
520 watch_timer
.shutdown();
523 objecter
->shutdown();
524 for (auto& f
: objecter_finishers
) {
529 publish_map(OSDMapRef());
530 next_osdmap
= OSDMapRef();
// Bring up OSDService background machinery: start the reserver
// finisher and objecter finishers, initialize the objecter, spawn the
// tiering-agent thread, and honor osd_recovery_delay_start by
// deferring recovery if configured.
533 void OSDService::init()
535 reserver_finisher
.start();
536 for (auto& f
: objecter_finishers
) {
539 objecter
->set_client_incarnation(0);
541 // deprioritize objecter in daemonperf output
542 objecter
->get_logger()->set_prio_adjust(-3);
548 agent_thread
.create("osd_srv_agent");
550 if (cct
->_conf
->osd_recovery_delay_start
)
551 defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
// Late init step: start the objecter with the current osdmap.
554 void OSDService::final_init()
556 objecter
->start(osdmap
.get());
// Called when a new osdmap takes effect: under agent_lock, wake the
// tiering agent unless the map has NOTIERAGENT set (the full wake
// condition is partially lost in this extraction).
559 void OSDService::activate_map()
561 // wake/unwake the tiering agent
562 std::lock_guard l
{agent_lock
};
564 !osdmap
->test_flag(CEPH_OSDMAP_NOTIERAGENT
) &&
566 agent_cond
.notify_all();
// Ask the OSD to subscribe for osdmaps starting at epoch e
// (non-onetime subscription: second argument is false).
569 void OSDService::request_osdmap_update(epoch_t e
)
571 osd
->osdmap_subscribe(e
, false);
// Timer callback used by agent_entry(): when it fires, ask the PG to
// re-run agent_choose_mode so the tiering agent can retry work that
// was previously delayed.
575 class AgentTimeoutCB
: public Context
{
578 explicit AgentTimeoutCB(PGRef _pg
) : pg(_pg
) {}
579 void finish(int) override
{
580 pg
->agent_choose_mode_restart();
584 void OSDService::agent_entry()
586 dout(10) << __func__
<< " start" << dendl
;
587 std::unique_lock agent_locker
{agent_lock
};
589 while (!agent_stop_flag
) {
590 if (agent_queue
.empty()) {
591 dout(20) << __func__
<< " empty queue" << dendl
;
592 agent_cond
.wait(agent_locker
);
595 uint64_t level
= agent_queue
.rbegin()->first
;
596 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
598 << " tiers " << agent_queue
.size()
599 << ", top is " << level
600 << " with pgs " << top
.size()
601 << ", ops " << agent_ops
<< "/"
602 << cct
->_conf
->osd_agent_max_ops
603 << (agent_active
? " active" : " NOT ACTIVE")
605 dout(20) << __func__
<< " oids " << agent_oids
<< dendl
;
606 int max
= cct
->_conf
->osd_agent_max_ops
- agent_ops
;
607 int agent_flush_quota
= max
;
608 if (!flush_mode_high_count
)
609 agent_flush_quota
= cct
->_conf
->osd_agent_max_low_ops
- agent_ops
;
610 if (agent_flush_quota
<= 0 || top
.empty() || !agent_active
) {
611 agent_cond
.wait(agent_locker
);
615 if (!agent_valid_iterator
|| agent_queue_pos
== top
.end()) {
616 agent_queue_pos
= top
.begin();
617 agent_valid_iterator
= true;
619 PGRef pg
= *agent_queue_pos
;
620 dout(10) << "high_count " << flush_mode_high_count
621 << " agent_ops " << agent_ops
622 << " flush_quota " << agent_flush_quota
<< dendl
;
623 agent_locker
.unlock();
624 if (!pg
->agent_work(max
, agent_flush_quota
)) {
625 dout(10) << __func__
<< " " << pg
->pg_id
626 << " no agent_work, delay for " << cct
->_conf
->osd_agent_delay_time
627 << " seconds" << dendl
;
629 logger
->inc(l_osd_tier_delay
);
630 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
631 std::lock_guard timer_locker
{agent_timer_lock
};
632 Context
*cb
= new AgentTimeoutCB(pg
);
633 agent_timer
.add_event_after(cct
->_conf
->osd_agent_delay_time
, cb
);
637 dout(10) << __func__
<< " finish" << dendl
;
// Stop the tiering agent thread: verify no ops remain in flight and
// the agent queue is empty (aborts loudly otherwise), then set the
// stop flag and wake the agent so it can exit its loop.
640 void OSDService::agent_stop()
643 std::lock_guard
l(agent_lock
);
645 // By this time all ops should be cancelled
646 ceph_assert(agent_ops
== 0);
647 // By this time all PGs are shutdown and dequeued
648 if (!agent_queue
.empty()) {
649 set
<PGRef
>& top
= agent_queue
.rbegin()->second
;
650 derr
<< "agent queue not empty, for example " << (*top
.begin())->get_pgid() << dendl
;
651 ceph_abort_msg("agent queue not empty");
654 agent_stop_flag
= true;
655 agent_cond
.notify_all();
660 // -------------------------------------
662 void OSDService::promote_throttle_recalibrate()
664 utime_t now
= ceph_clock_now();
665 double dur
= now
- last_recalibrate
;
666 last_recalibrate
= now
;
667 unsigned prob
= promote_probability_millis
;
669 uint64_t target_obj_sec
= cct
->_conf
->osd_tier_promote_max_objects_sec
;
670 uint64_t target_bytes_sec
= cct
->_conf
->osd_tier_promote_max_bytes_sec
;
672 unsigned min_prob
= 1;
674 uint64_t attempts
, obj
, bytes
;
675 promote_counter
.sample_and_attenuate(&attempts
, &obj
, &bytes
);
676 dout(10) << __func__
<< " " << attempts
<< " attempts, promoted "
677 << obj
<< " objects and " << byte_u_t(bytes
) << "; target "
678 << target_obj_sec
<< " obj/sec or "
679 << byte_u_t(target_bytes_sec
) << "/sec"
682 // calculate what the probability *should* be, given the targets
684 if (attempts
&& dur
> 0) {
685 uint64_t avg_size
= 1;
687 avg_size
= std::max
<uint64_t>(bytes
/ obj
, 1);
688 unsigned po
= (double)target_obj_sec
* dur
* 1000.0 / (double)attempts
;
689 unsigned pb
= (double)target_bytes_sec
/ (double)avg_size
* dur
* 1000.0
691 dout(20) << __func__
<< " po " << po
<< " pb " << pb
<< " avg_size "
692 << avg_size
<< dendl
;
693 if (target_obj_sec
&& target_bytes_sec
)
694 new_prob
= std::min(po
, pb
);
695 else if (target_obj_sec
)
697 else if (target_bytes_sec
)
704 dout(20) << __func__
<< " new_prob " << new_prob
<< dendl
;
706 // correct for persistent skew between target rate and actual rate, adjust
709 if (attempts
&& obj
) {
710 actual
= obj
* 1000 / attempts
;
711 ratio
= (double)actual
/ (double)prob
;
712 new_prob
= (double)new_prob
/ ratio
;
714 new_prob
= std::max(new_prob
, min_prob
);
715 new_prob
= std::min(new_prob
, 1000u);
718 prob
= (prob
+ new_prob
) / 2;
719 prob
= std::max(prob
, min_prob
);
720 prob
= std::min(prob
, 1000u);
721 dout(10) << __func__
<< " actual " << actual
722 << ", actual/prob ratio " << ratio
723 << ", adjusted new_prob " << new_prob
724 << ", prob " << promote_probability_millis
<< " -> " << prob
726 promote_probability_millis
= prob
;
728 // set hard limits for this interval to mitigate stampedes
729 promote_max_objects
= target_obj_sec
* osd
->OSD_TICK_INTERVAL
* 2;
730 promote_max_bytes
= target_bytes_sec
* osd
->OSD_TICK_INTERVAL
* 2;
733 // -------------------------------------
// Read osd_failsafe_full_ratio from config; values given as a
// percentage (> 1.0) are normalized to a 0..1 fraction.
735 float OSDService::get_failsafe_full_ratio()
737 float full_ratio
= cct
->_conf
->osd_failsafe_full_ratio
;
738 if (full_ratio
> 1.0) full_ratio
/= 100.0;
742 OSDService::s_names
OSDService::recalc_full_state(float ratio
, float pratio
, string
&inject
)
744 // The OSDMap ratios take precendence. So if the failsafe is .95 and
745 // the admin sets the cluster full to .96, the failsafe moves up to .96
746 // too. (Not that having failsafe == full is ideal, but it's better than
747 // dropping writes before the clusters appears full.)
748 OSDMapRef osdmap
= get_osdmap();
749 if (!osdmap
|| osdmap
->get_epoch() == 0) {
752 float nearfull_ratio
= osdmap
->get_nearfull_ratio();
753 float backfillfull_ratio
= std::max(osdmap
->get_backfillfull_ratio(), nearfull_ratio
);
754 float full_ratio
= std::max(osdmap
->get_full_ratio(), backfillfull_ratio
);
755 float failsafe_ratio
= std::max(get_failsafe_full_ratio(), full_ratio
);
757 if (osdmap
->require_osd_release
< ceph_release_t::luminous
) {
758 // use the failsafe for nearfull and full; the mon isn't using the
759 // flags anyway because we're mid-upgrade.
760 full_ratio
= failsafe_ratio
;
761 backfillfull_ratio
= failsafe_ratio
;
762 nearfull_ratio
= failsafe_ratio
;
763 } else if (full_ratio
<= 0 ||
764 backfillfull_ratio
<= 0 ||
765 nearfull_ratio
<= 0) {
766 derr
<< __func__
<< " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl
;
767 // use failsafe flag. ick. the monitor did something wrong or the user
768 // did something stupid.
769 full_ratio
= failsafe_ratio
;
770 backfillfull_ratio
= failsafe_ratio
;
771 nearfull_ratio
= failsafe_ratio
;
774 if (injectfull_state
> NONE
&& injectfull
) {
775 inject
= "(Injected)";
776 return injectfull_state
;
777 } else if (pratio
> failsafe_ratio
) {
779 } else if (ratio
> full_ratio
) {
781 } else if (ratio
> backfillfull_ratio
) {
783 } else if (pratio
> nearfull_ratio
) {
789 void OSDService::check_full_status(float ratio
, float pratio
)
791 std::lock_guard
l(full_status_lock
);
794 physical_ratio
= pratio
;
798 new_state
= recalc_full_state(ratio
, pratio
, inject
);
800 dout(20) << __func__
<< " cur ratio " << ratio
801 << ", physical ratio " << pratio
802 << ", new state " << get_full_state_name(new_state
)
807 if (cur_state
!= new_state
) {
808 dout(10) << __func__
<< " " << get_full_state_name(cur_state
)
809 << " -> " << get_full_state_name(new_state
) << dendl
;
810 if (new_state
== FAILSAFE
) {
811 clog
->error() << "full status failsafe engaged, dropping updates, now "
812 << (int)roundf(ratio
* 100) << "% full";
813 } else if (cur_state
== FAILSAFE
) {
814 clog
->error() << "full status failsafe disengaged, no longer dropping "
815 << "updates, now " << (int)roundf(ratio
* 100) << "% full";
817 cur_state
= new_state
;
821 bool OSDService::need_fullness_update()
823 OSDMapRef osdmap
= get_osdmap();
825 if (osdmap
->exists(whoami
)) {
826 if (osdmap
->get_state(whoami
) & CEPH_OSD_FULL
) {
828 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_BACKFILLFULL
) {
830 } else if (osdmap
->get_state(whoami
) & CEPH_OSD_NEARFULL
) {
837 else if (is_backfillfull())
839 else if (is_nearfull())
// Test hook: when fullness injection is active and the injected state
// is at least `type`, report (injected) fullness. `injectfull` acts as
// a countdown of injections, or -1 for "always full".
844 bool OSDService::_check_inject_full(DoutPrefixProvider
*dpp
, s_names type
) const
846 if (injectfull
&& injectfull_state
>= type
) {
847 // injectfull is either a count of the number of times to return failsafe full
848 // or if -1 then always return full
851 ldpp_dout(dpp
, 10) << __func__
<< " Injected " << get_full_state_name(type
) << " OSD ("
852 << (injectfull
< 0 ? "set" : std::to_string(injectfull
)) << ")"
// Core fullness predicate: under full_status_lock, honor any injected
// fullness first, log current usage when at/above `type`, and return
// whether the cached cur_state has reached `type`.
859 bool OSDService::_check_full(DoutPrefixProvider
*dpp
, s_names type
) const
861 std::lock_guard
l(full_status_lock
);
863 if (_check_inject_full(dpp
, type
))
866 if (cur_state
>= type
)
867 ldpp_dout(dpp
, 10) << __func__
<< " current usage is " << cur_ratio
868 << " physical " << physical_ratio
<< dendl
;
870 return cur_state
>= type
;
873 bool OSDService::_tentative_full(DoutPrefixProvider
*dpp
, s_names type
, uint64_t adjust_used
, osd_stat_t adjusted_stat
)
875 ldpp_dout(dpp
, 20) << __func__
<< " type " << get_full_state_name(type
) << " adjust_used " << (adjust_used
>> 10) << "KiB" << dendl
;
877 std::lock_guard
l(full_status_lock
);
878 if (_check_inject_full(dpp
, type
)) {
884 float ratio
= compute_adjusted_ratio(adjusted_stat
, &pratio
, adjust_used
);
887 s_names tentative_state
= recalc_full_state(ratio
, pratio
, notused
);
889 if (tentative_state
>= type
)
890 ldpp_dout(dpp
, 10) << __func__
<< " tentative usage is " << ratio
<< dendl
;
892 return tentative_state
>= type
;
// Convenience wrapper: is the OSD at/above the FAILSAFE threshold?
895 bool OSDService::check_failsafe_full(DoutPrefixProvider
*dpp
) const
897 return _check_full(dpp
, FAILSAFE
);
// Convenience wrapper: is the OSD at/above the FULL threshold?
900 bool OSDService::check_full(DoutPrefixProvider
*dpp
) const
902 return _check_full(dpp
, FULL
);
// Would this OSD be BACKFILLFULL if `adjust_used` additional bytes
// were consumed? Used to decide whether to accept a backfill.
905 bool OSDService::tentative_backfill_full(DoutPrefixProvider
*dpp
, uint64_t adjust_used
, osd_stat_t stats
)
907 return _tentative_full(dpp
, BACKFILLFULL
, adjust_used
, stats
);
// Convenience wrapper: is the OSD at/above the BACKFILLFULL threshold?
910 bool OSDService::check_backfill_full(DoutPrefixProvider
*dpp
) const
912 return _check_full(dpp
, BACKFILLFULL
);
// Convenience wrapper: is the OSD at/above the NEARFULL threshold?
915 bool OSDService::check_nearfull(DoutPrefixProvider
*dpp
) const
917 return _check_full(dpp
, NEARFULL
);
// Exact-state query: cached state is exactly FAILSAFE (note: ==, not
// >= like the other is_* helpers below).
920 bool OSDService::is_failsafe_full() const
922 std::lock_guard
l(full_status_lock
);
923 return cur_state
== FAILSAFE
;
// Cached-state query: at/above FULL (no logging, unlike check_full).
926 bool OSDService::is_full() const
928 std::lock_guard
l(full_status_lock
);
929 return cur_state
>= FULL
;
// Cached-state query: at/above BACKFILLFULL.
932 bool OSDService::is_backfillfull() const
934 std::lock_guard
l(full_status_lock
);
935 return cur_state
>= BACKFILLFULL
;
// Cached-state query: at/above NEARFULL.
938 bool OSDService::is_nearfull() const
940 std::lock_guard
l(full_status_lock
);
941 return cur_state
>= NEARFULL
;
// Test hook setter: record the fullness state (and count — the count
// assignment is lost in this extraction) to inject via
// _check_inject_full(). Guarded by full_status_lock.
944 void OSDService::set_injectfull(s_names type
, int64_t count
)
946 std::lock_guard
l(full_status_lock
);
947 injectfull_state
= type
;
951 void OSDService::set_statfs(const struct store_statfs_t
&stbuf
,
952 osd_alert_list_t
& alerts
)
954 uint64_t bytes
= stbuf
.total
;
955 uint64_t avail
= stbuf
.available
;
956 uint64_t used
= stbuf
.get_used_raw();
958 // For testing fake statfs values so it doesn't matter if all
959 // OSDs are using the same partition.
960 if (cct
->_conf
->fake_statfs_for_testing
) {
961 uint64_t total_num_bytes
= 0;
965 total_num_bytes
+= p
->get_stats_num_bytes();
967 bytes
= cct
->_conf
->fake_statfs_for_testing
;
968 if (total_num_bytes
< bytes
)
969 avail
= bytes
- total_num_bytes
;
972 dout(0) << __func__
<< " fake total " << cct
->_conf
->fake_statfs_for_testing
973 << " adjust available " << avail
975 used
= bytes
- avail
;
978 logger
->set(l_osd_stat_bytes
, bytes
);
979 logger
->set(l_osd_stat_bytes_used
, used
);
980 logger
->set(l_osd_stat_bytes_avail
, avail
);
982 std::lock_guard
l(stat_lock
);
983 osd_stat
.statfs
= stbuf
;
984 osd_stat
.os_alerts
.clear();
985 osd_stat
.os_alerts
[whoami
].swap(alerts
);
986 if (cct
->_conf
->fake_statfs_for_testing
) {
987 osd_stat
.statfs
.total
= bytes
;
988 osd_stat
.statfs
.available
= avail
;
989 // For testing don't want used to go negative, so clear reserved
990 osd_stat
.statfs
.internally_reserved
= 0;
994 osd_stat_t
OSDService::set_osd_stat(vector
<int>& hb_peers
,
997 utime_t now
= ceph_clock_now();
998 auto stale_time
= g_conf().get_val
<int64_t>("osd_mon_heartbeat_stat_stale");
999 std::lock_guard
l(stat_lock
);
1000 osd_stat
.hb_peers
.swap(hb_peers
);
1001 osd
->op_tracker
.get_age_ms_histogram(&osd_stat
.op_queue_age_hist
);
1002 osd_stat
.num_pgs
= num_pgs
;
1003 // Clean entries that aren't updated
1004 // This is called often enough that we can just remove 1 at a time
1005 for (auto i
: osd_stat
.hb_pingtime
) {
1006 if (i
.second
.last_update
== 0)
1008 if (stale_time
&& now
.sec() - i
.second
.last_update
> stale_time
) {
1009 dout(20) << __func__
<< " time out heartbeat for osd " << i
.first
1010 << " last_update " << i
.second
.last_update
<< dendl
;
1011 osd_stat
.hb_pingtime
.erase(i
.first
);
// Bump the repaired-shards counter in osd_stat, under stat_lock.
1018 void OSDService::inc_osd_stat_repaired()
1020 std::lock_guard
l(stat_lock
);
1021 osd_stat
.num_shards_repaired
++;
1025 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat
, float *pratio
,
1026 uint64_t adjust_used
)
1029 ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1032 dout(20) << __func__
<< " Before kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1033 if (new_stat
.statfs
.available
> adjust_used
)
1034 new_stat
.statfs
.available
-= adjust_used
;
1036 new_stat
.statfs
.available
= 0;
1037 dout(20) << __func__
<< " After kb_used() " << new_stat
.statfs
.kb_used() << dendl
;
1040 // Check all pgs and adjust kb_used to include all pending backfill data
1041 int backfill_adjusted
= 0;
1043 osd
->_get_pgs(&pgs
);
1044 for (auto p
: pgs
) {
1045 backfill_adjusted
+= p
->pg_stat_adjust(&new_stat
);
1047 if (backfill_adjusted
) {
1048 dout(20) << __func__
<< " backfill adjusted " << new_stat
<< dendl
;
1050 return ((float)new_stat
.statfs
.get_used_raw()) / ((float)new_stat
.statfs
.total
);
1053 void OSDService::send_message_osd_cluster(int peer
, Message
*m
, epoch_t from_epoch
)
1055 OSDMapRef next_map
= get_nextmap_reserved();
1056 // service map is always newer/newest
1057 ceph_assert(from_epoch
<= next_map
->get_epoch());
1059 if (next_map
->is_down(peer
) ||
1060 next_map
->get_info(peer
).up_from
> from_epoch
) {
1062 release_map(next_map
);
1065 ConnectionRef peer_con
;
1066 if (peer
== whoami
) {
1067 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1069 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1070 next_map
->get_cluster_addrs(peer
), false, true);
1072 maybe_share_map(peer_con
.get(), next_map
);
1073 peer_con
->send_message(m
);
1074 release_map(next_map
);
1077 void OSDService::send_message_osd_cluster(std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
)
1079 OSDMapRef next_map
= get_nextmap_reserved();
1080 // service map is always newer/newest
1081 ceph_assert(from_epoch
<= next_map
->get_epoch());
1083 for (auto& iter
: messages
) {
1084 if (next_map
->is_down(iter
.first
) ||
1085 next_map
->get_info(iter
.first
).up_from
> from_epoch
) {
1089 ConnectionRef peer_con
;
1090 if (iter
.first
== whoami
) {
1091 peer_con
= osd
->cluster_messenger
->get_loopback_connection();
1093 peer_con
= osd
->cluster_messenger
->connect_to_osd(
1094 next_map
->get_cluster_addrs(iter
.first
), false, true);
1096 maybe_share_map(peer_con
.get(), next_map
);
1097 peer_con
->send_message(iter
.second
);
1099 release_map(next_map
);
1101 ConnectionRef
OSDService::get_con_osd_cluster(int peer
, epoch_t from_epoch
)
1103 OSDMapRef next_map
= get_nextmap_reserved();
1104 // service map is always newer/newest
1105 ceph_assert(from_epoch
<= next_map
->get_epoch());
1107 if (next_map
->is_down(peer
) ||
1108 next_map
->get_info(peer
).up_from
> from_epoch
) {
1109 release_map(next_map
);
1113 if (peer
== whoami
) {
1114 con
= osd
->cluster_messenger
->get_loopback_connection();
1116 con
= osd
->cluster_messenger
->connect_to_osd(
1117 next_map
->get_cluster_addrs(peer
), false, true);
1119 release_map(next_map
);
1123 pair
<ConnectionRef
,ConnectionRef
> OSDService::get_con_osd_hb(int peer
, epoch_t from_epoch
)
1125 OSDMapRef next_map
= get_nextmap_reserved();
1126 // service map is always newer/newest
1127 ceph_assert(from_epoch
<= next_map
->get_epoch());
1129 pair
<ConnectionRef
,ConnectionRef
> ret
;
1130 if (next_map
->is_down(peer
) ||
1131 next_map
->get_info(peer
).up_from
> from_epoch
) {
1132 release_map(next_map
);
1135 ret
.first
= osd
->hb_back_client_messenger
->connect_to_osd(
1136 next_map
->get_hb_back_addrs(peer
));
1137 ret
.second
= osd
->hb_front_client_messenger
->connect_to_osd(
1138 next_map
->get_hb_front_addrs(peer
));
1139 release_map(next_map
);
// Entity name of this OSD on the cluster (backend) messenger.
1143 entity_name_t
OSDService::get_cluster_msgr_name() const
1145 return cluster_messenger
->get_myname();
1148 void OSDService::queue_want_pg_temp(pg_t pgid
,
1149 const vector
<int>& want
,
1152 std::lock_guard
l(pg_temp_lock
);
1153 auto p
= pg_temp_pending
.find(pgid
);
1154 if (p
== pg_temp_pending
.end() ||
1155 p
->second
.acting
!= want
||
1157 pg_temp_wanted
[pgid
] = {want
, forced
};
1161 void OSDService::remove_want_pg_temp(pg_t pgid
)
1163 std::lock_guard
l(pg_temp_lock
);
1164 pg_temp_wanted
.erase(pgid
);
1165 pg_temp_pending
.erase(pgid
);
1168 void OSDService::_sent_pg_temp()
1170 #ifdef HAVE_STDLIB_MAP_SPLICING
1171 pg_temp_pending
.merge(pg_temp_wanted
);
1173 pg_temp_pending
.insert(make_move_iterator(begin(pg_temp_wanted
)),
1174 make_move_iterator(end(pg_temp_wanted
)));
1176 pg_temp_wanted
.clear();
1179 void OSDService::requeue_pg_temp()
1181 std::lock_guard
l(pg_temp_lock
);
1182 // wanted overrides pending. note that remove_want_pg_temp
1183 // clears the item out of both.
1184 unsigned old_wanted
= pg_temp_wanted
.size();
1185 unsigned old_pending
= pg_temp_pending
.size();
1187 pg_temp_wanted
.swap(pg_temp_pending
);
1188 dout(10) << __func__
<< " " << old_wanted
<< " + " << old_pending
<< " -> "
1189 << pg_temp_wanted
.size() << dendl
;
1192 std::ostream
& operator<<(std::ostream
& out
,
1193 const OSDService::pg_temp_t
& pg_temp
)
1195 out
<< pg_temp
.acting
;
1196 if (pg_temp
.forced
) {
1202 void OSDService::send_pg_temp()
1204 std::lock_guard
l(pg_temp_lock
);
1205 if (pg_temp_wanted
.empty())
1207 dout(10) << "send_pg_temp " << pg_temp_wanted
<< dendl
;
1208 MOSDPGTemp
*ms
[2] = {nullptr, nullptr};
1209 for (auto& [pgid
, pg_temp
] : pg_temp_wanted
) {
1210 auto& m
= ms
[pg_temp
.forced
];
1212 m
= new MOSDPGTemp(osdmap
->get_epoch());
1213 m
->forced
= pg_temp
.forced
;
1215 m
->pg_temp
.emplace(pgid
, pg_temp
.acting
);
1219 monc
->send_mon_message(m
);
1225 void OSDService::send_pg_created(pg_t pgid
)
1227 std::lock_guard
l(pg_created_lock
);
1228 dout(20) << __func__
<< dendl
;
1229 auto o
= get_osdmap();
1230 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1231 pg_created
.insert(pgid
);
1232 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1236 void OSDService::send_pg_created()
1238 std::lock_guard
l(pg_created_lock
);
1239 dout(20) << __func__
<< dendl
;
1240 auto o
= get_osdmap();
1241 if (o
->require_osd_release
>= ceph_release_t::luminous
) {
1242 for (auto pgid
: pg_created
) {
1243 monc
->send_mon_message(new MOSDPGCreated(pgid
));
1248 void OSDService::prune_pg_created()
1250 std::lock_guard
l(pg_created_lock
);
1251 dout(20) << __func__
<< dendl
;
1252 auto o
= get_osdmap();
1253 auto i
= pg_created
.begin();
1254 while (i
!= pg_created
.end()) {
1255 auto p
= o
->get_pg_pool(i
->pool());
1256 if (!p
|| !p
->has_flag(pg_pool_t::FLAG_CREATING
)) {
1257 dout(20) << __func__
<< " pruning " << *i
<< dendl
;
1258 i
= pg_created
.erase(i
);
1260 dout(20) << __func__
<< " keeping " << *i
<< dendl
;
1267 // --------------------------------------
1270 bool OSDService::can_inc_scrubs()
1272 bool can_inc
= false;
1273 std::lock_guard
l(sched_scrub_lock
);
1275 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1276 dout(20) << __func__
<< " == true " << scrubs_local
<< " local + " << scrubs_remote
1277 << " remote < max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1280 dout(20) << __func__
<< " == false " << scrubs_local
<< " local + " << scrubs_remote
1281 << " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1287 bool OSDService::inc_scrubs_local()
1289 bool result
= false;
1290 std::lock_guard l
{sched_scrub_lock
};
1291 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1292 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
+1)
1293 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1297 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1302 void OSDService::dec_scrubs_local()
1304 std::lock_guard l
{sched_scrub_lock
};
1305 dout(20) << __func__
<< " " << scrubs_local
<< " -> " << (scrubs_local
-1)
1306 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", remote " << scrubs_remote
<< ")" << dendl
;
1308 ceph_assert(scrubs_local
>= 0);
1311 bool OSDService::inc_scrubs_remote()
1313 bool result
= false;
1314 std::lock_guard l
{sched_scrub_lock
};
1315 if (scrubs_local
+ scrubs_remote
< cct
->_conf
->osd_max_scrubs
) {
1316 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
+1)
1317 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1321 dout(20) << __func__
<< " " << scrubs_local
<< " local + " << scrubs_remote
<< " remote >= max " << cct
->_conf
->osd_max_scrubs
<< dendl
;
1326 void OSDService::dec_scrubs_remote()
1328 std::lock_guard l
{sched_scrub_lock
};
1329 dout(20) << __func__
<< " " << scrubs_remote
<< " -> " << (scrubs_remote
-1)
1330 << " (max " << cct
->_conf
->osd_max_scrubs
<< ", local " << scrubs_local
<< ")" << dendl
;
1332 ceph_assert(scrubs_remote
>= 0);
1335 void OSDService::dump_scrub_reservations(Formatter
*f
)
1337 std::lock_guard l
{sched_scrub_lock
};
1338 f
->dump_int("scrubs_local", scrubs_local
);
1339 f
->dump_int("scrubs_remote", scrubs_remote
);
1340 f
->dump_int("osd_max_scrubs", cct
->_conf
->osd_max_scrubs
);
1343 void OSDService::retrieve_epochs(epoch_t
*_boot_epoch
, epoch_t
*_up_epoch
,
1344 epoch_t
*_bind_epoch
) const
1346 std::lock_guard
l(epoch_lock
);
1348 *_boot_epoch
= boot_epoch
;
1350 *_up_epoch
= up_epoch
;
1352 *_bind_epoch
= bind_epoch
;
1355 void OSDService::set_epochs(const epoch_t
*_boot_epoch
, const epoch_t
*_up_epoch
,
1356 const epoch_t
*_bind_epoch
)
1358 std::lock_guard
l(epoch_lock
);
1360 ceph_assert(*_boot_epoch
== 0 || *_boot_epoch
>= boot_epoch
);
1361 boot_epoch
= *_boot_epoch
;
1364 ceph_assert(*_up_epoch
== 0 || *_up_epoch
>= up_epoch
);
1365 up_epoch
= *_up_epoch
;
1368 ceph_assert(*_bind_epoch
== 0 || *_bind_epoch
>= bind_epoch
);
1369 bind_epoch
= *_bind_epoch
;
1373 bool OSDService::prepare_to_stop()
1375 std::unique_lock
l(is_stopping_lock
);
1376 if (get_state() != NOT_STOPPING
)
1379 OSDMapRef osdmap
= get_osdmap();
1380 if (osdmap
&& osdmap
->is_up(whoami
)) {
1381 dout(0) << __func__
<< " telling mon we are shutting down" << dendl
;
1382 set_state(PREPARING_TO_STOP
);
1383 monc
->send_mon_message(
1387 osdmap
->get_addrs(whoami
),
1388 osdmap
->get_epoch(),
1391 const auto timeout
= ceph::make_timespan(cct
->_conf
->osd_mon_shutdown_timeout
);
1392 is_stopping_cond
.wait_for(l
, timeout
,
1393 [this] { return get_state() == STOPPING
; });
1395 dout(0) << __func__
<< " starting shutdown" << dendl
;
1396 set_state(STOPPING
);
1400 void OSDService::got_stop_ack()
1402 std::scoped_lock
l(is_stopping_lock
);
1403 if (get_state() == PREPARING_TO_STOP
) {
1404 dout(0) << __func__
<< " starting shutdown" << dendl
;
1405 set_state(STOPPING
);
1406 is_stopping_cond
.notify_all();
1408 dout(10) << __func__
<< " ignoring msg" << dendl
;
1412 MOSDMap
*OSDService::build_incremental_map_msg(epoch_t since
, epoch_t to
,
1413 OSDSuperblock
& sblock
)
1415 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1416 osdmap
->get_encoding_features());
1417 m
->oldest_map
= max_oldest_map
;
1418 m
->newest_map
= sblock
.newest_map
;
1420 int max
= cct
->_conf
->osd_map_message_max
;
1421 ssize_t max_bytes
= cct
->_conf
->osd_map_message_max_bytes
;
1423 if (since
< m
->oldest_map
) {
1424 // we don't have the next map the target wants, so start with a
1427 dout(10) << __func__
<< " oldest map " << max_oldest_map
<< " > since "
1428 << since
<< ", starting with full map" << dendl
;
1429 since
= m
->oldest_map
;
1430 if (!get_map_bl(since
, bl
)) {
1431 derr
<< __func__
<< " missing full map " << since
<< dendl
;
1435 max_bytes
-= bl
.length();
1436 m
->maps
[since
] = std::move(bl
);
1438 for (epoch_t e
= since
+ 1; e
<= to
; ++e
) {
1440 if (get_inc_map_bl(e
, bl
)) {
1441 m
->incremental_maps
[e
] = std::move(bl
);
1443 dout(10) << __func__
<< " missing incremental map " << e
<< dendl
;
1444 if (!get_map_bl(e
, bl
)) {
1445 derr
<< __func__
<< " also missing full map " << e
<< dendl
;
1448 m
->maps
[e
] = std::move(bl
);
1451 max_bytes
-= bl
.length();
1452 if (max
<= 0 || max_bytes
<= 0) {
1459 if (!m
->maps
.empty() ||
1460 !m
->incremental_maps
.empty()) {
1461 // send what we have so far
1466 if (get_inc_map_bl(m
->newest_map
, bl
)) {
1467 m
->incremental_maps
[m
->newest_map
] = std::move(bl
);
1469 derr
<< __func__
<< " unable to load latest map " << m
->newest_map
<< dendl
;
1470 if (!get_map_bl(m
->newest_map
, bl
)) {
1471 derr
<< __func__
<< " unable to load latest full map " << m
->newest_map
1475 m
->maps
[m
->newest_map
] = std::move(bl
);
1480 void OSDService::send_map(MOSDMap
*m
, Connection
*con
)
1482 con
->send_message(m
);
1485 void OSDService::send_incremental_map(epoch_t since
, Connection
*con
,
1486 const OSDMapRef
& osdmap
)
1488 epoch_t to
= osdmap
->get_epoch();
1489 dout(10) << "send_incremental_map " << since
<< " -> " << to
1490 << " to " << con
<< " " << con
->get_peer_addr() << dendl
;
1494 OSDSuperblock
sblock(get_superblock());
1495 if (since
< sblock
.oldest_map
) {
1496 // just send latest full map
1497 MOSDMap
*m
= new MOSDMap(monc
->get_fsid(),
1498 osdmap
->get_encoding_features());
1499 m
->oldest_map
= max_oldest_map
;
1500 m
->newest_map
= sblock
.newest_map
;
1501 get_map_bl(to
, m
->maps
[to
]);
1506 if (to
> since
&& (int64_t)(to
- since
) > cct
->_conf
->osd_map_share_max_epochs
) {
1507 dout(10) << " " << (to
- since
) << " > max " << cct
->_conf
->osd_map_share_max_epochs
1508 << ", only sending most recent" << dendl
;
1509 since
= to
- cct
->_conf
->osd_map_share_max_epochs
;
1512 m
= build_incremental_map_msg(since
, to
, sblock
);
1517 bool OSDService::_get_map_bl(epoch_t e
, bufferlist
& bl
)
1519 bool found
= map_bl_cache
.lookup(e
, &bl
);
1521 logger
->inc(l_osd_map_bl_cache_hit
);
1524 logger
->inc(l_osd_map_bl_cache_miss
);
1525 found
= store
->read(meta_ch
,
1526 OSD::get_osdmap_pobject_name(e
), 0, 0, bl
,
1527 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1534 bool OSDService::get_inc_map_bl(epoch_t e
, bufferlist
& bl
)
1536 std::lock_guard
l(map_cache_lock
);
1537 bool found
= map_bl_inc_cache
.lookup(e
, &bl
);
1539 logger
->inc(l_osd_map_bl_cache_hit
);
1542 logger
->inc(l_osd_map_bl_cache_miss
);
1543 found
= store
->read(meta_ch
,
1544 OSD::get_inc_osdmap_pobject_name(e
), 0, 0, bl
,
1545 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
) >= 0;
1547 _add_map_inc_bl(e
, bl
);
1552 void OSDService::_add_map_bl(epoch_t e
, bufferlist
& bl
)
1554 dout(10) << "add_map_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1555 // cache a contiguous buffer
1556 if (bl
.get_num_buffers() > 1) {
1559 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1560 map_bl_cache
.add(e
, bl
);
1563 void OSDService::_add_map_inc_bl(epoch_t e
, bufferlist
& bl
)
1565 dout(10) << "add_map_inc_bl " << e
<< " " << bl
.length() << " bytes" << dendl
;
1566 // cache a contiguous buffer
1567 if (bl
.get_num_buffers() > 1) {
1570 bl
.try_assign_to_mempool(mempool::mempool_osd_mapbl
);
1571 map_bl_inc_cache
.add(e
, bl
);
1574 OSDMapRef
OSDService::_add_map(OSDMap
*o
)
1576 epoch_t e
= o
->get_epoch();
1578 if (cct
->_conf
->osd_map_dedup
) {
1579 // Dedup against an existing map at a nearby epoch
1580 OSDMapRef for_dedup
= map_cache
.lower_bound(e
);
1582 OSDMap::dedup(for_dedup
.get(), o
);
1586 OSDMapRef l
= map_cache
.add(e
, o
, &existed
);
1593 OSDMapRef
OSDService::try_get_map(epoch_t epoch
)
1595 std::lock_guard
l(map_cache_lock
);
1596 OSDMapRef retval
= map_cache
.lookup(epoch
);
1598 dout(30) << "get_map " << epoch
<< " -cached" << dendl
;
1599 logger
->inc(l_osd_map_cache_hit
);
1603 logger
->inc(l_osd_map_cache_miss
);
1604 epoch_t lb
= map_cache
.cached_key_lower_bound();
1606 dout(30) << "get_map " << epoch
<< " - miss, below lower bound" << dendl
;
1607 logger
->inc(l_osd_map_cache_miss_low
);
1608 logger
->inc(l_osd_map_cache_miss_low_avg
, lb
- epoch
);
1612 OSDMap
*map
= new OSDMap
;
1614 dout(20) << "get_map " << epoch
<< " - loading and decoding " << map
<< dendl
;
1616 if (!_get_map_bl(epoch
, bl
) || bl
.length() == 0) {
1617 derr
<< "failed to load OSD map for epoch " << epoch
<< ", got " << bl
.length() << " bytes" << dendl
;
1623 dout(20) << "get_map " << epoch
<< " - return initial " << map
<< dendl
;
1625 return _add_map(map
);
1631 void OSDService::reply_op_error(OpRequestRef op
, int err
)
1633 reply_op_error(op
, err
, eversion_t(), 0, {});
1636 void OSDService::reply_op_error(OpRequestRef op
, int err
, eversion_t v
,
1638 vector
<pg_log_op_return_item_t
> op_returns
)
1640 auto m
= op
->get_req
<MOSDOp
>();
1641 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1643 flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
1645 MOSDOpReply
*reply
= new MOSDOpReply(m
, err
, osdmap
->get_epoch(), flags
,
1646 !m
->has_flag(CEPH_OSD_FLAG_RETURNVEC
));
1647 reply
->set_reply_versions(v
, uv
);
1648 reply
->set_op_returns(op_returns
);
1649 m
->get_connection()->send_message(reply
);
1652 void OSDService::handle_misdirected_op(PG
*pg
, OpRequestRef op
)
1654 if (!cct
->_conf
->osd_debug_misdirected_ops
) {
1658 auto m
= op
->get_req
<MOSDOp
>();
1659 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1661 ceph_assert(m
->get_map_epoch() >= pg
->get_history().same_primary_since
);
1663 if (pg
->is_ec_pg()) {
1665 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1666 * can get this result:
1667 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1668 * [CRUSH_ITEM_NONE, 2, 3]/3
1669 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1671 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1673 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1676 * We can't compute the op target based on the sending map epoch due to
1677 * splitting. The simplest thing is to detect such cases here and drop
1678 * them without an error (the client will resend anyway).
1680 ceph_assert(m
->get_map_epoch() <= superblock
.newest_map
);
1681 OSDMapRef opmap
= try_get_map(m
->get_map_epoch());
1683 dout(7) << __func__
<< ": " << *pg
<< " no longer have map for "
1684 << m
->get_map_epoch() << ", dropping" << dendl
;
1687 pg_t _pgid
= m
->get_raw_pg();
1689 if ((m
->get_flags() & CEPH_OSD_FLAG_PGOP
) == 0)
1690 _pgid
= opmap
->raw_pg_to_pg(_pgid
);
1691 if (opmap
->get_primary_shard(_pgid
, &pgid
) &&
1692 pgid
.shard
!= pg
->pg_id
.shard
) {
1693 dout(7) << __func__
<< ": " << *pg
<< " primary changed since "
1694 << m
->get_map_epoch() << ", dropping" << dendl
;
1699 dout(7) << *pg
<< " misdirected op in " << m
->get_map_epoch() << dendl
;
1700 clog
->warn() << m
->get_source_inst() << " misdirected " << m
->get_reqid()
1701 << " pg " << m
->get_raw_pg()
1702 << " to osd." << whoami
1703 << " not " << pg
->get_acting()
1704 << " in e" << m
->get_map_epoch() << "/" << osdmap
->get_epoch();
1707 void OSDService::enqueue_back(OpSchedulerItem
&& qi
)
1709 osd
->op_shardedwq
.queue(std::move(qi
));
1712 void OSDService::enqueue_front(OpSchedulerItem
&& qi
)
1714 osd
->op_shardedwq
.queue_front(std::move(qi
));
1717 void OSDService::queue_recovery_context(
1719 GenContext
<ThreadPool::TPHandle
&> *c
)
1721 epoch_t e
= get_osdmap_epoch();
1724 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1725 new PGRecoveryContext(pg
->get_pgid(), c
, e
)),
1726 cct
->_conf
->osd_recovery_cost
,
1727 cct
->_conf
->osd_recovery_priority
,
1733 void OSDService::queue_for_snap_trim(PG
*pg
)
1735 dout(10) << "queueing " << *pg
<< " for snaptrim" << dendl
;
1738 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1739 new PGSnapTrim(pg
->get_pgid(), pg
->get_osdmap_epoch())),
1740 cct
->_conf
->osd_snap_trim_cost
,
1741 cct
->_conf
->osd_snap_trim_priority
,
1744 pg
->get_osdmap_epoch()));
1747 template <class MSG_TYPE
>
1748 void OSDService::queue_scrub_event_msg(PG
* pg
,
1749 Scrub::scrub_prio_t with_priority
,
1750 unsigned int qu_priority
)
1752 const auto epoch
= pg
->get_osdmap_epoch();
1753 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1754 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1756 enqueue_back(OpSchedulerItem(
1757 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1758 pg
->scrub_requeue_priority(with_priority
, qu_priority
), ceph_clock_now(), 0, epoch
));
1761 template <class MSG_TYPE
>
1762 void OSDService::queue_scrub_event_msg(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1764 const auto epoch
= pg
->get_osdmap_epoch();
1765 auto msg
= new MSG_TYPE(pg
->get_pgid(), epoch
);
1766 dout(15) << "queue a scrub event (" << *msg
<< ") for " << *pg
<< ". Epoch: " << epoch
<< dendl
;
1768 enqueue_back(OpSchedulerItem(
1769 unique_ptr
<OpSchedulerItem::OpQueueable
>(msg
), cct
->_conf
->osd_scrub_cost
,
1770 pg
->scrub_requeue_priority(with_priority
), ceph_clock_now(), 0, epoch
));
1773 void OSDService::queue_for_scrub(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1775 queue_scrub_event_msg
<PGScrub
>(pg
, with_priority
);
1778 void OSDService::queue_scrub_after_repair(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1780 queue_scrub_event_msg
<PGScrubAfterRepair
>(pg
, with_priority
);
1783 void OSDService::queue_for_rep_scrub(PG
* pg
,
1784 Scrub::scrub_prio_t with_priority
,
1785 unsigned int qu_priority
)
1787 queue_scrub_event_msg
<PGRepScrub
>(pg
, with_priority
, qu_priority
);
1790 void OSDService::queue_for_rep_scrub_resched(PG
* pg
,
1791 Scrub::scrub_prio_t with_priority
,
1792 unsigned int qu_priority
)
1794 // Resulting scrub event: 'SchedReplica'
1795 queue_scrub_event_msg
<PGRepScrubResched
>(pg
, with_priority
, qu_priority
);
1798 void OSDService::queue_for_scrub_granted(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1800 // Resulting scrub event: 'RemotesReserved'
1801 queue_scrub_event_msg
<PGScrubResourcesOK
>(pg
, with_priority
);
1804 void OSDService::queue_for_scrub_denied(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1806 // Resulting scrub event: 'ReservationFailure'
1807 queue_scrub_event_msg
<PGScrubDenied
>(pg
, with_priority
);
1810 void OSDService::queue_for_scrub_resched(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1812 // Resulting scrub event: 'InternalSchedScrub'
1813 queue_scrub_event_msg
<PGScrubResched
>(pg
, with_priority
);
1816 void OSDService::queue_scrub_pushes_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1818 // Resulting scrub event: 'ActivePushesUpd'
1819 queue_scrub_event_msg
<PGScrubPushesUpdate
>(pg
, with_priority
);
1822 void OSDService::queue_scrub_applied_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1824 queue_scrub_event_msg
<PGScrubAppliedUpdate
>(pg
, with_priority
);
1827 void OSDService::queue_scrub_unblocking(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1829 // Resulting scrub event: 'Unblocked'
1830 queue_scrub_event_msg
<PGScrubUnblocked
>(pg
, with_priority
);
1833 void OSDService::queue_scrub_digest_update(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1835 // Resulting scrub event: 'DigestUpdate'
1836 queue_scrub_event_msg
<PGScrubDigestUpdate
>(pg
, with_priority
);
1839 void OSDService::queue_scrub_got_repl_maps(PG
* pg
, Scrub::scrub_prio_t with_priority
)
1841 // Resulting scrub event: 'GotReplicas'
1842 queue_scrub_event_msg
<PGScrubGotReplMaps
>(pg
, with_priority
);
1845 void OSDService::queue_scrub_replica_pushes(PG
*pg
, Scrub::scrub_prio_t with_priority
)
1847 // Resulting scrub event: 'ReplicaPushesUpd'
1848 queue_scrub_event_msg
<PGScrubReplicaPushes
>(pg
, with_priority
);
1851 void OSDService::queue_for_pg_delete(spg_t pgid
, epoch_t e
)
1853 dout(10) << __func__
<< " on " << pgid
<< " e " << e
<< dendl
;
1856 unique_ptr
<OpSchedulerItem::OpQueueable
>(
1857 new PGDelete(pgid
, e
)),
1858 cct
->_conf
->osd_pg_delete_cost
,
1859 cct
->_conf
->osd_pg_delete_priority
,
1865 bool OSDService::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
1867 return osd
->try_finish_pg_delete(pg
, old_pg_num
);
1872 void OSDService::set_ready_to_merge_source(PG
*pg
, eversion_t version
)
1874 std::lock_guard
l(merge_lock
);
1875 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1876 ready_to_merge_source
[pg
->pg_id
.pgid
] = version
;
1877 assert(not_ready_to_merge_source
.count(pg
->pg_id
.pgid
) == 0);
1878 _send_ready_to_merge();
1881 void OSDService::set_ready_to_merge_target(PG
*pg
,
1883 epoch_t last_epoch_started
,
1884 epoch_t last_epoch_clean
)
1886 std::lock_guard
l(merge_lock
);
1887 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1888 ready_to_merge_target
.insert(make_pair(pg
->pg_id
.pgid
,
1891 last_epoch_clean
)));
1892 assert(not_ready_to_merge_target
.count(pg
->pg_id
.pgid
) == 0);
1893 _send_ready_to_merge();
1896 void OSDService::set_not_ready_to_merge_source(pg_t source
)
1898 std::lock_guard
l(merge_lock
);
1899 dout(10) << __func__
<< " " << source
<< dendl
;
1900 not_ready_to_merge_source
.insert(source
);
1901 assert(ready_to_merge_source
.count(source
) == 0);
1902 _send_ready_to_merge();
1905 void OSDService::set_not_ready_to_merge_target(pg_t target
, pg_t source
)
1907 std::lock_guard
l(merge_lock
);
1908 dout(10) << __func__
<< " " << target
<< " source " << source
<< dendl
;
1909 not_ready_to_merge_target
[target
] = source
;
1910 assert(ready_to_merge_target
.count(target
) == 0);
1911 _send_ready_to_merge();
1914 void OSDService::send_ready_to_merge()
1916 std::lock_guard
l(merge_lock
);
1917 _send_ready_to_merge();
1920 void OSDService::_send_ready_to_merge()
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1929 for (auto src
: not_ready_to_merge_source
) {
1930 if (sent_ready_to_merge_source
.count(src
) == 0) {
1931 monc
->send_mon_message(new MOSDPGReadyToMerge(
1935 osdmap
->get_epoch()));
1936 sent_ready_to_merge_source
.insert(src
);
1939 for (auto p
: not_ready_to_merge_target
) {
1940 if (sent_ready_to_merge_source
.count(p
.second
) == 0) {
1941 monc
->send_mon_message(new MOSDPGReadyToMerge(
1945 osdmap
->get_epoch()));
1946 sent_ready_to_merge_source
.insert(p
.second
);
1949 for (auto src
: ready_to_merge_source
) {
1950 if (not_ready_to_merge_source
.count(src
.first
) ||
1951 not_ready_to_merge_target
.count(src
.first
.get_parent())) {
1954 auto p
= ready_to_merge_target
.find(src
.first
.get_parent());
1955 if (p
!= ready_to_merge_target
.end() &&
1956 sent_ready_to_merge_source
.count(src
.first
) == 0) {
1957 monc
->send_mon_message(new MOSDPGReadyToMerge(
1958 src
.first
, // source pgid
1959 src
.second
, // src version
1960 std::get
<0>(p
->second
), // target version
1961 std::get
<1>(p
->second
), // PG's last_epoch_started
1962 std::get
<2>(p
->second
), // PG's last_epoch_clean
1964 osdmap
->get_epoch()));
1965 sent_ready_to_merge_source
.insert(src
.first
);
1970 void OSDService::clear_ready_to_merge(PG
*pg
)
1972 std::lock_guard
l(merge_lock
);
1973 dout(10) << __func__
<< " " << pg
->pg_id
<< dendl
;
1974 ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1975 ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1976 not_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1977 not_ready_to_merge_target
.erase(pg
->pg_id
.pgid
);
1978 sent_ready_to_merge_source
.erase(pg
->pg_id
.pgid
);
1981 void OSDService::clear_sent_ready_to_merge()
1983 std::lock_guard
l(merge_lock
);
1984 sent_ready_to_merge_source
.clear();
1987 void OSDService::prune_sent_ready_to_merge(const OSDMapRef
& osdmap
)
1989 std::lock_guard
l(merge_lock
);
1990 auto i
= sent_ready_to_merge_source
.begin();
1991 while (i
!= sent_ready_to_merge_source
.end()) {
1992 if (!osdmap
->pg_exists(*i
)) {
1993 dout(10) << __func__
<< " " << *i
<< dendl
;
1994 i
= sent_ready_to_merge_source
.erase(i
);
2003 void OSDService::_queue_for_recovery(
2004 std::pair
<epoch_t
, PGRef
> p
,
2005 uint64_t reserved_pushes
)
2007 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
2010 unique_ptr
<OpSchedulerItem::OpQueueable
>(
2012 p
.second
->get_pgid(), p
.first
, reserved_pushes
)),
2013 cct
->_conf
->osd_recovery_cost
,
2014 cct
->_conf
->osd_recovery_priority
,
2020 // ====================================================================
2024 #define dout_prefix *_dout
2026 // Commands shared between OSD's console and admin console:
2027 namespace ceph::osd_cmds
{
2029 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
, std::ostream
& os
);
2031 } // namespace ceph::osd_cmds
2033 int OSD::mkfs(CephContext
*cct
, ObjectStore
*store
, uuid_d fsid
, int whoami
, string osdspec_affinity
)
2039 ObjectStore::CollectionHandle ch
;
2041 // if we are fed a uuid for this osd, use it.
2042 store
->set_fsid(cct
->_conf
->osd_uuid
);
2044 ret
= store
->mkfs();
2046 derr
<< "OSD::mkfs: ObjectStore::mkfs failed with error "
2047 << cpp_strerror(ret
) << dendl
;
2051 store
->set_cache_shards(1); // doesn't matter for mkfs!
2053 ret
= store
->mount();
2055 derr
<< "OSD::mkfs: couldn't mount ObjectStore: error "
2056 << cpp_strerror(ret
) << dendl
;
2060 ch
= store
->open_collection(coll_t::meta());
2062 ret
= store
->read(ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, sbbl
);
2064 derr
<< "OSD::mkfs: have meta collection but no superblock" << dendl
;
2067 /* if we already have superblock, check content of superblock */
2068 dout(0) << " have superblock" << dendl
;
2069 auto p
= sbbl
.cbegin();
2071 if (whoami
!= sb
.whoami
) {
2072 derr
<< "provided osd id " << whoami
<< " != superblock's " << sb
.whoami
2077 if (fsid
!= sb
.cluster_fsid
) {
2078 derr
<< "provided cluster fsid " << fsid
2079 << " != superblock's " << sb
.cluster_fsid
<< dendl
;
2084 // create superblock
2085 sb
.cluster_fsid
= fsid
;
2086 sb
.osd_fsid
= store
->get_fsid();
2088 sb
.compat_features
= get_osd_initial_compat_set();
2093 ObjectStore::CollectionHandle ch
= store
->create_new_collection(
2095 ObjectStore::Transaction t
;
2096 t
.create_collection(coll_t::meta(), 0);
2097 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
2098 ret
= store
->queue_transaction(ch
, std::move(t
));
2100 derr
<< "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2101 << "queue_transaction returned " << cpp_strerror(ret
) << dendl
;
2107 ret
= write_meta(cct
, store
, sb
.cluster_fsid
, sb
.osd_fsid
, whoami
, osdspec_affinity
);
2109 derr
<< "OSD::mkfs: failed to write fsid file: error "
2110 << cpp_strerror(ret
) << dendl
;
2124 int OSD::write_meta(CephContext
*cct
, ObjectStore
*store
, uuid_d
& cluster_fsid
, uuid_d
& osd_fsid
, int whoami
, string
& osdspec_affinity
)
2129 snprintf(val
, sizeof(val
), "%s", CEPH_OSD_ONDISK_MAGIC
);
2130 r
= store
->write_meta("magic", val
);
2134 snprintf(val
, sizeof(val
), "%d", whoami
);
2135 r
= store
->write_meta("whoami", val
);
2139 cluster_fsid
.print(val
);
2140 r
= store
->write_meta("ceph_fsid", val
);
2144 string key
= cct
->_conf
.get_val
<string
>("key");
2146 r
= store
->write_meta("osd_key", key
);
2150 string keyfile
= cct
->_conf
.get_val
<string
>("keyfile");
2151 if (!keyfile
.empty()) {
2154 r
= keybl
.read_file(keyfile
.c_str(), &err
);
2156 derr
<< __func__
<< " failed to read keyfile " << keyfile
<< ": "
2157 << err
<< ": " << cpp_strerror(r
) << dendl
;
2160 r
= store
->write_meta("osd_key", keybl
.to_str());
2165 if (!osdspec_affinity
.empty()) {
2166 r
= store
->write_meta("osdspec_affinity", osdspec_affinity
.c_str());
2171 r
= store
->write_meta("ready", "ready");
2178 int OSD::peek_meta(ObjectStore
*store
,
2180 uuid_d
*cluster_fsid
,
2183 ceph_release_t
*require_osd_release
)
2187 int r
= store
->read_meta("magic", &val
);
2192 r
= store
->read_meta("whoami", &val
);
2195 *whoami
= atoi(val
.c_str());
2197 r
= store
->read_meta("ceph_fsid", &val
);
2200 r
= cluster_fsid
->parse(val
.c_str());
2204 r
= store
->read_meta("fsid", &val
);
2206 *osd_fsid
= uuid_d();
2208 r
= osd_fsid
->parse(val
.c_str());
2213 r
= store
->read_meta("require_osd_release", &val
);
2215 *require_osd_release
= ceph_release_from_name(val
);
2223 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2227 OSD::OSD(CephContext
*cct_
, ObjectStore
*store_
,
2229 Messenger
*internal_messenger
,
2230 Messenger
*external_messenger
,
2231 Messenger
*hb_client_front
,
2232 Messenger
*hb_client_back
,
2233 Messenger
*hb_front_serverm
,
2234 Messenger
*hb_back_serverm
,
2235 Messenger
*osdc_messenger
,
2237 const std::string
&dev
, const std::string
&jdev
,
2238 ceph::async::io_context_pool
& poolctx
) :
2240 tick_timer(cct
, osd_lock
),
2241 tick_timer_without_osd_lock(cct
, tick_timer_lock
),
2242 gss_ktfile_client(cct
->_conf
.get_val
<std::string
>("gss_ktab_client_file")),
2243 cluster_messenger(internal_messenger
),
2244 client_messenger(external_messenger
),
2245 objecter_messenger(osdc_messenger
),
2247 mgrc(cct_
, client_messenger
, &mc
->monmap
),
2248 logger(create_logger()),
2249 recoverystate_perf(create_recoverystate_perf()),
2251 log_client(cct
, client_messenger
, &mc
->monmap
, LogClient::NO_FLAGS
),
2252 clog(log_client
.create_channel()),
2254 dev_path(dev
), journal_path(jdev
),
2255 store_is_rotational(store
->is_rotational()),
2256 trace_endpoint("0.0.0.0", 0, "osd"),
2258 m_osd_pg_epoch_max_lag_factor(cct
->_conf
.get_val
<double>(
2259 "osd_pg_epoch_max_lag_factor")),
2260 osd_compat(get_osd_compat_set()),
2261 osd_op_tp(cct
, "OSD::osd_op_tp", "tp_osd_tp",
2262 get_num_op_threads()),
2263 heartbeat_stop(false),
2264 heartbeat_need_update(true),
2265 hb_front_client_messenger(hb_client_front
),
2266 hb_back_client_messenger(hb_client_back
),
2267 hb_front_server_messenger(hb_front_serverm
),
2268 hb_back_server_messenger(hb_back_serverm
),
2270 heartbeat_thread(this),
2271 heartbeat_dispatcher(this),
2272 op_tracker(cct
, cct
->_conf
->osd_enable_op_tracker
,
2273 cct
->_conf
->osd_num_op_tracker_shard
),
2274 test_ops_hook(NULL
),
2277 ceph::make_timespan(cct
->_conf
->osd_op_thread_timeout
),
2278 ceph::make_timespan(cct
->_conf
->osd_op_thread_suicide_timeout
),
2280 last_pg_create_epoch(0),
2283 requested_full_first(0),
2284 requested_full_last(0),
2285 service(this, poolctx
)
2288 if (!gss_ktfile_client
.empty()) {
2289 // Assert we can export environment variable
2291 The default client keytab is used, if it is present and readable,
2292 to automatically obtain initial credentials for GSSAPI client
2293 applications. The principal name of the first entry in the client
2294 keytab is used by default when obtaining initial credentials.
2295 1. The KRB5_CLIENT_KTNAME environment variable.
2296 2. The default_client_keytab_name profile variable in [libdefaults].
2297 3. The hardcoded default, DEFCKTNAME.
2299 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2300 gss_ktfile_client
.c_str(), 1));
2301 ceph_assert(set_result
== 0);
2304 monc
->set_messenger(client_messenger
);
2305 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
2306 cct
->_conf
->osd_op_log_threshold
);
2307 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
2308 cct
->_conf
->osd_op_history_duration
);
2309 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
2310 cct
->_conf
->osd_op_history_slow_op_threshold
);
2311 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
2313 std::stringstream ss
;
2314 ss
<< "osd." << whoami
;
2315 trace_endpoint
.copy_name(ss
.str());
2318 // initialize shards
2319 num_shards
= get_num_op_shards();
2320 for (uint32_t i
= 0; i
< num_shards
; i
++) {
2321 OSDShard
*one_shard
= new OSDShard(
2325 shards
.push_back(one_shard
);
2331 while (!shards
.empty()) {
2332 delete shards
.back();
2335 cct
->get_perfcounters_collection()->remove(recoverystate_perf
);
2336 cct
->get_perfcounters_collection()->remove(logger
);
2337 delete recoverystate_perf
;
2342 double OSD::get_tick_interval() const
2344 // vary +/- 5% to avoid scrub scheduling livelocks
2345 constexpr auto delta
= 0.05;
2346 return (OSD_TICK_INTERVAL
*
2347 ceph::util::generate_random_number(1.0 - delta
, 1.0 + delta
));
2350 void OSD::handle_signal(int signum
)
2352 ceph_assert(signum
== SIGINT
|| signum
== SIGTERM
);
2353 derr
<< "*** Got signal " << sig_str(signum
) << " ***" << dendl
;
2359 std::lock_guard
lock(osd_lock
);
2363 if (store
->test_mount_in_use()) {
2364 derr
<< "OSD::pre_init: object store '" << dev_path
<< "' is "
2365 << "currently in use. (Is ceph-osd already running?)" << dendl
;
2369 cct
->_conf
.add_observer(this);
2373 int OSD::set_numa_affinity()
2375 // storage numa node
2376 int store_node
= -1;
2377 store
->get_numa_node(&store_node
, nullptr, nullptr);
2378 if (store_node
>= 0) {
2379 dout(1) << __func__
<< " storage numa node " << store_node
<< dendl
;
2382 // check network numa node(s)
2383 int front_node
= -1, back_node
= -1;
2384 string front_iface
= pick_iface(
2386 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
2387 string back_iface
= pick_iface(
2389 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
2390 int r
= get_iface_numa_node(front_iface
, &front_node
);
2391 if (r
>= 0 && front_node
>= 0) {
2392 dout(1) << __func__
<< " public network " << front_iface
<< " numa node "
2393 << front_node
<< dendl
;
2394 r
= get_iface_numa_node(back_iface
, &back_node
);
2395 if (r
>= 0 && back_node
>= 0) {
2396 dout(1) << __func__
<< " cluster network " << back_iface
<< " numa node "
2397 << back_node
<< dendl
;
2398 if (front_node
== back_node
&&
2399 front_node
== store_node
) {
2400 dout(1) << " objectstore and network numa nodes all match" << dendl
;
2401 if (g_conf().get_val
<bool>("osd_numa_auto_affinity")) {
2402 numa_node
= front_node
;
2404 } else if (front_node
!= back_node
) {
2405 dout(1) << __func__
<< " public and cluster network numa nodes do not match"
2408 dout(1) << __func__
<< " objectstore and network numa nodes do not match"
2411 } else if (back_node
== -2) {
2412 dout(1) << __func__
<< " cluster network " << back_iface
2413 << " ports numa nodes do not match" << dendl
;
2415 derr
<< __func__
<< " unable to identify cluster interface '" << back_iface
2416 << "' numa node: " << cpp_strerror(r
) << dendl
;
2418 } else if (front_node
== -2) {
2419 dout(1) << __func__
<< " public network " << front_iface
2420 << " ports numa nodes do not match" << dendl
;
2422 derr
<< __func__
<< " unable to identify public interface '" << front_iface
2423 << "' numa node: " << cpp_strerror(r
) << dendl
;
2425 if (int node
= g_conf().get_val
<int64_t>("osd_numa_node"); node
>= 0) {
2426 // this takes precedence over the automagic logic above
2429 if (numa_node
>= 0) {
2430 int r
= get_numa_node_cpu_set(numa_node
, &numa_cpu_set_size
, &numa_cpu_set
);
2432 dout(1) << __func__
<< " unable to determine numa node " << numa_node
2433 << " CPUs" << dendl
;
2436 dout(1) << __func__
<< " setting numa affinity to node " << numa_node
2438 << cpu_set_to_str_list(numa_cpu_set_size
, &numa_cpu_set
)
2440 r
= set_cpu_affinity_all_threads(numa_cpu_set_size
, &numa_cpu_set
);
2443 derr
<< __func__
<< " failed to set numa affinity: " << cpp_strerror(r
)
2449 dout(1) << __func__
<< " not setting numa affinity" << dendl
;
2456 class OSDSocketHook
: public AdminSocketHook
{
2459 explicit OSDSocketHook(OSD
*o
) : osd(o
) {}
2460 int call(std::string_view prefix
, const cmdmap_t
& cmdmap
,
2463 bufferlist
& out
) override
{
2464 ceph_abort("should use async hook");
2467 std::string_view prefix
,
2468 const cmdmap_t
& cmdmap
,
2470 const bufferlist
& inbl
,
2471 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
) override
{
2473 osd
->asok_command(prefix
, cmdmap
, f
, inbl
, on_finish
);
2474 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2476 on_finish(-EINVAL
, e
.what(), empty
);
2481 std::set
<int64_t> OSD::get_mapped_pools()
2483 std::set
<int64_t> pools
;
2484 std::vector
<spg_t
> pgids
;
2486 for (const auto &pgid
: pgids
) {
2487 pools
.insert(pgid
.pool());
2492 void OSD::asok_command(
2493 std::string_view prefix
, const cmdmap_t
& cmdmap
,
2495 const bufferlist
& inbl
,
2496 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
2499 stringstream ss
; // stderr error message stream
2500 bufferlist outbl
; // if empty at end, we'll dump formatter as output
2502 // --- PG commands are routed here to PG::do_command ---
2503 if (prefix
== "pg" ||
2504 prefix
== "query" ||
2505 prefix
== "mark_unfound_lost" ||
2506 prefix
== "list_unfound" ||
2507 prefix
== "scrub" ||
2508 prefix
== "deep_scrub"
2512 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
2513 ss
<< "no pgid specified";
2517 if (!pgid
.parse(pgidstr
.c_str())) {
2518 ss
<< "couldn't parse pgid '" << pgidstr
<< "'";
2524 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
2525 (pg
= _lookup_lock_pg(pcand
))) {
2526 if (pg
->is_primary()) {
2527 cmdmap_t new_cmdmap
= cmdmap
;
2529 pg
->do_command(prefix
, new_cmdmap
, inbl
, on_finish
);
2531 return; // the pg handler calls on_finish directly
2532 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
2539 ss
<< "not primary for pgid " << pgid
;
2540 // do not reply; they will get newer maps and realize they
2547 ss
<< "i don't have pgid " << pgid
;
2552 // --- OSD commands follow ---
2554 else if (prefix
== "status") {
2555 lock_guard
l(osd_lock
);
2556 f
->open_object_section("status");
2557 f
->dump_stream("cluster_fsid") << superblock
.cluster_fsid
;
2558 f
->dump_stream("osd_fsid") << superblock
.osd_fsid
;
2559 f
->dump_unsigned("whoami", superblock
.whoami
);
2560 f
->dump_string("state", get_state_name(get_state()));
2561 f
->dump_unsigned("oldest_map", superblock
.oldest_map
);
2562 f
->dump_unsigned("newest_map", superblock
.newest_map
);
2563 f
->dump_unsigned("num_pgs", num_pgs
);
2565 } else if (prefix
== "flush_journal") {
2566 store
->flush_journal();
2567 } else if (prefix
== "dump_ops_in_flight" ||
2569 prefix
== "dump_blocked_ops" ||
2570 prefix
== "dump_historic_ops" ||
2571 prefix
== "dump_historic_ops_by_duration" ||
2572 prefix
== "dump_historic_slow_ops") {
2574 const string error_str
= "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2575 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2576 will start to track new ops received afterwards.";
2578 set
<string
> filters
;
2579 vector
<string
> filter_str
;
2580 if (cmd_getval(cmdmap
, "filterstr", filter_str
)) {
2581 copy(filter_str
.begin(), filter_str
.end(),
2582 inserter(filters
, filters
.end()));
2585 if (prefix
== "dump_ops_in_flight" ||
2587 if (!op_tracker
.dump_ops_in_flight(f
, false, filters
)) {
2593 if (prefix
== "dump_blocked_ops") {
2594 if (!op_tracker
.dump_ops_in_flight(f
, true, filters
)) {
2600 if (prefix
== "dump_historic_ops") {
2601 if (!op_tracker
.dump_historic_ops(f
, false, filters
)) {
2607 if (prefix
== "dump_historic_ops_by_duration") {
2608 if (!op_tracker
.dump_historic_ops(f
, true, filters
)) {
2614 if (prefix
== "dump_historic_slow_ops") {
2615 if (!op_tracker
.dump_historic_slow_ops(f
, filters
)) {
2621 } else if (prefix
== "dump_op_pq_state") {
2622 f
->open_object_section("pq");
2623 op_shardedwq
.dump(f
);
2625 } else if (prefix
== "dump_blocklist") {
2626 list
<pair
<entity_addr_t
,utime_t
> > bl
;
2627 OSDMapRef curmap
= service
.get_osdmap();
2629 f
->open_array_section("blocklist");
2630 curmap
->get_blocklist(&bl
);
2631 for (list
<pair
<entity_addr_t
,utime_t
> >::iterator it
= bl
.begin();
2632 it
!= bl
.end(); ++it
) {
2633 f
->open_object_section("entry");
2634 f
->open_object_section("entity_addr_t");
2636 f
->close_section(); //entity_addr_t
2637 it
->second
.localtime(f
->dump_stream("expire_time"));
2638 f
->close_section(); //entry
2640 f
->close_section(); //blocklist
2641 } else if (prefix
== "dump_watchers") {
2642 list
<obj_watch_item_t
> watchers
;
2646 for (auto& pg
: pgs
) {
2647 list
<obj_watch_item_t
> pg_watchers
;
2648 pg
->get_watchers(&pg_watchers
);
2649 watchers
.splice(watchers
.end(), pg_watchers
);
2652 f
->open_array_section("watchers");
2653 for (list
<obj_watch_item_t
>::iterator it
= watchers
.begin();
2654 it
!= watchers
.end(); ++it
) {
2656 f
->open_object_section("watch");
2658 f
->dump_string("namespace", it
->obj
.nspace
);
2659 f
->dump_string("object", it
->obj
.oid
.name
);
2661 f
->open_object_section("entity_name");
2662 it
->wi
.name
.dump(f
);
2663 f
->close_section(); //entity_name_t
2665 f
->dump_unsigned("cookie", it
->wi
.cookie
);
2666 f
->dump_unsigned("timeout", it
->wi
.timeout_seconds
);
2668 f
->open_object_section("entity_addr_t");
2669 it
->wi
.addr
.dump(f
);
2670 f
->close_section(); //entity_addr_t
2672 f
->close_section(); //watch
2675 f
->close_section(); //watchers
2676 } else if (prefix
== "dump_recovery_reservations") {
2677 f
->open_object_section("reservations");
2678 f
->open_object_section("local_reservations");
2679 service
.local_reserver
.dump(f
);
2681 f
->open_object_section("remote_reservations");
2682 service
.remote_reserver
.dump(f
);
2685 } else if (prefix
== "dump_scrub_reservations") {
2686 f
->open_object_section("scrub_reservations");
2687 service
.dump_scrub_reservations(f
);
2689 } else if (prefix
== "get_latest_osdmap") {
2690 get_latest_osdmap();
2691 } else if (prefix
== "set_heap_property") {
2695 bool success
= false;
2696 if (!cmd_getval(cmdmap
, "property", property
)) {
2697 error
= "unable to get property";
2699 } else if (!cmd_getval(cmdmap
, "value", value
)) {
2700 error
= "unable to get value";
2702 } else if (value
< 0) {
2703 error
= "negative value not allowed";
2705 } else if (!ceph_heap_set_numeric_property(property
.c_str(), (size_t)value
)) {
2706 error
= "invalid property";
2711 f
->open_object_section("result");
2712 f
->dump_string("error", error
);
2713 f
->dump_bool("success", success
);
2715 } else if (prefix
== "get_heap_property") {
2719 bool success
= false;
2720 if (!cmd_getval(cmdmap
, "property", property
)) {
2721 error
= "unable to get property";
2723 } else if (!ceph_heap_get_numeric_property(property
.c_str(), &value
)) {
2724 error
= "invalid property";
2729 f
->open_object_section("result");
2730 f
->dump_string("error", error
);
2731 f
->dump_bool("success", success
);
2732 f
->dump_int("value", value
);
2734 } else if (prefix
== "dump_objectstore_kv_stats") {
2735 store
->get_db_statistics(f
);
2736 } else if (prefix
== "dump_scrubs") {
2737 service
.dumps_scrub(f
);
2738 } else if (prefix
== "calc_objectstore_db_histogram") {
2739 store
->generate_db_histogram(f
);
2740 } else if (prefix
== "flush_store_cache") {
2741 store
->flush_cache(&ss
);
2742 } else if (prefix
== "dump_pgstate_history") {
2743 f
->open_object_section("pgstate_history");
2744 f
->open_array_section("pgs");
2747 for (auto& pg
: pgs
) {
2748 f
->open_object_section("pg");
2749 f
->dump_stream("pg") << pg
->pg_id
;
2750 f
->dump_string("currently", pg
->get_current_state());
2751 pg
->dump_pgstate_history(f
);
2756 } else if (prefix
== "compact") {
2757 dout(1) << "triggering manual compaction" << dendl
;
2758 auto start
= ceph::coarse_mono_clock::now();
2760 auto end
= ceph::coarse_mono_clock::now();
2761 double duration
= std::chrono::duration
<double>(end
-start
).count();
2762 dout(1) << "finished manual compaction in "
2764 << " seconds" << dendl
;
2765 f
->open_object_section("compact_result");
2766 f
->dump_float("elapsed_time", duration
);
2768 } else if (prefix
== "get_mapped_pools") {
2769 f
->open_array_section("mapped_pools");
2770 set
<int64_t> poollist
= get_mapped_pools();
2771 for (auto pool
: poollist
) {
2772 f
->dump_int("pool_id", pool
);
2775 } else if (prefix
== "smart") {
2777 cmd_getval(cmdmap
, "devid", devid
);
2779 probe_smart(devid
, out
);
2780 outbl
.append(out
.str());
2781 } else if (prefix
== "list_devices") {
2782 set
<string
> devnames
;
2783 store
->get_devices(&devnames
);
2784 f
->open_array_section("list_devices");
2785 for (auto dev
: devnames
) {
2786 if (dev
.find("dm-") == 0) {
2790 f
->open_object_section("device");
2791 f
->dump_string("device", "/dev/" + dev
);
2792 f
->dump_string("device_id", get_device_id(dev
, &err
));
2796 } else if (prefix
== "send_beacon") {
2797 lock_guard
l(osd_lock
);
2799 send_beacon(ceph::coarse_mono_clock::now());
2803 else if (prefix
== "cluster_log") {
2805 cmd_getval(cmdmap
, "message", msg
);
2808 ss
<< "ignoring empty log message";
2811 string message
= msg
.front();
2812 for (vector
<string
>::iterator a
= ++msg
.begin(); a
!= msg
.end(); ++a
)
2813 message
+= " " + *a
;
2815 cmd_getval(cmdmap
, "level", lvl
);
2816 clog_type level
= string_to_clog_type(lvl
);
2819 ss
<< "unknown level '" << lvl
<< "'";
2822 clog
->do_log(level
, message
);
2825 else if (prefix
== "bench") {
2828 int64_t osize
, onum
;
2829 // default count 1G, size 4MB
2830 cmd_getval(cmdmap
, "count", count
, (int64_t)1 << 30);
2831 cmd_getval(cmdmap
, "size", bsize
, (int64_t)4 << 20);
2832 cmd_getval(cmdmap
, "object_size", osize
, (int64_t)0);
2833 cmd_getval(cmdmap
, "object_num", onum
, (int64_t)0);
2834 double elapsed
= 0.0;
2836 ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
2841 double rate
= count
/ elapsed
;
2842 double iops
= rate
/ bsize
;
2843 f
->open_object_section("osd_bench_results");
2844 f
->dump_int("bytes_written", count
);
2845 f
->dump_int("blocksize", bsize
);
2846 f
->dump_float("elapsed_sec", elapsed
);
2847 f
->dump_float("bytes_per_sec", rate
);
2848 f
->dump_float("iops", iops
);
2852 else if (prefix
== "flush_pg_stats") {
2853 mgrc
.send_pgstats();
2854 f
->dump_unsigned("stat_seq", service
.get_osd_stat_seq());
2857 else if (prefix
== "heap") {
2858 ret
= ceph::osd_cmds::heap(*cct
, cmdmap
, *f
, ss
);
2861 else if (prefix
== "debug dump_missing") {
2862 f
->open_array_section("pgs");
2865 for (auto& pg
: pgs
) {
2866 string s
= stringify(pg
->pg_id
);
2867 f
->open_array_section(s
.c_str());
2869 pg
->dump_missing(f
);
2876 else if (prefix
== "debug kick_recovery_wq") {
2878 cmd_getval(cmdmap
, "delay", delay
);
2881 ret
= cct
->_conf
.set_val("osd_recovery_delay_start", oss
.str().c_str());
2883 ss
<< "kick_recovery_wq: error setting "
2884 << "osd_recovery_delay_start to '" << delay
<< "': error "
2888 cct
->_conf
.apply_changes(nullptr);
2889 ss
<< "kicking recovery queue. set osd_recovery_delay_start "
2890 << "to " << cct
->_conf
->osd_recovery_delay_start
;
2893 else if (prefix
== "cpu_profiler") {
2896 cmd_getval(cmdmap
, "arg", arg
);
2897 vector
<string
> argvec
;
2898 get_str_vec(arg
, argvec
);
2899 cpu_profiler_handle_command(argvec
, ds
);
2900 outbl
.append(ds
.str());
2903 else if (prefix
== "dump_pg_recovery_stats") {
2904 lock_guard
l(osd_lock
);
2905 pg_recovery_stats
.dump_formatted(f
);
2908 else if (prefix
== "reset_pg_recovery_stats") {
2909 lock_guard
l(osd_lock
);
2910 pg_recovery_stats
.reset();
2913 else if (prefix
== "perf histogram dump") {
2915 std::string counter
;
2916 cmd_getval(cmdmap
, "logger", logger
);
2917 cmd_getval(cmdmap
, "counter", counter
);
2918 cct
->get_perfcounters_collection()->dump_formatted_histograms(
2919 f
, false, logger
, counter
);
2922 else if (prefix
== "cache drop") {
2923 lock_guard
l(osd_lock
);
2924 dout(20) << "clearing all caches" << dendl
;
2925 // Clear the objectstore's cache - onode and buffer for Bluestore,
2926 // system's pagecache for Filestore
2927 ret
= store
->flush_cache(&ss
);
2929 ss
<< "Error flushing objectstore cache: " << cpp_strerror(ret
);
2932 // Clear the objectcontext cache (per PG)
2935 for (auto& pg
: pgs
) {
2940 else if (prefix
== "cache status") {
2941 lock_guard
l(osd_lock
);
2942 int obj_ctx_count
= 0;
2945 for (auto& pg
: pgs
) {
2946 obj_ctx_count
+= pg
->get_cache_obj_count();
2948 f
->open_object_section("cache_status");
2949 f
->dump_int("object_ctx", obj_ctx_count
);
2950 store
->dump_cache_stats(f
);
2954 else if (prefix
== "scrub_purged_snaps") {
2955 lock_guard
l(osd_lock
);
2956 scrub_purged_snaps();
2959 else if (prefix
== "dump_osd_network") {
2960 lock_guard
l(osd_lock
);
2962 if (!(cmd_getval(cmdmap
, "value", value
))) {
2963 // Convert milliseconds to microseconds
2964 value
= static_cast<double>(g_conf().get_val
<double>(
2965 "mon_warn_on_slow_ping_time")) * 1000;
2967 double ratio
= g_conf().get_val
<double>("mon_warn_on_slow_ping_ratio");
2968 value
= g_conf().get_val
<int64_t>("osd_heartbeat_grace");
2969 value
*= 1000000 * ratio
; // Seconds of grace to microseconds at ratio
2972 // Convert user input to microseconds
2975 if (value
< 0) value
= 0;
2977 struct osd_ping_time_t
{
2981 std::array
<uint32_t,3> times
;
2982 std::array
<uint32_t,3> min
;
2983 std::array
<uint32_t,3> max
;
2985 uint32_t last_update
;
2987 bool operator<(const osd_ping_time_t
& rhs
) const {
2988 if (pingtime
< rhs
.pingtime
)
2990 if (pingtime
> rhs
.pingtime
)
3000 set
<osd_ping_time_t
> sorted
;
3001 // Get pingtimes under lock and not on the stack
3002 map
<int, osd_stat_t::Interfaces
> *pingtimes
= new map
<int, osd_stat_t::Interfaces
>;
3003 service
.get_hb_pingtime(pingtimes
);
3004 for (auto j
: *pingtimes
) {
3005 if (j
.second
.last_update
== 0)
3007 osd_ping_time_t item
;
3008 item
.pingtime
= std::max(j
.second
.back_pingtime
[0], j
.second
.back_pingtime
[1]);
3009 item
.pingtime
= std::max(item
.pingtime
, j
.second
.back_pingtime
[2]);
3010 if (item
.pingtime
>= value
) {
3012 item
.times
[0] = j
.second
.back_pingtime
[0];
3013 item
.times
[1] = j
.second
.back_pingtime
[1];
3014 item
.times
[2] = j
.second
.back_pingtime
[2];
3015 item
.min
[0] = j
.second
.back_min
[0];
3016 item
.min
[1] = j
.second
.back_min
[1];
3017 item
.min
[2] = j
.second
.back_min
[2];
3018 item
.max
[0] = j
.second
.back_max
[0];
3019 item
.max
[1] = j
.second
.back_max
[1];
3020 item
.max
[2] = j
.second
.back_max
[2];
3021 item
.last
= j
.second
.back_last
;
3023 item
.last_update
= j
.second
.last_update
;
3024 sorted
.emplace(item
);
3026 if (j
.second
.front_last
== 0)
3028 item
.pingtime
= std::max(j
.second
.front_pingtime
[0], j
.second
.front_pingtime
[1]);
3029 item
.pingtime
= std::max(item
.pingtime
, j
.second
.front_pingtime
[2]);
3030 if (item
.pingtime
>= value
) {
3032 item
.times
[0] = j
.second
.front_pingtime
[0];
3033 item
.times
[1] = j
.second
.front_pingtime
[1];
3034 item
.times
[2] = j
.second
.front_pingtime
[2];
3035 item
.min
[0] = j
.second
.front_min
[0];
3036 item
.min
[1] = j
.second
.front_min
[1];
3037 item
.min
[2] = j
.second
.front_min
[2];
3038 item
.max
[0] = j
.second
.front_max
[0];
3039 item
.max
[1] = j
.second
.front_max
[1];
3040 item
.max
[2] = j
.second
.front_max
[2];
3041 item
.last
= j
.second
.front_last
;
3042 item
.last_update
= j
.second
.last_update
;
3044 sorted
.emplace(item
);
3049 // Network ping times (1min 5min 15min)
3050 f
->open_object_section("network_ping_times");
3051 f
->dump_int("threshold", value
/ 1000);
3052 f
->open_array_section("entries");
3053 for (auto &sitem
: boost::adaptors::reverse(sorted
)) {
3054 ceph_assert(sitem
.pingtime
>= value
);
3055 f
->open_object_section("entry");
3057 const time_t lu(sitem
.last_update
);
3059 string
lustr(ctime_r(&lu
, buffer
));
3060 lustr
.pop_back(); // Remove trailing \n
3061 auto stale
= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
3062 f
->dump_string("last update", lustr
);
3063 f
->dump_bool("stale", ceph_clock_now().sec() - sitem
.last_update
> stale
);
3064 f
->dump_int("from osd", whoami
);
3065 f
->dump_int("to osd", sitem
.to
);
3066 f
->dump_string("interface", (sitem
.back
? "back" : "front"));
3067 f
->open_object_section("average");
3068 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.times
[0],3).c_str());
3069 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.times
[1],3).c_str());
3070 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.times
[2],3).c_str());
3071 f
->close_section(); // average
3072 f
->open_object_section("min");
3073 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3074 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3075 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3076 f
->close_section(); // min
3077 f
->open_object_section("max");
3078 f
->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem
.max
[0],3).c_str());
3079 f
->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem
.max
[1],3).c_str());
3080 f
->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem
.max
[2],3).c_str());
3081 f
->close_section(); // max
3082 f
->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem
.last
,3).c_str());
3083 f
->close_section(); // entry
3085 f
->close_section(); // entries
3086 f
->close_section(); // network_ping_times
3088 ceph_abort_msg("broken asok registration");
3092 on_finish(ret
, ss
.str(), outbl
);
3095 int OSD::run_osd_bench_test(
3104 uint32_t duration
= cct
->_conf
->osd_bench_duration
;
3106 if (bsize
> (int64_t) cct
->_conf
->osd_bench_max_block_size
) {
3107 // let us limit the block size because the next checks rely on it
3108 // having a sane value. If we allow any block size to be set things
3109 // can still go sideways.
3110 ss
<< "block 'size' values are capped at "
3111 << byte_u_t(cct
->_conf
->osd_bench_max_block_size
) << ". If you wish to use"
3112 << " a higher value, please adjust 'osd_bench_max_block_size'";
3115 } else if (bsize
< (int64_t) (1 << 20)) {
3116 // entering the realm of small block sizes.
3117 // limit the count to a sane value, assuming a configurable amount of
3118 // IOPS and duration, so that the OSD doesn't get hung up on this,
3119 // preventing timeouts from going off
3121 bsize
* duration
* cct
->_conf
->osd_bench_small_size_max_iops
;
3122 if (count
> max_count
) {
3123 ss
<< "'count' values greater than " << max_count
3124 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3125 << cct
->_conf
->osd_bench_small_size_max_iops
<< " IOPS,"
3126 << " for " << duration
<< " seconds,"
3127 << " can cause ill effects on osd. "
3128 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3129 << " value if you wish to use a higher 'count'.";
3134 // 1MB block sizes are big enough so that we get more stuff done.
3135 // However, to avoid the osd from getting hung on this and having
3136 // timers being triggered, we are going to limit the count assuming
3137 // a configurable throughput and duration.
3138 // NOTE: max_count is the total amount of bytes that we believe we
3139 // will be able to write during 'duration' for the given
3140 // throughput. The block size hardly impacts this unless it's
3141 // way too big. Given we already check how big the block size
3142 // is, it's safe to assume everything will check out.
3144 cct
->_conf
->osd_bench_large_size_max_throughput
* duration
;
3145 if (count
> max_count
) {
3146 ss
<< "'count' values greater than " << max_count
3147 << " for a block size of " << byte_u_t(bsize
) << ", assuming "
3148 << byte_u_t(cct
->_conf
->osd_bench_large_size_max_throughput
) << "/s,"
3149 << " for " << duration
<< " seconds,"
3150 << " can cause ill effects on osd. "
3151 << " Please adjust 'osd_bench_large_size_max_throughput'"
3152 << " with a higher value if you wish to use a higher 'count'.";
3158 if (osize
&& bsize
> osize
) {
3162 dout(1) << " bench count " << count
3163 << " bsize " << byte_u_t(bsize
) << dendl
;
3165 ObjectStore::Transaction cleanupt
;
3167 if (osize
&& onum
) {
3169 bufferptr
bp(osize
);
3171 bl
.push_back(std::move(bp
));
3172 bl
.rebuild_page_aligned();
3173 for (int i
=0; i
<onum
; ++i
) {
3175 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", i
);
3177 hobject_t
soid(sobject_t(oid
, 0));
3178 ObjectStore::Transaction t
;
3179 t
.write(coll_t(), ghobject_t(soid
), 0, osize
, bl
);
3180 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3181 cleanupt
.remove(coll_t(), ghobject_t(soid
));
3186 bufferptr
bp(bsize
);
3188 bl
.push_back(std::move(bp
));
3189 bl
.rebuild_page_aligned();
3193 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3198 utime_t start
= ceph_clock_now();
3199 for (int64_t pos
= 0; pos
< count
; pos
+= bsize
) {
3201 unsigned offset
= 0;
3202 if (onum
&& osize
) {
3203 snprintf(nm
, sizeof(nm
), "disk_bw_test_%d", (int)(rand() % onum
));
3204 offset
= rand() % (osize
/ bsize
) * bsize
;
3206 snprintf(nm
, sizeof(nm
), "disk_bw_test_%lld", (long long)pos
);
3209 hobject_t
soid(sobject_t(oid
, 0));
3210 ObjectStore::Transaction t
;
3211 t
.write(coll_t::meta(), ghobject_t(soid
), offset
, bsize
, bl
);
3212 store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
3213 if (!onum
|| !osize
) {
3214 cleanupt
.remove(coll_t::meta(), ghobject_t(soid
));
3220 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3224 utime_t end
= ceph_clock_now();
3225 *elapsed
= end
- start
;
3228 store
->queue_transaction(service
.meta_ch
, std::move(cleanupt
), nullptr);
3231 if (!service
.meta_ch
->flush_commit(&waiter
)) {
3239 class TestOpsSocketHook
: public AdminSocketHook
{
3240 OSDService
*service
;
3243 TestOpsSocketHook(OSDService
*s
, ObjectStore
*st
) : service(s
), store(st
) {}
3244 int call(std::string_view command
, const cmdmap_t
& cmdmap
,
3246 std::ostream
& errss
,
3247 bufferlist
& out
) override
{
3251 test_ops(service
, store
, command
, cmdmap
, outss
);
3253 } catch (const TOPNSPC::common::bad_cmd_get
& e
) {
3259 void test_ops(OSDService
*service
, ObjectStore
*store
,
3260 std::string_view command
, const cmdmap_t
& cmdmap
, ostream
&ss
);
3264 class OSD::C_Tick
: public Context
{
3267 explicit C_Tick(OSD
*o
) : osd(o
) {}
3268 void finish(int r
) override
{
3273 class OSD::C_Tick_WithoutOSDLock
: public Context
{
3276 explicit C_Tick_WithoutOSDLock(OSD
*o
) : osd(o
) {}
3277 void finish(int r
) override
{
3278 osd
->tick_without_osd_lock();
3282 int OSD::enable_disable_fuse(bool stop
)
3286 string mntpath
= cct
->_conf
->osd_data
+ "/fuse";
3287 if (fuse_store
&& (stop
|| !cct
->_conf
->osd_objectstore_fuse
)) {
3288 dout(1) << __func__
<< " disabling" << dendl
;
3292 r
= ::rmdir(mntpath
.c_str());
3295 derr
<< __func__
<< " failed to rmdir " << mntpath
<< ": "
3296 << cpp_strerror(r
) << dendl
;
3301 if (!fuse_store
&& cct
->_conf
->osd_objectstore_fuse
) {
3302 dout(1) << __func__
<< " enabling" << dendl
;
3303 r
= ::mkdir(mntpath
.c_str(), 0700);
3306 if (r
< 0 && r
!= -EEXIST
) {
3307 derr
<< __func__
<< " unable to create " << mntpath
<< ": "
3308 << cpp_strerror(r
) << dendl
;
3311 fuse_store
= new FuseStore(store
, mntpath
);
3312 r
= fuse_store
->start();
3314 derr
<< __func__
<< " unable to start fuse: " << cpp_strerror(r
) << dendl
;
3320 #endif // HAVE_LIBFUSE
3324 size_t OSD::get_num_cache_shards()
3326 return cct
->_conf
.get_val
<Option::size_t>("osd_num_cache_shards");
3329 int OSD::get_num_op_shards()
3331 if (cct
->_conf
->osd_op_num_shards
)
3332 return cct
->_conf
->osd_op_num_shards
;
3333 if (store_is_rotational
)
3334 return cct
->_conf
->osd_op_num_shards_hdd
;
3336 return cct
->_conf
->osd_op_num_shards_ssd
;
3339 int OSD::get_num_op_threads()
3341 if (cct
->_conf
->osd_op_num_threads_per_shard
)
3342 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard
;
3343 if (store_is_rotational
)
3344 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_hdd
;
3346 return get_num_op_shards() * cct
->_conf
->osd_op_num_threads_per_shard_ssd
;
3349 float OSD::get_osd_recovery_sleep()
3351 if (cct
->_conf
->osd_recovery_sleep
)
3352 return cct
->_conf
->osd_recovery_sleep
;
3353 if (!store_is_rotational
&& !journal_is_rotational
)
3354 return cct
->_conf
->osd_recovery_sleep_ssd
;
3355 else if (store_is_rotational
&& !journal_is_rotational
)
3356 return cct
->_conf
.get_val
<double>("osd_recovery_sleep_hybrid");
3358 return cct
->_conf
->osd_recovery_sleep_hdd
;
3361 float OSD::get_osd_delete_sleep()
3363 float osd_delete_sleep
= cct
->_conf
.get_val
<double>("osd_delete_sleep");
3364 if (osd_delete_sleep
> 0)
3365 return osd_delete_sleep
;
3366 if (!store_is_rotational
&& !journal_is_rotational
)
3367 return cct
->_conf
.get_val
<double>("osd_delete_sleep_ssd");
3368 if (store_is_rotational
&& !journal_is_rotational
)
3369 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hybrid");
3370 return cct
->_conf
.get_val
<double>("osd_delete_sleep_hdd");
3373 int OSD::get_recovery_max_active()
3375 if (cct
->_conf
->osd_recovery_max_active
)
3376 return cct
->_conf
->osd_recovery_max_active
;
3377 if (store_is_rotational
)
3378 return cct
->_conf
->osd_recovery_max_active_hdd
;
3380 return cct
->_conf
->osd_recovery_max_active_ssd
;
3383 float OSD::get_osd_snap_trim_sleep()
3385 float osd_snap_trim_sleep
= cct
->_conf
.get_val
<double>("osd_snap_trim_sleep");
3386 if (osd_snap_trim_sleep
> 0)
3387 return osd_snap_trim_sleep
;
3388 if (!store_is_rotational
&& !journal_is_rotational
)
3389 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_ssd");
3390 if (store_is_rotational
&& !journal_is_rotational
)
3391 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hybrid");
3392 return cct
->_conf
.get_val
<double>("osd_snap_trim_sleep_hdd");
3398 CompatSet initial
, diff
;
3399 std::lock_guard
lock(osd_lock
);
3404 tick_timer_without_osd_lock
.init();
3405 service
.recovery_request_timer
.init();
3406 service
.sleep_timer
.init();
3408 boot_finisher
.start();
3412 store
->read_meta("require_osd_release", &val
);
3413 last_require_osd_release
= ceph_release_from_name(val
);
3417 dout(2) << "init " << dev_path
3418 << " (looks like " << (store_is_rotational
? "hdd" : "ssd") << ")"
3420 dout(2) << "journal " << journal_path
<< dendl
;
3421 ceph_assert(store
); // call pre_init() first!
3423 store
->set_cache_shards(get_num_cache_shards());
3425 int r
= store
->mount();
3427 derr
<< "OSD:init: unable to mount object store" << dendl
;
3430 journal_is_rotational
= store
->is_journal_rotational();
3431 dout(2) << "journal looks like " << (journal_is_rotational
? "hdd" : "ssd")
3434 enable_disable_fuse(false);
3436 dout(2) << "boot" << dendl
;
3438 service
.meta_ch
= store
->open_collection(coll_t::meta());
3440 // initialize the daily loadavg with current 15min loadavg
3442 if (getloadavg(loadavgs
, 3) == 3) {
3443 daily_loadavg
= loadavgs
[2];
3445 derr
<< "OSD::init() : couldn't read loadavgs\n" << dendl
;
3446 daily_loadavg
= 1.0;
3449 int rotating_auth_attempts
= 0;
3450 auto rotating_auth_timeout
=
3451 g_conf().get_val
<int64_t>("rotating_keys_bootstrap_timeout");
3453 // sanity check long object name handling
3456 l
.oid
.name
= string(cct
->_conf
->osd_max_object_name_len
, 'n');
3457 l
.set_key(string(cct
->_conf
->osd_max_object_name_len
, 'k'));
3458 l
.nspace
= string(cct
->_conf
->osd_max_object_namespace_len
, 's');
3459 r
= store
->validate_hobject_key(l
);
3461 derr
<< "backend (" << store
->get_type() << ") is unable to support max "
3462 << "object name[space] len" << dendl
;
3463 derr
<< " osd max object name len = "
3464 << cct
->_conf
->osd_max_object_name_len
<< dendl
;
3465 derr
<< " osd max object namespace len = "
3466 << cct
->_conf
->osd_max_object_namespace_len
<< dendl
;
3467 derr
<< cpp_strerror(r
) << dendl
;
3468 if (cct
->_conf
->osd_check_max_object_name_len_on_startup
) {
3471 derr
<< "osd_check_max_object_name_len_on_startup = false, starting anyway"
3474 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl
;
3479 r
= read_superblock();
3481 derr
<< "OSD::init() : unable to read osd superblock" << dendl
;
3486 if (osd_compat
.compare(superblock
.compat_features
) < 0) {
3487 derr
<< "The disk uses features unsupported by the executable." << dendl
;
3488 derr
<< " ondisk features " << superblock
.compat_features
<< dendl
;
3489 derr
<< " daemon features " << osd_compat
<< dendl
;
3491 if (osd_compat
.writeable(superblock
.compat_features
)) {
3492 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3493 derr
<< "it is still writeable, though. Missing features: " << diff
<< dendl
;
3498 CompatSet diff
= osd_compat
.unsupported(superblock
.compat_features
);
3499 derr
<< "Cannot write to disk! Missing features: " << diff
<< dendl
;
3505 assert_warn(whoami
== superblock
.whoami
);
3506 if (whoami
!= superblock
.whoami
) {
3507 derr
<< "OSD::init: superblock says osd"
3508 << superblock
.whoami
<< " but I am osd." << whoami
<< dendl
;
3513 startup_time
= ceph::mono_clock::now();
3515 // load up "current" osdmap
3516 assert_warn(!get_osdmap());
3518 derr
<< "OSD::init: unable to read current osdmap" << dendl
;
3522 osdmap
= get_map(superblock
.current_epoch
);
3525 // make sure we don't have legacy pgs deleting
3528 int r
= store
->list_collections(ls
);
3529 ceph_assert(r
>= 0);
3532 if (c
.is_pg(&pgid
) &&
3533 !osdmap
->have_pg_pool(pgid
.pool())) {
3534 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
3535 if (!store
->exists(service
.meta_ch
, oid
)) {
3536 derr
<< __func__
<< " missing pg_pool_t for deleted pool "
3537 << pgid
.pool() << " for pg " << pgid
3538 << "; please downgrade to luminous and allow "
3539 << "pg deletion to complete before upgrading" << dendl
;
3546 initial
= get_osd_initial_compat_set();
3547 diff
= superblock
.compat_features
.unsupported(initial
);
3548 if (superblock
.compat_features
.merge(initial
)) {
3549 // Are we adding SNAPMAPPER2?
3550 if (diff
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2
)) {
3551 dout(1) << __func__
<< " upgrade snap_mapper (first start as octopus)"
3553 auto ch
= service
.meta_ch
;
3554 auto hoid
= make_snapmapper_oid();
3555 unsigned max
= cct
->_conf
->osd_target_transaction_size
;
3556 r
= SnapMapper::convert_legacy(cct
, store
, ch
, hoid
, max
);
3560 // We need to persist the new compat_set before we
3562 dout(5) << "Upgrading superblock adding: " << diff
<< dendl
;
3563 ObjectStore::Transaction t
;
3564 write_superblock(t
);
3565 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3570 // make sure snap mapper object exists
3571 if (!store
->exists(service
.meta_ch
, OSD::make_snapmapper_oid())) {
3572 dout(10) << "init creating/touching snapmapper object" << dendl
;
3573 ObjectStore::Transaction t
;
3574 t
.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3575 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3579 if (!store
->exists(service
.meta_ch
, OSD::make_purged_snaps_oid())) {
3580 dout(10) << "init creating/touching purged_snaps object" << dendl
;
3581 ObjectStore::Transaction t
;
3582 t
.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3583 r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
3588 if (cct
->_conf
->osd_open_classes_on_start
) {
3589 int r
= ClassHandler::get_instance().open_all_classes();
3591 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r
) << dendl
;
3594 check_osdmap_features();
3597 epoch_t bind_epoch
= osdmap
->get_epoch();
3598 service
.set_epochs(NULL
, NULL
, &bind_epoch
);
3601 clear_temp_objects();
3603 // initialize osdmap references in sharded wq
3604 for (auto& shard
: shards
) {
3605 std::lock_guard
l(shard
->osdmap_lock
);
3606 shard
->shard_osdmap
= osdmap
;
3609 // load up pgs (as they previously existed)
3612 dout(2) << "superblock: I am osd." << superblock
.whoami
<< dendl
;
3614 if (cct
->_conf
.get_val
<bool>("osd_compact_on_start")) {
3615 dout(2) << "compacting object store's omap" << dendl
;
3621 struct store_statfs_t stbuf
;
3622 osd_alert_list_t alerts
;
3623 int r
= store
->statfs(&stbuf
, &alerts
);
3624 ceph_assert(r
== 0);
3625 service
.set_statfs(stbuf
, alerts
);
3628 // client_messenger's auth_client will be set up by monc->init() later.
3629 for (auto m
: { cluster_messenger
,
3631 hb_front_client_messenger
,
3632 hb_back_client_messenger
,
3633 hb_front_server_messenger
,
3634 hb_back_server_messenger
} ) {
3635 m
->set_auth_client(monc
);
3637 for (auto m
: { client_messenger
,
3639 hb_front_server_messenger
,
3640 hb_back_server_messenger
}) {
3641 m
->set_auth_server(monc
);
3643 monc
->set_handle_authentication_dispatcher(this);
3645 monc
->set_want_keys(CEPH_ENTITY_TYPE_MON
| CEPH_ENTITY_TYPE_OSD
3646 | CEPH_ENTITY_TYPE_MGR
);
3651 mgrc
.set_pgstats_cb([this]() { return collect_pg_stats(); });
3652 mgrc
.set_perf_metric_query_cb(
3653 [this](const ConfigPayload
&config_payload
) {
3654 set_perf_queries(config_payload
);
3657 return get_perf_reports();
3661 // tell monc about log_client so it will know about mon session resets
3662 monc
->set_log_client(&log_client
);
3663 update_log_config();
3666 client_messenger
->add_dispatcher_tail(&mgrc
);
3667 client_messenger
->add_dispatcher_tail(this);
3668 cluster_messenger
->add_dispatcher_head(this);
3670 hb_front_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3671 hb_back_client_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3672 hb_front_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3673 hb_back_server_messenger
->add_dispatcher_head(&heartbeat_dispatcher
);
3675 objecter_messenger
->add_dispatcher_head(service
.objecter
.get());
3678 service
.publish_map(osdmap
);
3679 service
.publish_superblock(superblock
);
3680 service
.max_oldest_map
= superblock
.oldest_map
;
3682 for (auto& shard
: shards
) {
3683 // put PGs in a temporary set because we may modify pg_slots
3684 // unordered_map below.
3686 for (auto& i
: shard
->pg_slots
) {
3687 PGRef pg
= i
.second
->pg
;
3693 for (auto pg
: pgs
) {
3694 std::scoped_lock l
{*pg
};
3695 set
<pair
<spg_t
,epoch_t
>> new_children
;
3696 set
<pair
<spg_t
,epoch_t
>> merge_pgs
;
3697 service
.identify_splits_and_merges(pg
->get_osdmap(), osdmap
, pg
->pg_id
,
3698 &new_children
, &merge_pgs
);
3699 if (!new_children
.empty()) {
3700 for (auto shard
: shards
) {
3701 shard
->prime_splits(osdmap
, &new_children
);
3703 assert(new_children
.empty());
3705 if (!merge_pgs
.empty()) {
3706 for (auto shard
: shards
) {
3707 shard
->prime_merges(osdmap
, &merge_pgs
);
3709 assert(merge_pgs
.empty());
3716 // start the heartbeat
3717 heartbeat_thread
.create("osd_srv_heartbt");
3720 tick_timer
.add_event_after(get_tick_interval(),
3723 std::lock_guard
l(tick_timer_lock
);
3724 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
3725 new C_Tick_WithoutOSDLock(this));
3730 r
= monc
->authenticate();
3732 derr
<< __func__
<< " authentication failed: " << cpp_strerror(r
)
3737 while (monc
->wait_auth_rotating(rotating_auth_timeout
) < 0) {
3738 derr
<< "unable to obtain rotating service keys; retrying" << dendl
;
3739 ++rotating_auth_attempts
;
3740 if (rotating_auth_attempts
> g_conf()->max_rotating_auth_attempts
) {
3741 derr
<< __func__
<< " wait_auth_rotating timed out" << dendl
;
3746 r
= update_crush_device_class();
3748 derr
<< __func__
<< " unable to update_crush_device_class: "
3749 << cpp_strerror(r
) << dendl
;
3753 r
= update_crush_location();
3755 derr
<< __func__
<< " unable to update_crush_location: "
3756 << cpp_strerror(r
) << dendl
;
3764 // start objecter *after* we have authenticated, so that we don't ignore
3765 // the OSDMaps it requests.
3766 service
.final_init();
3770 dout(10) << "ensuring pgs have consumed prior maps" << dendl
;
3773 dout(0) << "done with init, starting boot process" << dendl
;
3775 // subscribe to any pg creations
3776 monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0);
3778 // MgrClient needs this (it doesn't have MonClient reference itself)
3779 monc
->sub_want("mgrmap", 0, 0);
3781 // we don't need to ask for an osdmap here; objecter will
3782 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3788 // Override a few options if mclock scheduler is enabled.
3789 maybe_override_max_osd_capacity_for_qos();
3790 maybe_override_options_for_qos();
3795 enable_disable_fuse(true);
3802 void OSD::final_init()
3804 AdminSocket
*admin_socket
= cct
->get_admin_socket();
3805 asok_hook
= new OSDSocketHook(this);
3806 int r
= admin_socket
->register_command("status", asok_hook
,
3807 "high-level status of OSD");
3808 ceph_assert(r
== 0);
3809 r
= admin_socket
->register_command("flush_journal",
3811 "flush the journal to permanent store");
3812 ceph_assert(r
== 0);
3813 r
= admin_socket
->register_command("dump_ops_in_flight " \
3814 "name=filterstr,type=CephString,n=N,req=false",
3816 "show the ops currently in flight");
3817 ceph_assert(r
== 0);
3818 r
= admin_socket
->register_command("ops " \
3819 "name=filterstr,type=CephString,n=N,req=false",
3821 "show the ops currently in flight");
3822 ceph_assert(r
== 0);
3823 r
= admin_socket
->register_command("dump_blocked_ops " \
3824 "name=filterstr,type=CephString,n=N,req=false",
3826 "show the blocked ops currently in flight");
3827 ceph_assert(r
== 0);
3828 r
= admin_socket
->register_command("dump_historic_ops " \
3829 "name=filterstr,type=CephString,n=N,req=false",
3832 ceph_assert(r
== 0);
3833 r
= admin_socket
->register_command("dump_historic_slow_ops " \
3834 "name=filterstr,type=CephString,n=N,req=false",
3836 "show slowest recent ops");
3837 ceph_assert(r
== 0);
3838 r
= admin_socket
->register_command("dump_historic_ops_by_duration " \
3839 "name=filterstr,type=CephString,n=N,req=false",
3841 "show slowest recent ops, sorted by duration");
3842 ceph_assert(r
== 0);
3843 r
= admin_socket
->register_command("dump_op_pq_state",
3845 "dump op priority queue state");
3846 ceph_assert(r
== 0);
3847 r
= admin_socket
->register_command("dump_blocklist",
3849 "dump blocklisted clients and times");
3850 ceph_assert(r
== 0);
3851 r
= admin_socket
->register_command("dump_watchers",
3853 "show clients which have active watches,"
3854 " and on which objects");
3855 ceph_assert(r
== 0);
3856 r
= admin_socket
->register_command("dump_recovery_reservations",
3858 "show recovery reservations");
3859 ceph_assert(r
== 0);
3860 r
= admin_socket
->register_command("dump_scrub_reservations",
3862 "show scrub reservations");
3863 ceph_assert(r
== 0);
3864 r
= admin_socket
->register_command("get_latest_osdmap",
3866 "force osd to update the latest map from "
3868 ceph_assert(r
== 0);
3870 r
= admin_socket
->register_command("set_heap_property " \
3871 "name=property,type=CephString " \
3872 "name=value,type=CephInt",
3874 "update malloc extension heap property");
3875 ceph_assert(r
== 0);
3877 r
= admin_socket
->register_command("get_heap_property " \
3878 "name=property,type=CephString",
3880 "get malloc extension heap property");
3881 ceph_assert(r
== 0);
3883 r
= admin_socket
->register_command("dump_objectstore_kv_stats",
3885 "print statistics of kvdb which used by bluestore");
3886 ceph_assert(r
== 0);
3888 r
= admin_socket
->register_command("dump_scrubs",
3890 "print scheduled scrubs");
3891 ceph_assert(r
== 0);
3893 r
= admin_socket
->register_command("calc_objectstore_db_histogram",
3895 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3896 ceph_assert(r
== 0);
3898 r
= admin_socket
->register_command("flush_store_cache",
3900 "Flush bluestore internal cache");
3901 ceph_assert(r
== 0);
3902 r
= admin_socket
->register_command("dump_pgstate_history",
3904 "show recent state history");
3905 ceph_assert(r
== 0);
3907 r
= admin_socket
->register_command("compact",
3909 "Commpact object store's omap."
3910 " WARNING: Compaction probably slows your requests");
3911 ceph_assert(r
== 0);
3913 r
= admin_socket
->register_command("get_mapped_pools",
3915 "dump pools whose PG(s) are mapped to this OSD.");
3917 ceph_assert(r
== 0);
3919 r
= admin_socket
->register_command("smart name=devid,type=CephString,req=false",
3921 "probe OSD devices for SMART data.");
3923 ceph_assert(r
== 0);
3925 r
= admin_socket
->register_command("list_devices",
3927 "list OSD devices.");
3928 r
= admin_socket
->register_command("send_beacon",
3930 "send OSD beacon to mon immediately");
3932 r
= admin_socket
->register_command(
3933 "dump_osd_network name=value,type=CephInt,req=false", asok_hook
,
3934 "Dump osd heartbeat network ping times");
3935 ceph_assert(r
== 0);
3937 test_ops_hook
= new TestOpsSocketHook(&(this->service
), this->store
);
3938 // Note: pools are CephString instead of CephPoolname because
3939 // these commands traditionally support both pool names and numbers
3940 r
= admin_socket
->register_command(
3942 "name=pool,type=CephString " \
3943 "name=objname,type=CephObjectname " \
3944 "name=key,type=CephString "\
3945 "name=val,type=CephString",
3948 ceph_assert(r
== 0);
3949 r
= admin_socket
->register_command(
3951 "name=pool,type=CephString " \
3952 "name=objname,type=CephObjectname " \
3953 "name=key,type=CephString",
3956 ceph_assert(r
== 0);
3957 r
= admin_socket
->register_command(
3959 "name=pool,type=CephString " \
3960 "name=objname,type=CephObjectname " \
3961 "name=header,type=CephString",
3964 ceph_assert(r
== 0);
3966 r
= admin_socket
->register_command(
3968 "name=pool,type=CephString " \
3969 "name=objname,type=CephObjectname",
3971 "output entire object map");
3972 ceph_assert(r
== 0);
3974 r
= admin_socket
->register_command(
3976 "name=pool,type=CephString " \
3977 "name=objname,type=CephObjectname " \
3978 "name=len,type=CephInt",
3980 "truncate object to length");
3981 ceph_assert(r
== 0);
3983 r
= admin_socket
->register_command(
3985 "name=pool,type=CephString " \
3986 "name=objname,type=CephObjectname " \
3987 "name=shardid,type=CephInt,req=false,range=0|255",
3989 "inject data error to an object");
3990 ceph_assert(r
== 0);
3992 r
= admin_socket
->register_command(
3994 "name=pool,type=CephString " \
3995 "name=objname,type=CephObjectname " \
3996 "name=shardid,type=CephInt,req=false,range=0|255",
3998 "inject metadata error to an object");
3999 ceph_assert(r
== 0);
4000 r
= admin_socket
->register_command(
4001 "set_recovery_delay " \
4002 "name=utime,type=CephInt,req=false",
4004 "Delay osd recovery by specified seconds");
4005 ceph_assert(r
== 0);
4006 r
= admin_socket
->register_command(
4008 "name=type,type=CephString,req=false " \
4009 "name=count,type=CephInt,req=false ",
4011 "Inject a full disk (optional count times)");
4012 ceph_assert(r
== 0);
4013 r
= admin_socket
->register_command(
4015 "name=count,type=CephInt,req=false " \
4016 "name=size,type=CephInt,req=false " \
4017 "name=object_size,type=CephInt,req=false " \
4018 "name=object_num,type=CephInt,req=false ",
4020 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4021 "(default count=1G default size=4MB). Results in log.");
4022 ceph_assert(r
== 0);
4023 r
= admin_socket
->register_command(
4025 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4026 "name=message,type=CephString,n=N",
4028 "log a message to the cluster log");
4029 ceph_assert(r
== 0);
4030 r
= admin_socket
->register_command(
4034 ceph_assert(r
== 0);
4035 r
= admin_socket
->register_command(
4037 "name=heapcmd,type=CephChoices,strings=" \
4038 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4039 "name=value,type=CephString,req=false",
4041 "show heap usage info (available only if compiled with tcmalloc)");
4042 ceph_assert(r
== 0);
4043 r
= admin_socket
->register_command(
4044 "debug dump_missing " \
4045 "name=filename,type=CephFilepath",
4047 "dump missing objects to a named file");
4048 ceph_assert(r
== 0);
4049 r
= admin_socket
->register_command(
4050 "debug kick_recovery_wq " \
4051 "name=delay,type=CephInt,range=0",
4053 "set osd_recovery_delay_start to <val>");
4054 ceph_assert(r
== 0);
4055 r
= admin_socket
->register_command(
4057 "name=arg,type=CephChoices,strings=status|flush",
4059 "run cpu profiling on daemon");
4060 ceph_assert(r
== 0);
4061 r
= admin_socket
->register_command(
4062 "dump_pg_recovery_stats",
4064 "dump pg recovery statistics");
4065 ceph_assert(r
== 0);
4066 r
= admin_socket
->register_command(
4067 "reset_pg_recovery_stats",
4069 "reset pg recovery statistics");
4070 ceph_assert(r
== 0);
4071 r
= admin_socket
->register_command(
4074 "Drop all OSD caches");
4075 ceph_assert(r
== 0);
4076 r
= admin_socket
->register_command(
4079 "Get OSD caches statistics");
4080 ceph_assert(r
== 0);
4081 r
= admin_socket
->register_command(
4082 "scrub_purged_snaps",
4084 "Scrub purged_snaps vs snapmapper index");
4085 ceph_assert(r
== 0);
4087 // -- pg commands --
4088 // old form: ceph pg <pgid> command ...
4089 r
= admin_socket
->register_command(
4091 "name=pgid,type=CephPgid " \
4092 "name=cmd,type=CephChoices,strings=query",
4095 ceph_assert(r
== 0);
4096 r
= admin_socket
->register_command(
4098 "name=pgid,type=CephPgid " \
4099 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4100 "name=mulcmd,type=CephChoices,strings=revert|delete",
4103 ceph_assert(r
== 0);
4104 r
= admin_socket
->register_command(
4106 "name=pgid,type=CephPgid " \
4107 "name=cmd,type=CephChoices,strings=list_unfound " \
4108 "name=offset,type=CephString,req=false",
4111 ceph_assert(r
== 0);
4112 r
= admin_socket
->register_command(
4114 "name=pgid,type=CephPgid " \
4115 "name=cmd,type=CephChoices,strings=scrub " \
4116 "name=time,type=CephInt,req=false",
4119 ceph_assert(r
== 0);
4120 r
= admin_socket
->register_command(
4122 "name=pgid,type=CephPgid " \
4123 "name=cmd,type=CephChoices,strings=deep_scrub " \
4124 "name=time,type=CephInt,req=false",
4127 ceph_assert(r
== 0);
4128 // new form: tell <pgid> <cmd> for both cli and rest
4129 r
= admin_socket
->register_command(
4132 "show details of a specific pg");
4133 ceph_assert(r
== 0);
4134 r
= admin_socket
->register_command(
4135 "mark_unfound_lost " \
4136 "name=pgid,type=CephPgid,req=false " \
4137 "name=mulcmd,type=CephChoices,strings=revert|delete",
4139 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4140 ceph_assert(r
== 0);
4141 r
= admin_socket
->register_command(
4143 "name=pgid,type=CephPgid,req=false " \
4144 "name=offset,type=CephString,req=false",
4146 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4147 ceph_assert(r
== 0);
4148 r
= admin_socket
->register_command(
4150 "name=pgid,type=CephPgid,req=false " \
4151 "name=time,type=CephInt,req=false",
4153 "Trigger a scheduled scrub ");
4154 ceph_assert(r
== 0);
4155 r
= admin_socket
->register_command(
4157 "name=pgid,type=CephPgid,req=false " \
4158 "name=time,type=CephInt,req=false",
4160 "Trigger a scheduled deep scrub ");
4161 ceph_assert(r
== 0);
4164 PerfCounters
* OSD::create_logger()
4166 PerfCounters
* logger
= build_osd_logger(cct
);
4167 cct
->get_perfcounters_collection()->add(logger
);
4171 PerfCounters
* OSD::create_recoverystate_perf()
4173 PerfCounters
* recoverystate_perf
= build_recoverystate_perf(cct
);
4174 cct
->get_perfcounters_collection()->add(recoverystate_perf
);
4175 return recoverystate_perf
;
4180 if (cct
->_conf
->osd_fast_shutdown
) {
4181 derr
<< "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl
;
4182 if (cct
->_conf
->osd_fast_shutdown_notify_mon
)
4183 service
.prepare_to_stop();
4188 if (!service
.prepare_to_stop())
4189 return 0; // already shutting down
4191 if (is_stopping()) {
4195 dout(0) << "shutdown" << dendl
;
4197 set_state(STATE_STOPPING
);
4200 if (cct
->_conf
.get_val
<bool>("osd_debug_shutdown")) {
4201 cct
->_conf
.set_val("debug_osd", "100");
4202 cct
->_conf
.set_val("debug_journal", "100");
4203 cct
->_conf
.set_val("debug_filestore", "100");
4204 cct
->_conf
.set_val("debug_bluestore", "100");
4205 cct
->_conf
.set_val("debug_ms", "100");
4206 cct
->_conf
.apply_changes(nullptr);
4209 // stop MgrClient earlier as it's more like an internal consumer of OSD
4212 service
.start_shutdown();
4214 // stop sending work to pgs. this just prevents any new work in _process
4215 // from racing with on_shutdown and potentially entering the pg after.
4216 op_shardedwq
.drain();
4222 for (auto pg
: pgs
) {
4227 // drain op queue again (in case PGs requeued something)
4228 op_shardedwq
.drain();
4230 finished
.clear(); // zap waiters (bleh, this is messy)
4231 waiting_for_osdmap
.clear();
4234 // unregister commands
4235 cct
->get_admin_socket()->unregister_commands(asok_hook
);
4239 cct
->get_admin_socket()->unregister_commands(test_ops_hook
);
4240 delete test_ops_hook
;
4241 test_ops_hook
= NULL
;
4246 std::lock_guard l
{heartbeat_lock
};
4247 heartbeat_stop
= true;
4248 heartbeat_cond
.notify_all();
4249 heartbeat_peers
.clear();
4251 heartbeat_thread
.join();
4253 hb_back_server_messenger
->mark_down_all();
4254 hb_front_server_messenger
->mark_down_all();
4255 hb_front_client_messenger
->mark_down_all();
4256 hb_back_client_messenger
->mark_down_all();
4260 dout(10) << "op sharded tp stopped" << dendl
;
4262 dout(10) << "stopping agent" << dendl
;
4263 service
.agent_stop();
4265 boot_finisher
.wait_for_empty();
4269 boot_finisher
.stop();
4270 reset_heartbeat_peers(true);
4272 tick_timer
.shutdown();
4275 std::lock_guard
l(tick_timer_lock
);
4276 tick_timer_without_osd_lock
.shutdown();
4279 // note unmount epoch
4280 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl
;
4281 superblock
.mounted
= service
.get_boot_epoch();
4282 superblock
.clean_thru
= get_osdmap_epoch();
4283 ObjectStore::Transaction t
;
4284 write_superblock(t
);
4285 int r
= store
->queue_transaction(service
.meta_ch
, std::move(t
));
4287 derr
<< "OSD::shutdown: error writing superblock: "
4288 << cpp_strerror(r
) << dendl
;
4292 service
.shutdown_reserver();
4295 #ifdef PG_DEBUG_REFS
4296 service
.dump_live_pgids();
4300 _get_pgs(&pgs
, true);
4304 for (auto& pg
: pgs
) {
4305 if (pg
->is_deleted()) {
4308 dout(20) << " kicking pg " << pg
<< dendl
;
4310 if (pg
->get_num_ref() != 1) {
4311 derr
<< "pgid " << pg
->get_pgid() << " has ref count of "
4312 << pg
->get_num_ref() << dendl
;
4313 #ifdef PG_DEBUG_REFS
4314 pg
->dump_live_ids();
4316 if (cct
->_conf
->osd_shutdown_pgref_assert
) {
4324 #ifdef PG_DEBUG_REFS
4325 service
.dump_live_pgids();
4329 cct
->_conf
.remove_observer(this);
4332 service
.meta_ch
.reset();
4334 dout(10) << "syncing store" << dendl
;
4335 enable_disable_fuse(true);
4337 if (cct
->_conf
->osd_journal_flush_on_shutdown
) {
4338 dout(10) << "flushing journal" << dendl
;
4339 store
->flush_journal();
4345 std::unique_lock l
{map_lock
};
4346 set_osdmap(OSDMapRef());
4348 for (auto s
: shards
) {
4349 std::lock_guard
l(s
->osdmap_lock
);
4350 s
->shard_osdmap
= OSDMapRef();
4354 std::lock_guard
lock(osd_lock
);
4358 dout(10) << "Store synced" << dendl
;
4360 op_tracker
.on_shutdown();
4362 ClassHandler::get_instance().shutdown();
4363 client_messenger
->shutdown();
4364 cluster_messenger
->shutdown();
4365 hb_front_client_messenger
->shutdown();
4366 hb_back_client_messenger
->shutdown();
4367 objecter_messenger
->shutdown();
4368 hb_front_server_messenger
->shutdown();
4369 hb_back_server_messenger
->shutdown();
4374 int OSD::mon_cmd_maybe_osd_create(string
&cmd
)
4376 bool created
= false;
4378 dout(10) << __func__
<< " cmd: " << cmd
<< dendl
;
4379 vector
<string
> vcmd
{cmd
};
4383 monc
->start_mon_command(vcmd
, inbl
, NULL
, &outs
, &w
);
4386 if (r
== -ENOENT
&& !created
) {
4387 string newcmd
= "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami
)
4388 + ", \"uuid\": \"" + stringify(superblock
.osd_fsid
) + "\"}";
4389 vector
<string
> vnewcmd
{newcmd
};
4393 monc
->start_mon_command(vnewcmd
, inbl
, NULL
, &outs
, &w
);
4396 derr
<< __func__
<< " fail: osd does not exist and created failed: "
4397 << cpp_strerror(r
) << dendl
;
4403 derr
<< __func__
<< " fail: '" << outs
<< "': " << cpp_strerror(r
) << dendl
;
4412 int OSD::update_crush_location()
4414 if (!cct
->_conf
->osd_crush_update_on_start
) {
4415 dout(10) << __func__
<< " osd_crush_update_on_start = false" << dendl
;
4420 if (cct
->_conf
->osd_crush_initial_weight
>= 0) {
4421 snprintf(weight
, sizeof(weight
), "%.4lf", cct
->_conf
->osd_crush_initial_weight
);
4423 struct store_statfs_t st
;
4424 osd_alert_list_t alerts
;
4425 int r
= store
->statfs(&st
, &alerts
);
4427 derr
<< "statfs: " << cpp_strerror(r
) << dendl
;
4430 snprintf(weight
, sizeof(weight
), "%.4lf",
4433 double(1ull << 40 /* TB */)));
4436 dout(10) << __func__
<< " crush location is " << cct
->crush_location
<< dendl
;
4439 string("{\"prefix\": \"osd crush create-or-move\", ") +
4440 string("\"id\": ") + stringify(whoami
) + ", " +
4441 string("\"weight\":") + weight
+ ", " +
4442 string("\"args\": [") + stringify(cct
->crush_location
) + "]}";
4443 return mon_cmd_maybe_osd_create(cmd
);
4446 int OSD::update_crush_device_class()
4448 if (!cct
->_conf
->osd_class_update_on_start
) {
4449 dout(10) << __func__
<< " osd_class_update_on_start = false" << dendl
;
4453 string device_class
;
4454 int r
= store
->read_meta("crush_device_class", &device_class
);
4455 if (r
< 0 || device_class
.empty()) {
4456 device_class
= store
->get_default_device_class();
4459 if (device_class
.empty()) {
4460 dout(20) << __func__
<< " no device class stored locally" << dendl
;
4465 string("{\"prefix\": \"osd crush set-device-class\", ") +
4466 string("\"class\": \"") + device_class
+ string("\", ") +
4467 string("\"ids\": [\"") + stringify(whoami
) + string("\"]}");
4469 r
= mon_cmd_maybe_osd_create(cmd
);
4471 // good, already bound to a device-class
4478 void OSD::write_superblock(ObjectStore::Transaction
& t
)
4480 dout(10) << "write_superblock " << superblock
<< dendl
;
4482 //hack: at minimum it's using the baseline feature set
4483 if (!superblock
.compat_features
.incompat
.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE
))
4484 superblock
.compat_features
.incompat
.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE
);
4487 encode(superblock
, bl
);
4488 t
.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT
, 0, bl
.length(), bl
);
4491 int OSD::read_superblock()
4494 int r
= store
->read(service
.meta_ch
, OSD_SUPERBLOCK_GOBJECT
, 0, 0, bl
);
4498 auto p
= bl
.cbegin();
4499 decode(superblock
, p
);
4501 dout(10) << "read_superblock " << superblock
<< dendl
;
4506 void OSD::clear_temp_objects()
4508 dout(10) << __func__
<< dendl
;
4510 store
->list_collections(ls
);
4511 for (vector
<coll_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
4513 if (!p
->is_pg(&pgid
))
4516 // list temp objects
4517 dout(20) << " clearing temps in " << *p
<< " pgid " << pgid
<< dendl
;
4519 vector
<ghobject_t
> temps
;
4522 vector
<ghobject_t
> objects
;
4523 auto ch
= store
->open_collection(*p
);
4525 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4526 store
->get_ideal_list_max(),
4528 if (objects
.empty())
4530 vector
<ghobject_t
>::iterator q
;
4531 for (q
= objects
.begin(); q
!= objects
.end(); ++q
) {
4532 // Hammer set pool for temps to -1, so check for clean-up
4533 if (q
->hobj
.is_temp() || (q
->hobj
.pool
== -1)) {
4534 temps
.push_back(*q
);
4539 // If we saw a non-temp object and hit the break above we can
4540 // break out of the while loop too.
4541 if (q
!= objects
.end())
4544 if (!temps
.empty()) {
4545 ObjectStore::Transaction t
;
4547 for (vector
<ghobject_t
>::iterator q
= temps
.begin(); q
!= temps
.end(); ++q
) {
4548 dout(20) << " removing " << *p
<< " object " << *q
<< dendl
;
4550 if (++removed
> cct
->_conf
->osd_target_transaction_size
) {
4551 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4552 t
= ObjectStore::Transaction();
4557 store
->queue_transaction(service
.meta_ch
, std::move(t
));
4563 void OSD::recursive_remove_collection(CephContext
* cct
,
4564 ObjectStore
*store
, spg_t pgid
,
4570 make_snapmapper_oid());
4572 ObjectStore::CollectionHandle ch
= store
->open_collection(tmp
);
4573 ObjectStore::Transaction t
;
4574 SnapMapper
mapper(cct
, &driver
, 0, 0, 0, pgid
.shard
);
4577 int max
= cct
->_conf
->osd_target_transaction_size
;
4578 vector
<ghobject_t
> objects
;
4579 objects
.reserve(max
);
4582 store
->collection_list(ch
, next
, ghobject_t::get_max(),
4583 max
, &objects
, &next
);
4584 generic_dout(10) << __func__
<< " " << objects
<< dendl
;
4585 if (objects
.empty())
4587 for (auto& p
: objects
) {
4588 OSDriver::OSTransaction
_t(driver
.get_transaction(&t
));
4589 int r
= mapper
.remove_oid(p
.hobj
, &_t
);
4590 if (r
!= 0 && r
!= -ENOENT
)
4594 int r
= store
->queue_transaction(ch
, std::move(t
));
4595 ceph_assert(r
== 0);
4596 t
= ObjectStore::Transaction();
4598 t
.remove_collection(tmp
);
4599 int r
= store
->queue_transaction(ch
, std::move(t
));
4600 ceph_assert(r
== 0);
4603 if (!ch
->flush_commit(&waiter
)) {
4609 // ======================================================
4613 OSDMapRef createmap
,
4616 dout(10) << __func__
<< " " << pgid
<< dendl
;
4618 map
<string
,string
> ec_profile
;
4620 if (createmap
->have_pg_pool(pgid
.pool())) {
4621 pi
= *createmap
->get_pg_pool(pgid
.pool());
4622 name
= createmap
->get_pool_name(pgid
.pool());
4623 if (pi
.is_erasure()) {
4624 ec_profile
= createmap
->get_erasure_code_profile(pi
.erasure_code_profile
);
4627 // pool was deleted; grab final pg_pool_t off disk.
4628 ghobject_t oid
= make_final_pool_info_oid(pgid
.pool());
4630 int r
= store
->read(service
.meta_ch
, oid
, 0, 0, bl
);
4632 derr
<< __func__
<< " missing pool " << pgid
.pool() << " tombstone"
4636 ceph_assert(r
>= 0);
4637 auto p
= bl
.cbegin();
4640 if (p
.end()) { // dev release v13.0.2 did not include ec_profile
4641 derr
<< __func__
<< " missing ec_profile from pool " << pgid
.pool()
4642 << " tombstone" << dendl
;
4645 decode(ec_profile
, p
);
4647 PGPool
pool(createmap
, pgid
.pool(), pi
, name
);
4649 if (pi
.type
== pg_pool_t::TYPE_REPLICATED
||
4650 pi
.type
== pg_pool_t::TYPE_ERASURE
)
4651 pg
= new PrimaryLogPG(&service
, createmap
, pool
, ec_profile
, pgid
);
4657 void OSD::_get_pgs(vector
<PGRef
> *v
, bool clear_too
)
4660 v
->reserve(get_num_pgs());
4661 for (auto& s
: shards
) {
4662 std::lock_guard
l(s
->shard_lock
);
4663 for (auto& j
: s
->pg_slots
) {
4665 !j
.second
->pg
->is_deleted()) {
4666 v
->push_back(j
.second
->pg
);
4668 s
->_detach_pg(j
.second
.get());
4675 void OSD::_get_pgids(vector
<spg_t
> *v
)
4678 v
->reserve(get_num_pgs());
4679 for (auto& s
: shards
) {
4680 std::lock_guard
l(s
->shard_lock
);
4681 for (auto& j
: s
->pg_slots
) {
4683 !j
.second
->pg
->is_deleted()) {
4684 v
->push_back(j
.first
);
4690 void OSD::register_pg(PGRef pg
)
4692 spg_t pgid
= pg
->get_pgid();
4693 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4694 auto sdata
= shards
[shard_index
];
4695 std::lock_guard
l(sdata
->shard_lock
);
4696 auto r
= sdata
->pg_slots
.emplace(pgid
, make_unique
<OSDShardPGSlot
>());
4697 ceph_assert(r
.second
);
4698 auto *slot
= r
.first
->second
.get();
4699 dout(20) << __func__
<< " " << pgid
<< " " << pg
<< dendl
;
4700 sdata
->_attach_pg(slot
, pg
.get());
4703 bool OSD::try_finish_pg_delete(PG
*pg
, unsigned old_pg_num
)
4705 auto sdata
= pg
->osd_shard
;
4708 std::lock_guard
l(sdata
->shard_lock
);
4709 auto p
= sdata
->pg_slots
.find(pg
->pg_id
);
4710 if (p
== sdata
->pg_slots
.end() ||
4712 dout(20) << __func__
<< " " << pg
->pg_id
<< " not found" << dendl
;
4715 if (p
->second
->waiting_for_merge_epoch
) {
4716 dout(20) << __func__
<< " " << pg
->pg_id
<< " waiting for merge" << dendl
;
4719 dout(20) << __func__
<< " " << pg
->pg_id
<< " " << pg
<< dendl
;
4720 sdata
->_detach_pg(p
->second
.get());
4723 for (auto shard
: shards
) {
4724 shard
->unprime_split_children(pg
->pg_id
, old_pg_num
);
4727 // update pg count now since we might not get an osdmap any time soon.
4728 if (pg
->is_primary())
4729 service
.logger
->dec(l_osd_pg_primary
);
4730 else if (pg
->is_nonprimary())
4731 service
.logger
->dec(l_osd_pg_replica
); // misnomver
4733 service
.logger
->dec(l_osd_pg_stray
);
4738 PGRef
OSD::_lookup_pg(spg_t pgid
)
4740 uint32_t shard_index
= pgid
.hash_to_shard(num_shards
);
4741 auto sdata
= shards
[shard_index
];
4742 std::lock_guard
l(sdata
->shard_lock
);
4743 auto p
= sdata
->pg_slots
.find(pgid
);
4744 if (p
== sdata
->pg_slots
.end()) {
4747 return p
->second
->pg
;
4750 PGRef
OSD::_lookup_lock_pg(spg_t pgid
)
4752 PGRef pg
= _lookup_pg(pgid
);
4757 if (!pg
->is_deleted()) {
4764 PGRef
OSD::lookup_lock_pg(spg_t pgid
)
4766 return _lookup_lock_pg(pgid
);
4769 void OSD::load_pgs()
4771 ceph_assert(ceph_mutex_is_locked(osd_lock
));
4772 dout(0) << "load_pgs" << dendl
;
4775 auto pghist
= make_pg_num_history_oid();
4777 int r
= store
->read(service
.meta_ch
, pghist
, 0, 0, bl
, 0);
4778 if (r
>= 0 && bl
.length() > 0) {
4779 auto p
= bl
.cbegin();
4780 decode(pg_num_history
, p
);
4782 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
4786 int r
= store
->list_collections(ls
);
4788 derr
<< "failed to list pgs: " << cpp_strerror(-r
) << dendl
;
4792 for (vector
<coll_t
>::iterator it
= ls
.begin();
4796 if (it
->is_temp(&pgid
) ||
4797 (it
->is_pg(&pgid
) && PG::_has_removal_flag(store
, pgid
))) {
4798 dout(10) << "load_pgs " << *it
4799 << " removing, legacy or flagged for removal pg" << dendl
;
4800 recursive_remove_collection(cct
, store
, pgid
, *it
);
4804 if (!it
->is_pg(&pgid
)) {
4805 dout(10) << "load_pgs ignoring unrecognized " << *it
<< dendl
;
4809 dout(10) << "pgid " << pgid
<< " coll " << coll_t(pgid
) << dendl
;
4810 epoch_t map_epoch
= 0;
4811 int r
= PG::peek_map_epoch(store
, pgid
, &map_epoch
);
4813 derr
<< __func__
<< " unable to peek at " << pgid
<< " metadata, skipping"
4819 if (map_epoch
> 0) {
4820 OSDMapRef pgosdmap
= service
.try_get_map(map_epoch
);
4822 if (!get_osdmap()->have_pg_pool(pgid
.pool())) {
4823 derr
<< __func__
<< ": could not find map for epoch " << map_epoch
4824 << " on pg " << pgid
<< ", but the pool is not present in the "
4825 << "current map, so this is probably a result of bug 10617. "
4826 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4827 << "to clean it up later." << dendl
;
4830 derr
<< __func__
<< ": have pgid " << pgid
<< " at epoch "
4831 << map_epoch
<< ", but missing map. Crashing."
4833 ceph_abort_msg("Missing map in load_pgs");
4836 pg
= _make_pg(pgosdmap
, pgid
);
4838 pg
= _make_pg(get_osdmap(), pgid
);
4841 recursive_remove_collection(cct
, store
, pgid
, *it
);
4845 // there can be no waiters here, so we don't call _wake_pg_slot
4848 pg
->ch
= store
->open_collection(pg
->coll
);
4850 // read pg state, log
4851 pg
->read_state(store
);
4854 dout(10) << "load_pgs " << *it
<< " deleting dne" << dendl
;
4857 recursive_remove_collection(cct
, store
, pgid
, *it
);
4861 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4862 assert(NULL
!= shards
[shard_index
]);
4863 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4866 pg
->reg_next_scrub();
4868 dout(10) << __func__
<< " loaded " << *pg
<< dendl
;
4874 dout(0) << __func__
<< " opened " << num
<< " pgs" << dendl
;
4878 PGRef
OSD::handle_pg_create_info(const OSDMapRef
& osdmap
,
4879 const PGCreateInfo
*info
)
4881 spg_t pgid
= info
->pgid
;
4883 if (maybe_wait_for_max_pg(osdmap
, pgid
, info
->by_mon
)) {
4884 dout(10) << __func__
<< " hit max pg, dropping" << dendl
;
4888 PeeringCtx rctx
= create_context();
4890 OSDMapRef startmap
= get_map(info
->epoch
);
4893 int64_t pool_id
= pgid
.pgid
.pool();
4894 const pg_pool_t
*pool
= osdmap
->get_pg_pool(pool_id
);
4896 dout(10) << __func__
<< " ignoring " << pgid
<< ", pool dne" << dendl
;
4899 if (osdmap
->require_osd_release
>= ceph_release_t::nautilus
&&
4900 !pool
->has_flag(pg_pool_t::FLAG_CREATING
)) {
4901 // this ensures we do not process old creating messages after the
4902 // pool's initial pgs have been created (and pg are subsequently
4903 // allowed to split or merge).
4904 dout(20) << __func__
<< " dropping " << pgid
4905 << "create, pool does not have CREATING flag set" << dendl
;
4910 int up_primary
, acting_primary
;
4911 vector
<int> up
, acting
;
4912 startmap
->pg_to_up_acting_osds(
4913 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
4915 const pg_pool_t
* pp
= startmap
->get_pg_pool(pgid
.pool());
4916 if (pp
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) &&
4917 store
->get_type() != "bluestore") {
4918 clog
->warn() << "pg " << pgid
4919 << " is at risk of silent data corruption: "
4920 << "the pool allows ec overwrites but is not stored in "
4921 << "bluestore, so deep scrubbing will not detect bitrot";
4923 create_pg_collection(
4924 rctx
.transaction
, pgid
, pgid
.get_split_bits(pp
->get_pg_num()));
4925 init_pg_ondisk(rctx
.transaction
, pgid
, pp
);
4927 int role
= startmap
->calc_pg_role(pg_shard_t(whoami
, pgid
.shard
), acting
);
4929 PGRef pg
= _make_pg(startmap
, pgid
);
4930 pg
->ch
= store
->create_new_collection(pg
->coll
);
4933 uint32_t shard_index
= pgid
.hash_to_shard(shards
.size());
4934 assert(NULL
!= shards
[shard_index
]);
4935 store
->set_collection_commit_queue(pg
->coll
, &(shards
[shard_index
]->context_queue
));
4940 // we are holding the shard lock
4941 ceph_assert(!pg
->is_deleted());
4950 info
->past_intervals
,
4954 pg
->init_collection_pool_opts();
4956 if (pg
->is_primary()) {
4957 std::lock_guard locker
{m_perf_queries_lock
};
4958 pg
->set_dynamic_perf_stats_queries(m_perf_queries
);
4961 pg
->handle_initialize(rctx
);
4962 pg
->handle_activate_map(rctx
);
4964 dispatch_context(rctx
, pg
.get(), osdmap
, nullptr);
4966 dout(10) << __func__
<< " new pg " << *pg
<< dendl
;
4970 bool OSD::maybe_wait_for_max_pg(const OSDMapRef
& osdmap
,
4974 const auto max_pgs_per_osd
=
4975 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
4976 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
4978 if (num_pgs
< max_pgs_per_osd
) {
4982 std::lock_guard
l(pending_creates_lock
);
4983 if (is_mon_create
) {
4984 pending_creates_from_mon
++;
4986 bool is_primary
= osdmap
->get_pg_acting_role(pgid
, whoami
) == 0;
4987 pending_creates_from_osd
.emplace(pgid
, is_primary
);
4989 dout(1) << __func__
<< " withhold creation of pg " << pgid
4990 << ": " << num_pgs
<< " >= "<< max_pgs_per_osd
<< dendl
;
4994 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4995 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4996 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4997 static vector
<int32_t> twiddle(const vector
<int>& acting
) {
4998 if (acting
.size() > 1) {
5001 vector
<int32_t> twiddled(acting
.begin(), acting
.end());
5002 twiddled
.push_back(-1);
5007 void OSD::resume_creating_pg()
5009 bool do_sub_pg_creates
= false;
5010 bool have_pending_creates
= false;
5012 const auto max_pgs_per_osd
=
5013 (cct
->_conf
.get_val
<uint64_t>("mon_max_pg_per_osd") *
5014 cct
->_conf
.get_val
<double>("osd_max_pg_per_osd_hard_ratio"));
5015 if (max_pgs_per_osd
<= num_pgs
) {
5016 // this could happen if admin decreases this setting before a PG is removed
5019 unsigned spare_pgs
= max_pgs_per_osd
- num_pgs
;
5020 std::lock_guard
l(pending_creates_lock
);
5021 if (pending_creates_from_mon
> 0) {
5022 dout(20) << __func__
<< " pending_creates_from_mon "
5023 << pending_creates_from_mon
<< dendl
;
5024 do_sub_pg_creates
= true;
5025 if (pending_creates_from_mon
>= spare_pgs
) {
5026 spare_pgs
= pending_creates_from_mon
= 0;
5028 spare_pgs
-= pending_creates_from_mon
;
5029 pending_creates_from_mon
= 0;
5032 auto pg
= pending_creates_from_osd
.cbegin();
5033 while (spare_pgs
> 0 && pg
!= pending_creates_from_osd
.cend()) {
5034 dout(20) << __func__
<< " pg " << pg
->first
<< dendl
;
5036 get_osdmap()->pg_to_up_acting_osds(pg
->first
.pgid
, nullptr, nullptr, &acting
, nullptr);
5037 service
.queue_want_pg_temp(pg
->first
.pgid
, twiddle(acting
), true);
5038 pg
= pending_creates_from_osd
.erase(pg
);
5039 do_sub_pg_creates
= true;
5042 have_pending_creates
= (pending_creates_from_mon
> 0 ||
5043 !pending_creates_from_osd
.empty());
5046 bool do_renew_subs
= false;
5047 if (do_sub_pg_creates
) {
5048 if (monc
->sub_want("osd_pg_creates", last_pg_create_epoch
, 0)) {
5049 dout(4) << __func__
<< ": resolicit pg creates from mon since "
5050 << last_pg_create_epoch
<< dendl
;
5051 do_renew_subs
= true;
5054 version_t start
= get_osdmap_epoch() + 1;
5055 if (have_pending_creates
) {
5056 // don't miss any new osdmap deleting PGs
5057 if (monc
->sub_want("osdmap", start
, 0)) {
5058 dout(4) << __func__
<< ": resolicit osdmap from mon since "
5060 do_renew_subs
= true;
5062 } else if (do_sub_pg_creates
) {
5063 // no need to subscribe the osdmap continuously anymore
5064 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5065 if (monc
->sub_want_increment("osdmap", start
, CEPH_SUBSCRIBE_ONETIME
)) {
5066 dout(4) << __func__
<< ": re-subscribe osdmap(onetime) since "
5068 do_renew_subs
= true;
5072 if (do_renew_subs
) {
5076 service
.send_pg_temp();
5079 void OSD::build_initial_pg_history(
5082 utime_t created_stamp
,
5086 dout(10) << __func__
<< " " << pgid
<< " created " << created
<< dendl
;
5087 *h
= pg_history_t(created
, created_stamp
);
5089 OSDMapRef lastmap
= service
.get_map(created
);
5090 int up_primary
, acting_primary
;
5091 vector
<int> up
, acting
;
5092 lastmap
->pg_to_up_acting_osds(
5093 pgid
.pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
5095 ostringstream debug
;
5096 for (epoch_t e
= created
+ 1; e
<= get_osdmap_epoch(); ++e
) {
5097 OSDMapRef osdmap
= service
.get_map(e
);
5098 int new_up_primary
, new_acting_primary
;
5099 vector
<int> new_up
, new_acting
;
5100 osdmap
->pg_to_up_acting_osds(
5101 pgid
.pgid
, &new_up
, &new_up_primary
, &new_acting
, &new_acting_primary
);
5103 // this is a bit imprecise, but sufficient?
5104 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
5105 const pg_pool_t
*pi
;
5106 bool operator()(const set
<pg_shard_t
> &have
) const {
5107 return have
.size() >= pi
->min_size
;
5109 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
5110 } min_size_predicate(osdmap
->get_pg_pool(pgid
.pgid
.pool()));
5112 bool new_interval
= PastIntervals::check_new_interval(
5119 h
->same_interval_since
,
5120 h
->last_epoch_clean
,
5128 h
->same_interval_since
= e
;
5130 h
->same_up_since
= e
;
5132 if (acting_primary
!= new_acting_primary
) {
5133 h
->same_primary_since
= e
;
5135 if (pgid
.pgid
.is_split(lastmap
->get_pg_num(pgid
.pgid
.pool()),
5136 osdmap
->get_pg_num(pgid
.pgid
.pool()),
5138 h
->last_epoch_split
= e
;
5141 acting
= new_acting
;
5142 up_primary
= new_up_primary
;
5143 acting_primary
= new_acting_primary
;
5147 dout(20) << __func__
<< " " << debug
.str() << dendl
;
5148 dout(10) << __func__
<< " " << *h
<< " " << *pi
5149 << " [" << (pi
->empty() ? pair
<epoch_t
,epoch_t
>(0,0) :
5150 pi
->get_bounds()) << ")"
5154 void OSD::_add_heartbeat_peer(int p
)
5160 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(p
);
5161 if (i
== heartbeat_peers
.end()) {
5162 pair
<ConnectionRef
,ConnectionRef
> cons
= service
.get_con_osd_hb(p
, get_osdmap_epoch());
5165 assert(cons
.second
);
5167 hi
= &heartbeat_peers
[p
];
5170 auto stamps
= service
.get_hb_stamps(p
);
5172 auto sb
= ceph::make_ref
<Session
>(cct
, cons
.first
.get());
5174 sb
->stamps
= stamps
;
5175 hi
->hb_interval_start
= ceph_clock_now();
5176 hi
->con_back
= cons
.first
.get();
5177 hi
->con_back
->set_priv(sb
);
5179 auto sf
= ceph::make_ref
<Session
>(cct
, cons
.second
.get());
5181 sf
->stamps
= stamps
;
5182 hi
->con_front
= cons
.second
.get();
5183 hi
->con_front
->set_priv(sf
);
5185 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5186 << " " << hi
->con_back
->get_peer_addr()
5187 << " " << hi
->con_front
->get_peer_addr()
5192 hi
->epoch
= get_osdmap_epoch();
5195 void OSD::_remove_heartbeat_peer(int n
)
5197 map
<int,HeartbeatInfo
>::iterator q
= heartbeat_peers
.find(n
);
5198 ceph_assert(q
!= heartbeat_peers
.end());
5199 dout(20) << " removing heartbeat peer osd." << n
5200 << " " << q
->second
.con_back
->get_peer_addr()
5201 << " " << (q
->second
.con_front
? q
->second
.con_front
->get_peer_addr() : entity_addr_t())
5203 q
->second
.clear_mark_down();
5204 heartbeat_peers
.erase(q
);
5207 void OSD::need_heartbeat_peer_update()
5211 dout(20) << "need_heartbeat_peer_update" << dendl
;
5212 heartbeat_set_peers_need_update();
5215 void OSD::maybe_update_heartbeat_peers()
5217 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5219 if (is_waiting_for_healthy() || is_active()) {
5220 utime_t now
= ceph_clock_now();
5221 if (last_heartbeat_resample
== utime_t()) {
5222 last_heartbeat_resample
= now
;
5223 heartbeat_set_peers_need_update();
5224 } else if (!heartbeat_peers_need_update()) {
5225 utime_t dur
= now
- last_heartbeat_resample
;
5226 if (dur
> cct
->_conf
->osd_heartbeat_grace
) {
5227 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur
<< " seconds" << dendl
;
5228 heartbeat_set_peers_need_update();
5229 last_heartbeat_resample
= now
;
5230 // automatically clean up any stale heartbeat peers
5231 // if we are unhealthy, then clean all
5232 reset_heartbeat_peers(is_waiting_for_healthy());
5237 if (!heartbeat_peers_need_update())
5239 heartbeat_clear_peers_need_update();
5241 std::lock_guard
l(heartbeat_lock
);
5243 dout(10) << "maybe_update_heartbeat_peers updating" << dendl
;
5246 // build heartbeat from set
5250 for (auto& pg
: pgs
) {
5251 pg
->with_heartbeat_peers([&](int peer
) {
5252 if (get_osdmap()->is_up(peer
)) {
5253 _add_heartbeat_peer(peer
);
5259 // include next and previous up osds to ensure we have a fully-connected set
5260 set
<int> want
, extras
;
5261 const int next
= get_osdmap()->get_next_up_osd_after(whoami
);
5264 int prev
= get_osdmap()->get_previous_up_osd_before(whoami
);
5265 if (prev
>= 0 && prev
!= next
)
5268 // make sure we have at least **min_down** osds coming from different
5269 // subtree level (e.g., hosts) for fast failure detection.
5270 auto min_down
= cct
->_conf
.get_val
<uint64_t>("mon_osd_min_down_reporters");
5271 auto subtree
= cct
->_conf
.get_val
<string
>("mon_osd_reporter_subtree_level");
5272 auto limit
= std::max(min_down
, (uint64_t)cct
->_conf
->osd_heartbeat_min_peers
);
5273 get_osdmap()->get_random_up_osds_by_subtree(
5274 whoami
, subtree
, limit
, want
, &want
);
5276 for (set
<int>::iterator p
= want
.begin(); p
!= want
.end(); ++p
) {
5277 dout(10) << " adding neighbor peer osd." << *p
<< dendl
;
5279 _add_heartbeat_peer(*p
);
5282 // remove down peers; enumerate extras
5283 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5284 while (p
!= heartbeat_peers
.end()) {
5285 if (!get_osdmap()->is_up(p
->first
)) {
5288 _remove_heartbeat_peer(o
);
5291 if (p
->second
.epoch
< get_osdmap_epoch()) {
5292 extras
.insert(p
->first
);
5298 for (int n
= next
; n
>= 0; ) {
5299 if ((int)heartbeat_peers
.size() >= cct
->_conf
->osd_heartbeat_min_peers
)
5301 if (!extras
.count(n
) && !want
.count(n
) && n
!= whoami
) {
5302 dout(10) << " adding random peer osd." << n
<< dendl
;
5304 _add_heartbeat_peer(n
);
5306 n
= get_osdmap()->get_next_up_osd_after(n
);
5308 break; // came full circle; stop
5312 for (set
<int>::iterator p
= extras
.begin();
5313 (int)heartbeat_peers
.size() > cct
->_conf
->osd_heartbeat_min_peers
&& p
!= extras
.end();
5317 _remove_heartbeat_peer(*p
);
5320 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers
.size() << " peers, extras " << extras
<< dendl
;
5322 // clean up stale failure pending
5323 for (auto it
= failure_pending
.begin(); it
!= failure_pending
.end();) {
5324 if (heartbeat_peers
.count(it
->first
) == 0) {
5325 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
5326 failure_pending
.erase(it
++);
5333 void OSD::reset_heartbeat_peers(bool all
)
5335 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5336 dout(10) << "reset_heartbeat_peers" << dendl
;
5337 utime_t stale
= ceph_clock_now();
5338 stale
-= cct
->_conf
.get_val
<int64_t>("osd_heartbeat_stale");
5339 std::lock_guard
l(heartbeat_lock
);
5340 for (auto it
= heartbeat_peers
.begin(); it
!= heartbeat_peers
.end();) {
5341 auto& [peer
, hi
] = *it
;
5342 if (all
|| hi
.is_stale(stale
)) {
5343 hi
.clear_mark_down();
5344 // stop sending failure_report to mon too
5345 failure_queue
.erase(peer
);
5346 failure_pending
.erase(peer
);
5347 it
= heartbeat_peers
.erase(it
);
5354 void OSD::handle_osd_ping(MOSDPing
*m
)
5356 if (superblock
.cluster_fsid
!= m
->fsid
) {
5357 dout(20) << "handle_osd_ping from " << m
->get_source_inst()
5358 << " bad fsid " << m
->fsid
<< " != " << superblock
.cluster_fsid
5364 int from
= m
->get_source().num();
5366 heartbeat_lock
.lock();
5367 if (is_stopping()) {
5368 heartbeat_lock
.unlock();
5373 utime_t now
= ceph_clock_now();
5374 auto mnow
= service
.get_mnow();
5375 ConnectionRef
con(m
->get_connection());
5376 OSDMapRef curmap
= service
.get_osdmap();
5378 heartbeat_lock
.unlock();
5383 auto sref
= con
->get_priv();
5384 Session
*s
= static_cast<Session
*>(sref
.get());
5386 heartbeat_lock
.unlock();
5392 s
->stamps
= service
.get_hb_stamps(from
);
5397 case MOSDPing::PING
:
5399 if (cct
->_conf
->osd_debug_drop_ping_probability
> 0) {
5400 auto heartbeat_drop
= debug_heartbeat_drops_remaining
.find(from
);
5401 if (heartbeat_drop
!= debug_heartbeat_drops_remaining
.end()) {
5402 if (heartbeat_drop
->second
== 0) {
5403 debug_heartbeat_drops_remaining
.erase(heartbeat_drop
);
5405 --heartbeat_drop
->second
;
5406 dout(5) << "Dropping heartbeat from " << from
5407 << ", " << heartbeat_drop
->second
5408 << " remaining to drop" << dendl
;
5411 } else if (cct
->_conf
->osd_debug_drop_ping_probability
>
5412 ((((double)(rand()%100))/100.0))) {
5414 debug_heartbeat_drops_remaining
.insert(std::make_pair(from
,
5415 cct
->_conf
->osd_debug_drop_ping_duration
)).first
;
5416 dout(5) << "Dropping heartbeat from " << from
5417 << ", " << heartbeat_drop
->second
5418 << " remaining to drop" << dendl
;
5423 ceph::signedspan sender_delta_ub
{};
5424 s
->stamps
->got_ping(
5430 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5432 if (!cct
->get_heartbeat_map()->is_healthy()) {
5433 dout(10) << "internal heartbeat not healthy, dropping ping request"
5438 Message
*r
= new MOSDPing(monc
->get_fsid(),
5439 curmap
->get_epoch(),
5440 MOSDPing::PING_REPLY
,
5444 service
.get_up_epoch(),
5445 cct
->_conf
->osd_heartbeat_min_size
,
5447 con
->send_message(r
);
5449 if (curmap
->is_up(from
)) {
5451 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5452 from
, curmap
->get_epoch());
5454 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5457 } else if (!curmap
->exists(from
) ||
5458 curmap
->get_down_at(from
) > m
->map_epoch
) {
5459 // tell them they have died
5460 Message
*r
= new MOSDPing(monc
->get_fsid(),
5461 curmap
->get_epoch(),
5466 service
.get_up_epoch(),
5467 cct
->_conf
->osd_heartbeat_min_size
);
5468 con
->send_message(r
);
5473 case MOSDPing::PING_REPLY
:
5475 map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.find(from
);
5476 if (i
!= heartbeat_peers
.end()) {
5477 auto acked
= i
->second
.ping_history
.find(m
->ping_stamp
);
5478 if (acked
!= i
->second
.ping_history
.end()) {
5479 int &unacknowledged
= acked
->second
.second
;
5480 if (con
== i
->second
.con_back
) {
5481 dout(25) << "handle_osd_ping got reply from osd." << from
5482 << " first_tx " << i
->second
.first_tx
5483 << " last_tx " << i
->second
.last_tx
5484 << " last_rx_back " << i
->second
.last_rx_back
5486 << " last_rx_front " << i
->second
.last_rx_front
5488 i
->second
.last_rx_back
= now
;
5489 ceph_assert(unacknowledged
> 0);
5491 // if there is no front con, set both stamps.
5492 if (i
->second
.con_front
== NULL
) {
5493 i
->second
.last_rx_front
= now
;
5494 ceph_assert(unacknowledged
> 0);
5497 } else if (con
== i
->second
.con_front
) {
5498 dout(25) << "handle_osd_ping got reply from osd." << from
5499 << " first_tx " << i
->second
.first_tx
5500 << " last_tx " << i
->second
.last_tx
5501 << " last_rx_back " << i
->second
.last_rx_back
5502 << " last_rx_front " << i
->second
.last_rx_front
5505 i
->second
.last_rx_front
= now
;
5506 ceph_assert(unacknowledged
> 0);
5510 if (unacknowledged
== 0) {
5511 // succeeded in getting all replies
5512 dout(25) << "handle_osd_ping got all replies from osd." << from
5513 << " , erase pending ping(sent at " << m
->ping_stamp
<< ")"
5514 << " and older pending ping(s)"
5517 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5518 ++i
->second
.hb_average_count
;
5519 uint32_t back_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_back
- m
->ping_stamp
);
5520 i
->second
.hb_total_back
+= back_pingtime
;
5521 if (back_pingtime
< i
->second
.hb_min_back
)
5522 i
->second
.hb_min_back
= back_pingtime
;
5523 if (back_pingtime
> i
->second
.hb_max_back
)
5524 i
->second
.hb_max_back
= back_pingtime
;
5525 uint32_t front_pingtime
= ROUND_S_TO_USEC(i
->second
.last_rx_front
- m
->ping_stamp
);
5526 i
->second
.hb_total_front
+= front_pingtime
;
5527 if (front_pingtime
< i
->second
.hb_min_front
)
5528 i
->second
.hb_min_front
= front_pingtime
;
5529 if (front_pingtime
> i
->second
.hb_max_front
)
5530 i
->second
.hb_max_front
= front_pingtime
;
5532 ceph_assert(i
->second
.hb_interval_start
!= utime_t());
5533 if (i
->second
.hb_interval_start
== utime_t())
5534 i
->second
.hb_interval_start
= now
;
5535 int64_t hb_avg_time_period
= 60;
5536 if (cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span")) {
5537 hb_avg_time_period
= cct
->_conf
.get_val
<int64_t>("debug_heartbeat_testing_span");
5539 if (now
- i
->second
.hb_interval_start
>= utime_t(hb_avg_time_period
, 0)) {
5540 uint32_t back_avg
= i
->second
.hb_total_back
/ i
->second
.hb_average_count
;
5541 uint32_t back_min
= i
->second
.hb_min_back
;
5542 uint32_t back_max
= i
->second
.hb_max_back
;
5543 uint32_t front_avg
= i
->second
.hb_total_front
/ i
->second
.hb_average_count
;
5544 uint32_t front_min
= i
->second
.hb_min_front
;
5545 uint32_t front_max
= i
->second
.hb_max_front
;
5547 // Reset for new interval
5548 i
->second
.hb_average_count
= 0;
5549 i
->second
.hb_interval_start
= now
;
5550 i
->second
.hb_total_back
= i
->second
.hb_max_back
= 0;
5551 i
->second
.hb_min_back
= UINT_MAX
;
5552 i
->second
.hb_total_front
= i
->second
.hb_max_front
= 0;
5553 i
->second
.hb_min_front
= UINT_MAX
;
5555 // Record per osd interace ping times
5556 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5557 if (i
->second
.hb_back_pingtime
.size() == 0) {
5558 ceph_assert(i
->second
.hb_front_pingtime
.size() == 0);
5559 for (unsigned k
= 0 ; k
< hb_vector_size
; ++k
) {
5560 i
->second
.hb_back_pingtime
.push_back(back_avg
);
5561 i
->second
.hb_back_min
.push_back(back_min
);
5562 i
->second
.hb_back_max
.push_back(back_max
);
5563 i
->second
.hb_front_pingtime
.push_back(front_avg
);
5564 i
->second
.hb_front_min
.push_back(front_min
);
5565 i
->second
.hb_front_max
.push_back(front_max
);
5566 ++i
->second
.hb_index
;
5569 int index
= i
->second
.hb_index
& (hb_vector_size
- 1);
5570 i
->second
.hb_back_pingtime
[index
] = back_avg
;
5571 i
->second
.hb_back_min
[index
] = back_min
;
5572 i
->second
.hb_back_max
[index
] = back_max
;
5573 i
->second
.hb_front_pingtime
[index
] = front_avg
;
5574 i
->second
.hb_front_min
[index
] = front_min
;
5575 i
->second
.hb_front_max
[index
] = front_max
;
5576 ++i
->second
.hb_index
;
5580 std::lock_guard
l(service
.stat_lock
);
5581 service
.osd_stat
.hb_pingtime
[from
].last_update
= now
.sec();
5582 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5585 uint32_t min
= UINT_MAX
;
5589 uint32_t size
= (uint32_t)i
->second
.hb_back_pingtime
.size();
5590 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5592 int index
= (i
->second
.hb_index
+ k
) % size
;
5593 total
+= i
->second
.hb_back_pingtime
[index
];
5594 if (i
->second
.hb_back_min
[index
] < min
)
5595 min
= i
->second
.hb_back_min
[index
];
5596 if (i
->second
.hb_back_max
[index
] > max
)
5597 max
= i
->second
.hb_back_max
[index
];
5598 if (count
== 1 || count
== 5 || count
== 15) {
5599 service
.osd_stat
.hb_pingtime
[from
].back_pingtime
[which
] = total
/ count
;
5600 service
.osd_stat
.hb_pingtime
[from
].back_min
[which
] = min
;
5601 service
.osd_stat
.hb_pingtime
[from
].back_max
[which
] = max
;
5608 if (i
->second
.con_front
!= NULL
) {
5609 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5616 for (int32_t k
= size
- 1 ; k
>= 0; --k
) {
5618 int index
= (i
->second
.hb_index
+ k
) % size
;
5619 total
+= i
->second
.hb_front_pingtime
[index
];
5620 if (i
->second
.hb_front_min
[index
] < min
)
5621 min
= i
->second
.hb_front_min
[index
];
5622 if (i
->second
.hb_front_max
[index
] > max
)
5623 max
= i
->second
.hb_front_max
[index
];
5624 if (count
== 1 || count
== 5 || count
== 15) {
5625 service
.osd_stat
.hb_pingtime
[from
].front_pingtime
[which
] = total
/ count
;
5626 service
.osd_stat
.hb_pingtime
[from
].front_min
[which
] = min
;
5627 service
.osd_stat
.hb_pingtime
[from
].front_max
[which
] = max
;
5636 std::lock_guard
l(service
.stat_lock
);
5637 service
.osd_stat
.hb_pingtime
[from
].back_last
= back_pingtime
;
5638 if (i
->second
.con_front
!= NULL
)
5639 service
.osd_stat
.hb_pingtime
[from
].front_last
= front_pingtime
;
5641 i
->second
.ping_history
.erase(i
->second
.ping_history
.begin(), ++acked
);
5644 if (i
->second
.is_healthy(now
)) {
5645 // Cancel false reports
5646 auto failure_queue_entry
= failure_queue
.find(from
);
5647 if (failure_queue_entry
!= failure_queue
.end()) {
5648 dout(10) << "handle_osd_ping canceling queued "
5649 << "failure report for osd." << from
<< dendl
;
5650 failure_queue
.erase(failure_queue_entry
);
5653 auto failure_pending_entry
= failure_pending
.find(from
);
5654 if (failure_pending_entry
!= failure_pending
.end()) {
5655 dout(10) << "handle_osd_ping canceling in-flight "
5656 << "failure report for osd." << from
<< dendl
;
5657 send_still_alive(curmap
->get_epoch(),
5659 failure_pending_entry
->second
.second
);
5660 failure_pending
.erase(failure_pending_entry
);
5664 // old replies, deprecated by newly sent pings.
5665 dout(10) << "handle_osd_ping no pending ping(sent at " << m
->ping_stamp
5666 << ") is found, treat as covered by newly sent pings "
5673 curmap
->is_up(from
)) {
5675 ConnectionRef cluster_con
= service
.get_con_osd_cluster(
5676 from
, curmap
->get_epoch());
5678 service
.maybe_share_map(cluster_con
.get(), curmap
, m
->map_epoch
);
5683 s
->stamps
->got_ping_reply(
5687 dout(20) << __func__
<< " new stamps " << *s
->stamps
<< dendl
;
5691 case MOSDPing::YOU_DIED
:
5692 dout(10) << "handle_osd_ping " << m
->get_source_inst()
5693 << " says i am down in " << m
->map_epoch
<< dendl
;
5694 osdmap_subscribe(curmap
->get_epoch()+1, false);
5698 heartbeat_lock
.unlock();
5702 void OSD::heartbeat_entry()
5704 std::unique_lock
l(heartbeat_lock
);
5707 while (!heartbeat_stop
) {
5711 if (cct
->_conf
.get_val
<bool>("debug_disable_randomized_ping")) {
5712 wait
= (float)cct
->_conf
->osd_heartbeat_interval
;
5714 wait
= .5 + ((float)(rand() % 10)/10.0) * (float)cct
->_conf
->osd_heartbeat_interval
;
5716 auto w
= ceph::make_timespan(wait
);
5717 dout(30) << "heartbeat_entry sleeping for " << wait
<< dendl
;
5718 heartbeat_cond
.wait_for(l
, w
);
5721 dout(30) << "heartbeat_entry woke up" << dendl
;
5725 void OSD::heartbeat_check()
5727 ceph_assert(ceph_mutex_is_locked(heartbeat_lock
));
5728 utime_t now
= ceph_clock_now();
5730 // check for incoming heartbeats (move me elsewhere?)
5731 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5732 p
!= heartbeat_peers
.end();
5735 if (p
->second
.first_tx
== utime_t()) {
5736 dout(25) << "heartbeat_check we haven't sent ping to osd." << p
->first
5737 << " yet, skipping" << dendl
;
5741 dout(25) << "heartbeat_check osd." << p
->first
5742 << " first_tx " << p
->second
.first_tx
5743 << " last_tx " << p
->second
.last_tx
5744 << " last_rx_back " << p
->second
.last_rx_back
5745 << " last_rx_front " << p
->second
.last_rx_front
5747 if (p
->second
.is_unhealthy(now
)) {
5748 utime_t oldest_deadline
= p
->second
.ping_history
.begin()->second
.first
;
5749 if (p
->second
.last_rx_back
== utime_t() ||
5750 p
->second
.last_rx_front
== utime_t()) {
5751 derr
<< "heartbeat_check: no reply from "
5752 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5753 << " osd." << p
->first
5754 << " ever on either front or back, first ping sent "
5755 << p
->second
.first_tx
5756 << " (oldest deadline " << oldest_deadline
<< ")"
5759 failure_queue
[p
->first
] = p
->second
.first_tx
;
5761 derr
<< "heartbeat_check: no reply from "
5762 << p
->second
.con_front
->get_peer_addr().get_sockaddr()
5763 << " osd." << p
->first
<< " since back " << p
->second
.last_rx_back
5764 << " front " << p
->second
.last_rx_front
5765 << " (oldest deadline " << oldest_deadline
<< ")"
5768 failure_queue
[p
->first
] = std::min(p
->second
.last_rx_back
, p
->second
.last_rx_front
);
5774 void OSD::heartbeat()
5776 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock
));
5777 dout(30) << "heartbeat" << dendl
;
5781 int hb_interval
= cct
->_conf
->osd_heartbeat_interval
;
5782 int n_samples
= 86400;
5783 if (hb_interval
> 1) {
5784 n_samples
/= hb_interval
;
5789 if (getloadavg(loadavgs
, 1) == 1) {
5790 logger
->set(l_osd_loadavg
, 100 * loadavgs
[0]);
5791 daily_loadavg
= (daily_loadavg
* (n_samples
- 1) + loadavgs
[0]) / n_samples
;
5792 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg
<< dendl
;
5795 dout(30) << "heartbeat checking stats" << dendl
;
5797 // refresh peer list and osd stats
5798 vector
<int> hb_peers
;
5799 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
5800 p
!= heartbeat_peers
.end();
5802 hb_peers
.push_back(p
->first
);
5804 auto new_stat
= service
.set_osd_stat(hb_peers
, get_num_pgs());
5805 dout(5) << __func__
<< " " << new_stat
<< dendl
;
5806 ceph_assert(new_stat
.statfs
.total
);
5809 float ratio
= service
.compute_adjusted_ratio(new_stat
, &pratio
);
5811 service
.check_full_status(ratio
, pratio
);
5813 utime_t now
= ceph_clock_now();
5814 auto mnow
= service
.get_mnow();
5815 utime_t deadline
= now
;
5816 deadline
+= cct
->_conf
->osd_heartbeat_grace
;
5819 for (map
<int,HeartbeatInfo
>::iterator i
= heartbeat_peers
.begin();
5820 i
!= heartbeat_peers
.end();
5822 int peer
= i
->first
;
5823 Session
*s
= static_cast<Session
*>(i
->second
.con_back
->get_priv().get());
5825 dout(30) << "heartbeat osd." << peer
<< " has no open con" << dendl
;
5828 dout(30) << "heartbeat sending ping to osd." << peer
<< dendl
;
5830 i
->second
.last_tx
= now
;
5831 if (i
->second
.first_tx
== utime_t())
5832 i
->second
.first_tx
= now
;
5833 i
->second
.ping_history
[now
] = make_pair(deadline
,
5834 HeartbeatInfo::HEARTBEAT_MAX_CONN
);
5835 if (i
->second
.hb_interval_start
== utime_t())
5836 i
->second
.hb_interval_start
= now
;
5838 std::optional
<ceph::signedspan
> delta_ub
;
5839 s
->stamps
->sent_ping(&delta_ub
);
5841 i
->second
.con_back
->send_message(
5842 new MOSDPing(monc
->get_fsid(),
5843 service
.get_osdmap_epoch(),
5848 service
.get_up_epoch(),
5849 cct
->_conf
->osd_heartbeat_min_size
,
5852 if (i
->second
.con_front
)
5853 i
->second
.con_front
->send_message(
5854 new MOSDPing(monc
->get_fsid(),
5855 service
.get_osdmap_epoch(),
5860 service
.get_up_epoch(),
5861 cct
->_conf
->osd_heartbeat_min_size
,
5865 logger
->set(l_osd_hb_to
, heartbeat_peers
.size());
5867 // hmm.. am i all alone?
5868 dout(30) << "heartbeat lonely?" << dendl
;
5869 if (heartbeat_peers
.empty()) {
5870 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
&& is_active()) {
5871 last_mon_heartbeat
= now
;
5872 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl
;
5873 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5877 dout(30) << "heartbeat done" << dendl
;
5880 bool OSD::heartbeat_reset(Connection
*con
)
5882 std::lock_guard
l(heartbeat_lock
);
5883 auto s
= con
->get_priv();
5884 dout(20) << __func__
<< " con " << con
<< " s " << s
.get() << dendl
;
5885 con
->set_priv(nullptr);
5887 if (is_stopping()) {
5890 auto session
= static_cast<Session
*>(s
.get());
5891 auto p
= heartbeat_peers
.find(session
->peer
);
5892 if (p
!= heartbeat_peers
.end() &&
5893 (p
->second
.con_back
== con
||
5894 p
->second
.con_front
== con
)) {
5895 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5896 << ", reopening" << dendl
;
5897 p
->second
.clear_mark_down(con
);
5898 pair
<ConnectionRef
,ConnectionRef
> newcon
= service
.get_con_osd_hb(p
->second
.peer
, p
->second
.epoch
);
5900 p
->second
.con_back
= newcon
.first
.get();
5901 p
->second
.con_back
->set_priv(s
);
5902 if (newcon
.second
) {
5903 p
->second
.con_front
= newcon
.second
.get();
5904 p
->second
.con_front
->set_priv(s
);
5906 p
->second
.ping_history
.clear();
5908 dout(10) << "heartbeat_reset failed hb con " << con
<< " for osd." << p
->second
.peer
5909 << ", raced with osdmap update, closing out peer" << dendl
;
5910 heartbeat_peers
.erase(p
);
5913 dout(10) << "heartbeat_reset closing (old) failed hb con " << con
<< dendl
;
5921 // =========================================
5925 ceph_assert(ceph_mutex_is_locked(osd_lock
));
5926 dout(10) << "tick" << dendl
;
5928 utime_t now
= ceph_clock_now();
5929 // throw out any obsolete markdown log
5930 utime_t grace
= utime_t(cct
->_conf
->osd_max_markdown_period
, 0);
5931 while (!osd_markdown_log
.empty() &&
5932 osd_markdown_log
.front() + grace
< now
)
5933 osd_markdown_log
.pop_front();
5935 if (is_active() || is_waiting_for_healthy()) {
5936 maybe_update_heartbeat_peers();
5939 if (is_waiting_for_healthy()) {
5943 if (is_waiting_for_healthy() || is_booting()) {
5944 std::lock_guard
l(heartbeat_lock
);
5945 if (now
- last_mon_heartbeat
> cct
->_conf
->osd_mon_heartbeat_interval
) {
5946 last_mon_heartbeat
= now
;
5947 dout(1) << __func__
<< " checking mon for new map" << dendl
;
5948 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5954 // scrub purged_snaps every deep scrub interval
5956 const utime_t last
= superblock
.last_purged_snaps_scrub
;
5957 utime_t next
= last
;
5958 next
+= cct
->_conf
->osd_scrub_min_interval
;
5960 // use a seed that is stable for each scrub interval, but varies
5961 // by OSD to avoid any herds.
5962 rng
.seed(whoami
+ superblock
.last_purged_snaps_scrub
.sec());
5963 double r
= (rng() % 1024) / 1024;
5965 cct
->_conf
->osd_scrub_min_interval
*
5966 cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
5967 if (next
< ceph_clock_now()) {
5968 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5969 << " next " << next
<< " ... now" << dendl
;
5970 scrub_purged_snaps();
5972 dout(20) << __func__
<< " last_purged_snaps_scrub " << last
5973 << " next " << next
<< dendl
;
5977 tick_timer
.add_event_after(get_tick_interval(), new C_Tick(this));
5980 void OSD::tick_without_osd_lock()
5982 ceph_assert(ceph_mutex_is_locked(tick_timer_lock
));
5983 dout(10) << "tick_without_osd_lock" << dendl
;
5985 logger
->set(l_osd_cached_crc
, ceph::buffer::get_cached_crc());
5986 logger
->set(l_osd_cached_crc_adjusted
, ceph::buffer::get_cached_crc_adjusted());
5987 logger
->set(l_osd_missed_crc
, ceph::buffer::get_missed_crc());
5989 // refresh osd stats
5990 struct store_statfs_t stbuf
;
5991 osd_alert_list_t alerts
;
5992 int r
= store
->statfs(&stbuf
, &alerts
);
5993 ceph_assert(r
== 0);
5994 service
.set_statfs(stbuf
, alerts
);
5996 // osd_lock is not being held, which means the OSD state
5997 // might change when doing the monitor report
5998 if (is_active() || is_waiting_for_healthy()) {
6000 std::lock_guard l
{heartbeat_lock
};
6003 map_lock
.lock_shared();
6004 std::lock_guard
l(mon_report_lock
);
6007 utime_t now
= ceph_clock_now();
6008 if (service
.need_fullness_update() ||
6009 now
- last_mon_report
> cct
->_conf
->osd_mon_report_interval
) {
6010 last_mon_report
= now
;
6014 map_lock
.unlock_shared();
6016 epoch_t max_waiting_epoch
= 0;
6017 for (auto s
: shards
) {
6018 max_waiting_epoch
= std::max(max_waiting_epoch
,
6019 s
->get_max_waiting_epoch());
6021 if (max_waiting_epoch
> get_osdmap()->get_epoch()) {
6022 dout(20) << __func__
<< " max_waiting_epoch " << max_waiting_epoch
6023 << ", requesting new map" << dendl
;
6024 osdmap_subscribe(superblock
.newest_map
+ 1, false);
6029 if (!scrub_random_backoff()) {
6032 service
.promote_throttle_recalibrate();
6033 resume_creating_pg();
6034 bool need_send_beacon
= false;
6035 const auto now
= ceph::coarse_mono_clock::now();
6037 // borrow lec lock to pretect last_sent_beacon from changing
6038 std::lock_guard l
{min_last_epoch_clean_lock
};
6039 const auto elapsed
= now
- last_sent_beacon
;
6040 if (std::chrono::duration_cast
<std::chrono::seconds
>(elapsed
).count() >
6041 cct
->_conf
->osd_beacon_report_interval
) {
6042 need_send_beacon
= true;
6045 if (need_send_beacon
) {
6050 mgrc
.update_daemon_health(get_health_metrics());
6051 service
.kick_recovery_queue();
6052 tick_timer_without_osd_lock
.add_event_after(get_tick_interval(),
6053 new C_Tick_WithoutOSDLock(this));
6057 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6058 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6059 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6060 // getomap <pool> [namespace/]<obj-name>
6061 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
6062 // injectmdataerr [namespace/]<obj-name> [shardid]
6063 // injectdataerr [namespace/]<obj-name> [shardid]
6065 // set_recovery_delay [utime]
6066 void TestOpsSocketHook::test_ops(OSDService
*service
, ObjectStore
*store
,
6067 std::string_view command
,
6068 const cmdmap_t
& cmdmap
, ostream
&ss
)
6071 //Support changing the omap on a single osd by using the Admin Socket to
6072 //directly request the osd make a change.
6073 if (command
== "setomapval" || command
== "rmomapkey" ||
6074 command
== "setomapheader" || command
== "getomap" ||
6075 command
== "truncobj" || command
== "injectmdataerr" ||
6076 command
== "injectdataerr"
6080 OSDMapRef curmap
= service
->get_osdmap();
6085 cmd_getval(cmdmap
, "pool", poolstr
);
6086 pool
= curmap
->lookup_pg_pool_name(poolstr
);
6087 //If we can't find it by name then maybe id specified
6088 if (pool
< 0 && isdigit(poolstr
[0]))
6089 pool
= atoll(poolstr
.c_str());
6091 ss
<< "Invalid pool '" << poolstr
<< "''";
6095 string objname
, nspace
;
6096 cmd_getval(cmdmap
, "objname", objname
);
6097 std::size_t found
= objname
.find_first_of('/');
6098 if (found
!= string::npos
) {
6099 nspace
= objname
.substr(0, found
);
6100 objname
= objname
.substr(found
+1);
6102 object_locator_t
oloc(pool
, nspace
);
6103 r
= curmap
->object_locator_to_pg(object_t(objname
), oloc
, rawpg
);
6106 ss
<< "Invalid namespace/objname";
6111 cmd_getval(cmdmap
, "shardid", shardid
, int64_t(shard_id_t::NO_SHARD
));
6112 hobject_t
obj(object_t(objname
), string(""), CEPH_NOSNAP
, rawpg
.ps(), pool
, nspace
);
6113 ghobject_t
gobj(obj
, ghobject_t::NO_GEN
, shard_id_t(uint8_t(shardid
)));
6114 spg_t
pgid(curmap
->raw_pg_to_pg(rawpg
), shard_id_t(shardid
));
6115 if (curmap
->pg_is_ec(rawpg
)) {
6116 if ((command
!= "injectdataerr") && (command
!= "injectmdataerr")) {
6117 ss
<< "Must not call on ec pool, except injectdataerr or injectmdataerr";
6122 ObjectStore::Transaction t
;
6124 if (command
== "setomapval") {
6125 map
<string
, bufferlist
> newattrs
;
6128 cmd_getval(cmdmap
, "key", key
);
6129 cmd_getval(cmdmap
, "val", valstr
);
6132 newattrs
[key
] = val
;
6133 t
.omap_setkeys(coll_t(pgid
), ghobject_t(obj
), newattrs
);
6134 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6136 ss
<< "error=" << r
;
6139 } else if (command
== "rmomapkey") {
6141 cmd_getval(cmdmap
, "key", key
);
6143 t
.omap_rmkey(coll_t(pgid
), ghobject_t(obj
), key
);
6144 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6146 ss
<< "error=" << r
;
6149 } else if (command
== "setomapheader") {
6150 bufferlist newheader
;
6153 cmd_getval(cmdmap
, "header", headerstr
);
6154 newheader
.append(headerstr
);
6155 t
.omap_setheader(coll_t(pgid
), ghobject_t(obj
), newheader
);
6156 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6158 ss
<< "error=" << r
;
6161 } else if (command
== "getomap") {
6162 //Debug: Output entire omap
6164 map
<string
, bufferlist
> keyvals
;
6165 auto ch
= store
->open_collection(coll_t(pgid
));
6167 ss
<< "unable to open collection for " << pgid
;
6170 r
= store
->omap_get(ch
, ghobject_t(obj
), &hdrbl
, &keyvals
);
6172 ss
<< "header=" << string(hdrbl
.c_str(), hdrbl
.length());
6173 for (map
<string
, bufferlist
>::iterator it
= keyvals
.begin();
6174 it
!= keyvals
.end(); ++it
)
6175 ss
<< " key=" << (*it
).first
<< " val="
6176 << string((*it
).second
.c_str(), (*it
).second
.length());
6178 ss
<< "error=" << r
;
6181 } else if (command
== "truncobj") {
6183 cmd_getval(cmdmap
, "len", trunclen
);
6184 t
.truncate(coll_t(pgid
), ghobject_t(obj
), trunclen
);
6185 r
= store
->queue_transaction(service
->meta_ch
, std::move(t
));
6187 ss
<< "error=" << r
;
6190 } else if (command
== "injectdataerr") {
6191 store
->inject_data_error(gobj
);
6193 } else if (command
== "injectmdataerr") {
6194 store
->inject_mdata_error(gobj
);
6199 if (command
== "set_recovery_delay") {
6201 cmd_getval(cmdmap
, "utime", delay
, (int64_t)0);
6204 int r
= service
->cct
->_conf
.set_val("osd_recovery_delay_start",
6207 ss
<< "set_recovery_delay: error setting "
6208 << "osd_recovery_delay_start to '" << delay
<< "': error "
6212 service
->cct
->_conf
.apply_changes(nullptr);
6213 ss
<< "set_recovery_delay: set osd_recovery_delay_start "
6214 << "to " << service
->cct
->_conf
->osd_recovery_delay_start
;
6217 if (command
== "injectfull") {
6220 OSDService::s_names state
;
6221 cmd_getval(cmdmap
, "type", type
, string("full"));
6222 cmd_getval(cmdmap
, "count", count
, (int64_t)-1);
6223 if (type
== "none" || count
== 0) {
6227 state
= service
->get_full_state(type
);
6228 if (state
== OSDService::s_names::INVALID
) {
6229 ss
<< "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6232 service
->set_injectfull(state
, count
);
6235 ss
<< "Internal error - command=" << command
;
6238 // =========================================
6240 void OSD::ms_handle_connect(Connection
*con
)
6242 dout(10) << __func__
<< " con " << con
<< dendl
;
6243 if (con
->get_peer_type() == CEPH_ENTITY_TYPE_MON
) {
6244 std::lock_guard
l(osd_lock
);
6247 dout(10) << __func__
<< " on mon" << dendl
;
6251 } else if (is_booting()) {
6252 _send_boot(); // resend boot message
6254 map_lock
.lock_shared();
6255 std::lock_guard
l2(mon_report_lock
);
6257 utime_t now
= ceph_clock_now();
6258 last_mon_report
= now
;
6260 // resend everything, it's a new session
6263 service
.requeue_pg_temp();
6264 service
.clear_sent_ready_to_merge();
6265 service
.send_pg_temp();
6266 service
.send_ready_to_merge();
6267 service
.send_pg_created();
6271 map_lock
.unlock_shared();
6273 send_beacon(ceph::coarse_mono_clock::now());
6277 // full map requests may happen while active or pre-boot
6278 if (requested_full_first
) {
6279 rerequest_full_maps();
6284 void OSD::ms_handle_fast_connect(Connection
*con
)
6286 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6287 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6288 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6289 s
= ceph::make_ref
<Session
>(cct
, con
);
6291 dout(10) << " new session (outgoing) " << s
<< " con=" << s
->con
6292 << " addr=" << s
->con
->get_peer_addr() << dendl
;
6293 // we don't connect to clients
6294 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6295 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6300 void OSD::ms_handle_fast_accept(Connection
*con
)
6302 if (con
->get_peer_type() != CEPH_ENTITY_TYPE_MON
&&
6303 con
->get_peer_type() != CEPH_ENTITY_TYPE_MGR
) {
6304 if (auto s
= ceph::ref_cast
<Session
>(con
->get_priv()); !s
) {
6305 s
= ceph::make_ref
<Session
>(cct
, con
);
6307 dout(10) << "new session (incoming)" << s
<< " con=" << con
6308 << " addr=" << con
->get_peer_addr()
6309 << " must have raced with connect" << dendl
;
6310 ceph_assert(con
->get_peer_type() == CEPH_ENTITY_TYPE_OSD
);
6311 s
->entity_name
.set_type(CEPH_ENTITY_TYPE_OSD
);
6316 bool OSD::ms_handle_reset(Connection
*con
)
6318 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6319 dout(2) << "ms_handle_reset con " << con
<< " session " << session
.get() << dendl
;
6322 session
->wstate
.reset(con
);
6323 session
->con
->set_priv(nullptr);
6324 session
->con
.reset(); // break con <-> session ref cycle
6325 // note that we break session->con *before* the session_handle_reset
6326 // cleanup below. this avoids a race between us and
6327 // PG::add_backoff, Session::check_backoff, etc.
6328 session_handle_reset(session
);
6332 bool OSD::ms_handle_refused(Connection
*con
)
6334 if (!cct
->_conf
->osd_fast_fail_on_connection_refused
)
6337 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6338 dout(2) << "ms_handle_refused con " << con
<< " session " << session
.get() << dendl
;
6341 int type
= con
->get_peer_type();
6342 // handle only OSD failures here
6343 if (monc
&& (type
== CEPH_ENTITY_TYPE_OSD
)) {
6344 OSDMapRef osdmap
= get_osdmap();
6346 int id
= osdmap
->identify_osd_on_all_channels(con
->get_peer_addr());
6347 if (id
>= 0 && osdmap
->is_up(id
)) {
6348 // I'm cheating mon heartbeat grace logic, because we know it's not going
6349 // to respawn alone. +1 so we won't hit any boundary case.
6350 monc
->send_mon_message(
6354 osdmap
->get_addrs(id
),
6355 cct
->_conf
->osd_heartbeat_grace
+ 1,
6356 osdmap
->get_epoch(),
6357 MOSDFailure::FLAG_IMMEDIATE
| MOSDFailure::FLAG_FAILED
6365 struct CB_OSD_GetVersion
{
6367 explicit CB_OSD_GetVersion(OSD
*o
) : osd(o
) {}
6368 void operator ()(boost::system::error_code ec
, version_t newest
,
6371 osd
->_got_mon_epochs(oldest
, newest
);
6375 void OSD::start_boot()
6377 if (!_is_healthy()) {
6378 // if we are not healthy, do not mark ourselves up (yet)
6379 dout(1) << "not healthy; waiting to boot" << dendl
;
6380 if (!is_waiting_for_healthy())
6381 start_waiting_for_healthy();
6382 // send pings sooner rather than later
6386 dout(1) << __func__
<< dendl
;
6387 set_state(STATE_PREBOOT
);
6388 dout(10) << "start_boot - have maps " << superblock
.oldest_map
6389 << ".." << superblock
.newest_map
<< dendl
;
6390 monc
->get_version("osdmap", CB_OSD_GetVersion(this));
6393 void OSD::_got_mon_epochs(epoch_t oldest
, epoch_t newest
)
6395 std::lock_guard
l(osd_lock
);
6397 _preboot(oldest
, newest
);
6401 void OSD::_preboot(epoch_t oldest
, epoch_t newest
)
6403 ceph_assert(is_preboot());
6404 dout(10) << __func__
<< " _preboot mon has osdmaps "
6405 << oldest
<< ".." << newest
<< dendl
;
6407 // ensure our local fullness awareness is accurate
6409 std::lock_guard
l(heartbeat_lock
);
6413 const auto& monmap
= monc
->monmap
;
6414 const auto osdmap
= get_osdmap();
6415 // if our map within recent history, try to add ourselves to the osdmap.
6416 if (osdmap
->get_epoch() == 0) {
6417 derr
<< "waiting for initial osdmap" << dendl
;
6418 } else if (osdmap
->is_destroyed(whoami
)) {
6419 derr
<< "osdmap says I am destroyed" << dendl
;
6420 // provide a small margin so we don't livelock seeing if we
6421 // un-destroyed ourselves.
6422 if (osdmap
->get_epoch() > newest
- 1) {
6425 } else if (osdmap
->is_noup(whoami
)) {
6426 derr
<< "osdmap NOUP flag is set, waiting for it to clear" << dendl
;
6427 } else if (!osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
)) {
6428 derr
<< "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6430 } else if (service
.need_fullness_update()) {
6431 derr
<< "osdmap fullness state needs update" << dendl
;
6433 } else if (monmap
.min_mon_release
>= ceph_release_t::octopus
&&
6434 superblock
.purged_snaps_last
< superblock
.current_epoch
) {
6435 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6436 << " < newest_map " << superblock
.current_epoch
<< dendl
;
6437 _get_purged_snaps();
6438 } else if (osdmap
->get_epoch() >= oldest
- 1 &&
6439 osdmap
->get_epoch() + cct
->_conf
->osd_map_message_max
> newest
) {
6441 // wait for pgs to fully catch up in a different thread, since
6442 // this thread might be required for splitting and merging PGs to
6444 boot_finisher
.queue(
6447 std::unique_lock
l(osd_lock
);
6449 dout(10) << __func__
<< " waiting for peering work to drain"
6452 for (auto shard
: shards
) {
6453 shard
->wait_min_pg_epoch(get_osdmap_epoch());
6464 // get all the latest maps
6465 if (osdmap
->get_epoch() + 1 >= oldest
)
6466 osdmap_subscribe(osdmap
->get_epoch() + 1, false);
6468 osdmap_subscribe(oldest
- 1, true);
6471 void OSD::_get_purged_snaps()
6473 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6474 // overlapping requests to the mon, which will be somewhat inefficient, but
6475 // it should be reliable.
6476 dout(10) << __func__
<< " purged_snaps_last " << superblock
.purged_snaps_last
6477 << ", newest_map " << superblock
.current_epoch
<< dendl
;
6478 MMonGetPurgedSnaps
*m
= new MMonGetPurgedSnaps(
6479 superblock
.purged_snaps_last
+ 1,
6480 superblock
.current_epoch
+ 1);
6481 monc
->send_mon_message(m
);
6484 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply
*m
)
6486 dout(10) << __func__
<< " " << *m
<< dendl
;
6487 ObjectStore::Transaction t
;
6488 if (!is_preboot() ||
6489 m
->last
< superblock
.purged_snaps_last
) {
6492 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
6493 make_purged_snaps_oid(), &t
,
6495 superblock
.purged_snaps_last
= m
->last
;
6496 write_superblock(t
);
6497 store
->queue_transaction(
6500 service
.publish_superblock(superblock
);
6501 if (m
->last
< superblock
.current_epoch
) {
6502 _get_purged_snaps();
6510 void OSD::send_full_update()
6512 if (!service
.need_fullness_update())
6515 if (service
.is_full()) {
6516 state
= CEPH_OSD_FULL
;
6517 } else if (service
.is_backfillfull()) {
6518 state
= CEPH_OSD_BACKFILLFULL
;
6519 } else if (service
.is_nearfull()) {
6520 state
= CEPH_OSD_NEARFULL
;
6523 OSDMap::calc_state_set(state
, s
);
6524 dout(10) << __func__
<< " want state " << s
<< dendl
;
6525 monc
->send_mon_message(new MOSDFull(get_osdmap_epoch(), state
));
6528 void OSD::start_waiting_for_healthy()
6530 dout(1) << "start_waiting_for_healthy" << dendl
;
6531 set_state(STATE_WAITING_FOR_HEALTHY
);
6532 last_heartbeat_resample
= utime_t();
6534 // subscribe to osdmap updates, in case our peers really are known to be dead
6535 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6538 bool OSD::_is_healthy()
6540 if (!cct
->get_heartbeat_map()->is_healthy()) {
6541 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl
;
6545 if (is_waiting_for_healthy()) {
6546 utime_t now
= ceph_clock_now();
6547 if (osd_markdown_log
.empty()) {
6548 dout(5) << __func__
<< " force returning true since last markdown"
6549 << " was " << cct
->_conf
->osd_max_markdown_period
6550 << "s ago" << dendl
;
6553 std::lock_guard
l(heartbeat_lock
);
6554 int num
= 0, up
= 0;
6555 for (map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.begin();
6556 p
!= heartbeat_peers
.end();
6558 if (p
->second
.is_healthy(now
))
6562 if ((float)up
< (float)num
* cct
->_conf
->osd_heartbeat_min_healthy_ratio
) {
6563 dout(1) << "is_healthy false -- only " << up
<< "/" << num
<< " up peers (less than "
6564 << int(cct
->_conf
->osd_heartbeat_min_healthy_ratio
* 100.0) << "%)" << dendl
;
6572 void OSD::_send_boot()
6574 dout(10) << "_send_boot" << dendl
;
6575 Connection
*local_connection
=
6576 cluster_messenger
->get_loopback_connection().get();
6577 entity_addrvec_t client_addrs
= client_messenger
->get_myaddrs();
6578 entity_addrvec_t cluster_addrs
= cluster_messenger
->get_myaddrs();
6579 entity_addrvec_t hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6580 entity_addrvec_t hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6582 dout(20) << " initial client_addrs " << client_addrs
6583 << ", cluster_addrs " << cluster_addrs
6584 << ", hb_back_addrs " << hb_back_addrs
6585 << ", hb_front_addrs " << hb_front_addrs
6587 if (cluster_messenger
->set_addr_unknowns(client_addrs
)) {
6588 dout(10) << " assuming cluster_addrs match client_addrs "
6589 << client_addrs
<< dendl
;
6590 cluster_addrs
= cluster_messenger
->get_myaddrs();
6592 if (auto session
= local_connection
->get_priv(); !session
) {
6593 cluster_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6596 local_connection
= hb_back_server_messenger
->get_loopback_connection().get();
6597 if (hb_back_server_messenger
->set_addr_unknowns(cluster_addrs
)) {
6598 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6599 << cluster_addrs
<< dendl
;
6600 hb_back_addrs
= hb_back_server_messenger
->get_myaddrs();
6602 if (auto session
= local_connection
->get_priv(); !session
) {
6603 hb_back_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6606 local_connection
= hb_front_server_messenger
->get_loopback_connection().get();
6607 if (hb_front_server_messenger
->set_addr_unknowns(client_addrs
)) {
6608 dout(10) << " assuming hb_front_addrs match client_addrs "
6609 << client_addrs
<< dendl
;
6610 hb_front_addrs
= hb_front_server_messenger
->get_myaddrs();
6612 if (auto session
= local_connection
->get_priv(); !session
) {
6613 hb_front_server_messenger
->ms_deliver_handle_fast_connect(local_connection
);
6616 // we now know what our front and back addrs will be, and we are
6617 // about to tell the mon what our metadata (including numa bindings)
6618 // are, so now is a good time!
6619 set_numa_affinity();
6621 MOSDBoot
*mboot
= new MOSDBoot(
6622 superblock
, get_osdmap_epoch(), service
.get_boot_epoch(),
6623 hb_back_addrs
, hb_front_addrs
, cluster_addrs
,
6625 dout(10) << " final client_addrs " << client_addrs
6626 << ", cluster_addrs " << cluster_addrs
6627 << ", hb_back_addrs " << hb_back_addrs
6628 << ", hb_front_addrs " << hb_front_addrs
6630 _collect_metadata(&mboot
->metadata
);
6631 monc
->send_mon_message(mboot
);
6632 set_state(STATE_BOOTING
);
6635 void OSD::_collect_metadata(map
<string
,string
> *pm
)
6638 (*pm
)["osd_data"] = dev_path
;
6639 if (store
->get_type() == "filestore") {
6640 // not applicable for bluestore
6641 (*pm
)["osd_journal"] = journal_path
;
6643 (*pm
)["front_addr"] = stringify(client_messenger
->get_myaddrs());
6644 (*pm
)["back_addr"] = stringify(cluster_messenger
->get_myaddrs());
6645 (*pm
)["hb_front_addr"] = stringify(hb_front_server_messenger
->get_myaddrs());
6646 (*pm
)["hb_back_addr"] = stringify(hb_back_server_messenger
->get_myaddrs());
6649 (*pm
)["osd_objectstore"] = store
->get_type();
6650 (*pm
)["rotational"] = store_is_rotational
? "1" : "0";
6651 (*pm
)["journal_rotational"] = journal_is_rotational
? "1" : "0";
6652 (*pm
)["default_device_class"] = store
->get_default_device_class();
6653 string osdspec_affinity
;
6654 int r
= store
->read_meta("osdspec_affinity", &osdspec_affinity
);
6655 if (r
< 0 || osdspec_affinity
.empty()) {
6656 osdspec_affinity
= "";
6658 (*pm
)["osdspec_affinity"] = osdspec_affinity
;
6659 store
->collect_metadata(pm
);
6661 collect_sys_info(pm
, cct
);
6663 (*pm
)["front_iface"] = pick_iface(
6665 client_messenger
->get_myaddrs().front().get_sockaddr_storage());
6666 (*pm
)["back_iface"] = pick_iface(
6668 cluster_messenger
->get_myaddrs().front().get_sockaddr_storage());
6674 set
<string
> unknown
;
6675 for (auto nm
: { "front_iface", "back_iface" }) {
6676 if (!(*pm
)[nm
].size()) {
6681 int r
= get_iface_numa_node((*pm
)[nm
], &n
);
6683 unknown
.insert((*pm
)[nm
]);
6691 if (unknown
.size()) {
6692 (*pm
)["network_numa_unknown_ifaces"] = stringify(unknown
);
6694 if (!nodes
.empty()) {
6695 (*pm
)["network_numa_nodes"] = stringify(nodes
);
6697 if (node
>= 0 && nodes
.size() == 1 && unknown
.empty()) {
6698 (*pm
)["network_numa_node"] = stringify(node
);
6702 if (numa_node
>= 0) {
6703 (*pm
)["numa_node"] = stringify(numa_node
);
6704 (*pm
)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size
,
6708 set
<string
> devnames
;
6709 store
->get_devices(&devnames
);
6710 map
<string
,string
> errs
;
6711 get_device_metadata(devnames
, pm
, &errs
);
6712 for (auto& i
: errs
) {
6713 dout(1) << __func__
<< " " << i
.first
<< ": " << i
.second
<< dendl
;
6715 dout(10) << __func__
<< " " << *pm
<< dendl
;
6718 void OSD::queue_want_up_thru(epoch_t want
)
6720 std::shared_lock map_locker
{map_lock
};
6721 epoch_t cur
= get_osdmap()->get_up_thru(whoami
);
6722 std::lock_guard
report_locker(mon_report_lock
);
6723 if (want
> up_thru_wanted
) {
6724 dout(10) << "queue_want_up_thru now " << want
<< " (was " << up_thru_wanted
<< ")"
6725 << ", currently " << cur
6727 up_thru_wanted
= want
;
6730 dout(10) << "queue_want_up_thru want " << want
<< " <= queued " << up_thru_wanted
6731 << ", currently " << cur
6736 void OSD::send_alive()
6738 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6739 const auto osdmap
= get_osdmap();
6740 if (!osdmap
->exists(whoami
))
6742 epoch_t up_thru
= osdmap
->get_up_thru(whoami
);
6743 dout(10) << "send_alive up_thru currently " << up_thru
<< " want " << up_thru_wanted
<< dendl
;
6744 if (up_thru_wanted
> up_thru
) {
6745 dout(10) << "send_alive want " << up_thru_wanted
<< dendl
;
6746 monc
->send_mon_message(new MOSDAlive(osdmap
->get_epoch(), up_thru_wanted
));
6750 void OSD::request_full_map(epoch_t first
, epoch_t last
)
6752 dout(10) << __func__
<< " " << first
<< ".." << last
6753 << ", previously requested "
6754 << requested_full_first
<< ".." << requested_full_last
<< dendl
;
6755 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6756 ceph_assert(first
> 0 && last
> 0);
6757 ceph_assert(first
<= last
);
6758 ceph_assert(first
>= requested_full_first
); // we shouldn't ever ask for older maps
6759 if (requested_full_first
== 0) {
6761 requested_full_first
= first
;
6762 requested_full_last
= last
;
6763 } else if (last
<= requested_full_last
) {
6767 // additional request
6768 first
= requested_full_last
+ 1;
6769 requested_full_last
= last
;
6771 MMonGetOSDMap
*req
= new MMonGetOSDMap
;
6772 req
->request_full(first
, last
);
6773 monc
->send_mon_message(req
);
6776 void OSD::got_full_map(epoch_t e
)
6778 ceph_assert(requested_full_first
<= requested_full_last
);
6779 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6780 if (requested_full_first
== 0) {
6781 dout(20) << __func__
<< " " << e
<< ", nothing requested" << dendl
;
6784 if (e
< requested_full_first
) {
6785 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6786 << ".." << requested_full_last
6787 << ", ignoring" << dendl
;
6790 if (e
>= requested_full_last
) {
6791 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6792 << ".." << requested_full_last
<< ", resetting" << dendl
;
6793 requested_full_first
= requested_full_last
= 0;
6797 requested_full_first
= e
+ 1;
6799 dout(10) << __func__
<< " " << e
<< ", requested " << requested_full_first
6800 << ".." << requested_full_last
6801 << ", still need more" << dendl
;
6804 void OSD::requeue_failures()
6806 std::lock_guard
l(heartbeat_lock
);
6807 unsigned old_queue
= failure_queue
.size();
6808 unsigned old_pending
= failure_pending
.size();
6809 for (auto p
= failure_pending
.begin(); p
!= failure_pending
.end(); ) {
6810 failure_queue
[p
->first
] = p
->second
.first
;
6811 failure_pending
.erase(p
++);
6813 dout(10) << __func__
<< " " << old_queue
<< " + " << old_pending
<< " -> "
6814 << failure_queue
.size() << dendl
;
6817 void OSD::send_failures()
6819 ceph_assert(ceph_mutex_is_locked(map_lock
));
6820 ceph_assert(ceph_mutex_is_locked(mon_report_lock
));
6821 std::lock_guard
l(heartbeat_lock
);
6822 utime_t now
= ceph_clock_now();
6823 const auto osdmap
= get_osdmap();
6824 while (!failure_queue
.empty()) {
6825 int osd
= failure_queue
.begin()->first
;
6826 if (!failure_pending
.count(osd
)) {
6827 int failed_for
= (int)(double)(now
- failure_queue
.begin()->second
);
6828 monc
->send_mon_message(
6832 osdmap
->get_addrs(osd
),
6834 osdmap
->get_epoch()));
6835 failure_pending
[osd
] = make_pair(failure_queue
.begin()->second
,
6836 osdmap
->get_addrs(osd
));
6838 failure_queue
.erase(osd
);
6842 void OSD::send_still_alive(epoch_t epoch
, int osd
, const entity_addrvec_t
&addrs
)
6844 MOSDFailure
*m
= new MOSDFailure(monc
->get_fsid(), osd
, addrs
, 0, epoch
,
6845 MOSDFailure::FLAG_ALIVE
);
6846 monc
->send_mon_message(m
);
6849 void OSD::cancel_pending_failures()
6851 std::lock_guard
l(heartbeat_lock
);
6852 auto it
= failure_pending
.begin();
6853 while (it
!= failure_pending
.end()) {
6854 dout(10) << __func__
<< " canceling in-flight failure report for osd."
6855 << it
->first
<< dendl
;
6856 send_still_alive(get_osdmap_epoch(), it
->first
, it
->second
.second
);
6857 failure_pending
.erase(it
++);
6861 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point
& now
)
6863 const auto& monmap
= monc
->monmap
;
6864 // send beacon to mon even if we are just connected, and the monmap is not
6865 // initialized yet by then.
6866 if (monmap
.epoch
> 0 &&
6867 monmap
.get_required_features().contains_all(
6868 ceph::features::mon::FEATURE_LUMINOUS
)) {
6869 dout(20) << __func__
<< " sending" << dendl
;
6870 MOSDBeacon
* beacon
= nullptr;
6872 std::lock_guard l
{min_last_epoch_clean_lock
};
6873 beacon
= new MOSDBeacon(get_osdmap_epoch(),
6874 min_last_epoch_clean
,
6875 superblock
.last_purged_snaps_scrub
,
6876 cct
->_conf
->osd_beacon_report_interval
);
6877 beacon
->pgs
= min_last_epoch_clean_pgs
;
6878 last_sent_beacon
= now
;
6880 monc
->send_mon_message(beacon
);
6882 dout(20) << __func__
<< " not sending" << dendl
;
6886 void OSD::handle_command(MCommand
*m
)
6888 ConnectionRef con
= m
->get_connection();
6889 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
6891 con
->send_message(new MCommandReply(m
, -EACCES
));
6895 if (!session
->caps
.allow_all()) {
6896 con
->send_message(new MCommandReply(m
, -EACCES
));
6900 cct
->get_admin_socket()->queue_tell_command(m
);
6905 class unlock_guard
{
6908 explicit unlock_guard(ceph::mutex
& mutex
)
6913 unlock_guard(unlock_guard
&) = delete;
6920 void OSD::scrub_purged_snaps()
6922 dout(10) << __func__
<< dendl
;
6923 ceph_assert(ceph_mutex_is_locked(osd_lock
));
6924 SnapMapper::Scrubber
s(cct
, store
, service
.meta_ch
,
6925 make_snapmapper_oid(),
6926 make_purged_snaps_oid());
6927 clog
->debug() << "purged_snaps scrub starts";
6930 if (s
.stray
.size()) {
6931 clog
->debug() << "purged_snaps scrub found " << s
.stray
.size() << " strays";
6933 clog
->debug() << "purged_snaps scrub ok";
6935 set
<pair
<spg_t
,snapid_t
>> queued
;
6936 for (auto& [pool
, snap
, hash
, shard
] : s
.stray
) {
6937 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(pool
);
6939 dout(20) << __func__
<< " pool " << pool
<< " dne" << dendl
;
6942 pg_t
pgid(pi
->raw_hash_to_pg(hash
), pool
);
6943 spg_t
spgid(pgid
, shard
);
6944 pair
<spg_t
,snapid_t
> p(spgid
, snap
);
6945 if (queued
.count(p
)) {
6946 dout(20) << __func__
<< " pg " << spgid
<< " snap " << snap
6947 << " already queued" << dendl
;
6950 PGRef pg
= lookup_lock_pg(spgid
);
6952 dout(20) << __func__
<< " pg " << spgid
<< " not found" << dendl
;
6956 dout(10) << __func__
<< " requeue pg " << spgid
<< " " << pg
<< " snap "
6958 pg
->queue_snap_retrim(snap
);
6962 if (is_stopping()) {
6965 dout(10) << __func__
<< " done queueing pgs, updating superblock" << dendl
;
6966 ObjectStore::Transaction t
;
6967 superblock
.last_purged_snaps_scrub
= ceph_clock_now();
6968 write_superblock(t
);
6969 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
6970 ceph_assert(tr
== 0);
6972 send_beacon(ceph::coarse_mono_clock::now());
6974 dout(10) << __func__
<< " done" << dendl
;
6977 void OSD::probe_smart(const string
& only_devid
, ostream
& ss
)
6979 set
<string
> devnames
;
6980 store
->get_devices(&devnames
);
6981 uint64_t smart_timeout
= cct
->_conf
.get_val
<uint64_t>(
6982 "osd_smart_report_timeout");
6984 // == typedef std::map<std::string, mValue> mObject;
6985 json_spirit::mObject json_map
;
6987 for (auto dev
: devnames
) {
6988 // smartctl works only on physical devices; filter out any logical device
6989 if (dev
.find("dm-") == 0) {
6994 string devid
= get_device_id(dev
, &err
);
6995 if (devid
.size() == 0) {
6996 dout(10) << __func__
<< " no unique id for dev " << dev
<< " ("
6997 << err
<< "), skipping" << dendl
;
7000 if (only_devid
.size() && devid
!= only_devid
) {
7004 json_spirit::mValue smart_json
;
7005 if (block_device_get_metrics(dev
, smart_timeout
,
7007 dout(10) << "block_device_get_metrics failed for /dev/" << dev
<< dendl
;
7010 json_map
[devid
] = smart_json
;
7012 json_spirit::write(json_map
, ss
, json_spirit::pretty_print
);
7015 bool OSD::heartbeat_dispatch(Message
*m
)
7017 dout(30) << "heartbeat_dispatch " << m
<< dendl
;
7018 switch (m
->get_type()) {
7021 dout(10) << "ping from " << m
->get_source_inst() << dendl
;
7026 handle_osd_ping(static_cast<MOSDPing
*>(m
));
7030 dout(0) << "dropping unexpected message " << *m
<< " from " << m
->get_source_inst() << dendl
;
7037 bool OSD::ms_dispatch(Message
*m
)
7039 dout(20) << "OSD::ms_dispatch: " << *m
<< dendl
;
7040 if (m
->get_type() == MSG_OSD_MARK_ME_DOWN
) {
7041 service
.got_stop_ack();
7049 if (is_stopping()) {
7063 void OSDService::maybe_share_map(
7065 const OSDMapRef
& osdmap
,
7066 epoch_t peer_epoch_lb
)
7068 // NOTE: we assume caller hold something that keeps the Connection itself
7069 // pinned (e.g., an OpRequest's MessageRef).
7070 auto session
= ceph::ref_cast
<Session
>(con
->get_priv());
7075 // assume the peer has the newer of the op's sent_epoch and what
7076 // we think we sent them.
7077 session
->sent_epoch_lock
.lock();
7078 if (peer_epoch_lb
> session
->last_sent_epoch
) {
7079 dout(10) << __func__
<< " con " << con
7080 << " " << con
->get_peer_addr()
7081 << " map epoch " << session
->last_sent_epoch
7082 << " -> " << peer_epoch_lb
<< " (as per caller)" << dendl
;
7083 session
->last_sent_epoch
= peer_epoch_lb
;
7085 epoch_t last_sent_epoch
= session
->last_sent_epoch
;
7086 session
->sent_epoch_lock
.unlock();
7088 if (osdmap
->get_epoch() <= last_sent_epoch
) {
7092 send_incremental_map(last_sent_epoch
, con
, osdmap
);
7093 last_sent_epoch
= osdmap
->get_epoch();
7095 session
->sent_epoch_lock
.lock();
7096 if (session
->last_sent_epoch
< last_sent_epoch
) {
7097 dout(10) << __func__
<< " con " << con
7098 << " " << con
->get_peer_addr()
7099 << " map epoch " << session
->last_sent_epoch
7100 << " -> " << last_sent_epoch
<< " (shared)" << dendl
;
7101 session
->last_sent_epoch
= last_sent_epoch
;
7103 session
->sent_epoch_lock
.unlock();
7106 void OSD::dispatch_session_waiting(const ceph::ref_t
<Session
>& session
, OSDMapRef osdmap
)
7108 ceph_assert(ceph_mutex_is_locked(session
->session_dispatch_lock
));
7110 auto i
= session
->waiting_on_map
.begin();
7111 while (i
!= session
->waiting_on_map
.end()) {
7112 OpRequestRef op
= &(*i
);
7113 ceph_assert(ms_can_fast_dispatch(op
->get_req()));
7114 auto m
= op
->get_req
<MOSDFastDispatchOp
>();
7115 if (m
->get_min_epoch() > osdmap
->get_epoch()) {
7118 session
->waiting_on_map
.erase(i
++);
7122 if (m
->get_type() == CEPH_MSG_OSD_OP
) {
7123 pg_t actual_pgid
= osdmap
->raw_pg_to_pg(
7124 static_cast<const MOSDOp
*>(m
)->get_pg());
7125 if (!osdmap
->get_primary_shard(actual_pgid
, &pgid
)) {
7129 pgid
= m
->get_spg();
7131 enqueue_op(pgid
, std::move(op
), m
->get_map_epoch());
7134 if (session
->waiting_on_map
.empty()) {
7135 clear_session_waiting_on_map(session
);
7137 register_session_waiting_on_map(session
);
7141 void OSD::ms_fast_dispatch(Message
*m
)
7145 jaeger_tracing::init_tracer("osd-services-reinit");
7146 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl
;
7147 auto dispatch_span
= jaeger_tracing::new_span(__func__
);
7150 if (service
.is_stopping()) {
7156 switch (m
->get_type()) {
7158 dout(10) << "ping from " << m
->get_source() << dendl
;
7161 case MSG_OSD_FORCE_RECOVERY
:
7162 handle_fast_force_recovery(static_cast<MOSDForceRecovery
*>(m
));
7164 case MSG_OSD_SCRUB2
:
7165 handle_fast_scrub(static_cast<MOSDScrub2
*>(m
));
7168 case MSG_OSD_PG_CREATE2
:
7169 return handle_fast_pg_create(static_cast<MOSDPGCreate2
*>(m
));
7170 case MSG_OSD_PG_QUERY
:
7171 return handle_fast_pg_query(static_cast<MOSDPGQuery
*>(m
));
7172 case MSG_OSD_PG_NOTIFY
:
7173 return handle_fast_pg_notify(static_cast<MOSDPGNotify
*>(m
));
7174 case MSG_OSD_PG_INFO
:
7175 return handle_fast_pg_info(static_cast<MOSDPGInfo
*>(m
));
7176 case MSG_OSD_PG_REMOVE
:
7177 return handle_fast_pg_remove(static_cast<MOSDPGRemove
*>(m
));
7179 // these are single-pg messages that handle themselves
7180 case MSG_OSD_PG_LOG
:
7181 case MSG_OSD_PG_TRIM
:
7182 case MSG_OSD_PG_NOTIFY2
:
7183 case MSG_OSD_PG_QUERY2
:
7184 case MSG_OSD_PG_INFO2
:
7185 case MSG_OSD_BACKFILL_RESERVE
:
7186 case MSG_OSD_RECOVERY_RESERVE
:
7187 case MSG_OSD_PG_LEASE
:
7188 case MSG_OSD_PG_LEASE_ACK
:
7190 MOSDPeeringOp
*pm
= static_cast<MOSDPeeringOp
*>(m
);
7191 if (require_osd_peer(pm
)) {
7192 enqueue_peering_evt(
7194 PGPeeringEventRef(pm
->get_event()));
7201 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7204 osd_reqid_t reqid
= op
->get_reqid();
7206 tracepoint(osd
, ms_fast_dispatch
, reqid
.name
._type
,
7207 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
7210 op
->set_osd_parent_span(dispatch_span
);
7211 if (op
->osd_parent_span
) {
7212 auto op_req_span
= jaeger_tracing::child_span("op-request-created", op
->osd_parent_span
);
7213 op
->set_osd_parent_span(op_req_span
);
7217 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7219 // note sender epoch, min req's epoch
7220 op
->sent_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch();
7221 op
->min_epoch
= static_cast<MOSDFastDispatchOp
*>(m
)->get_min_epoch();
7222 ceph_assert(op
->min_epoch
<= op
->sent_epoch
); // sanity check!
7224 service
.maybe_inject_dispatch_delay();
7226 if (m
->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT
) ||
7227 m
->get_type() != CEPH_MSG_OSD_OP
) {
7228 // queue it directly
7230 static_cast<MOSDFastDispatchOp
*>(m
)->get_spg(),
7232 static_cast<MOSDFastDispatchOp
*>(m
)->get_map_epoch());
7234 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7235 // message that didn't have an explicit spg_t); we need to map
7236 // them to an spg_t while preserving delivery order.
7237 auto priv
= m
->get_connection()->get_priv();
7238 if (auto session
= static_cast<Session
*>(priv
.get()); session
) {
7239 std::lock_guard l
{session
->session_dispatch_lock
};
7241 session
->waiting_on_map
.push_back(*op
);
7242 OSDMapRef nextmap
= service
.get_nextmap_reserved();
7243 dispatch_session_waiting(session
, nextmap
);
7244 service
.release_map(nextmap
);
7247 OID_EVENT_TRACE_WITH_MSG(m
, "MS_FAST_DISPATCH_END", false);
7250 int OSD::ms_handle_authentication(Connection
*con
)
7253 auto s
= ceph::ref_cast
<Session
>(con
->get_priv());
7255 s
= ceph::make_ref
<Session
>(cct
, con
);
7257 s
->entity_name
= con
->get_peer_entity_name();
7258 dout(10) << __func__
<< " new session " << s
<< " con " << s
->con
7259 << " entity " << s
->entity_name
7260 << " addr " << con
->get_peer_addrs() << dendl
;
7262 dout(10) << __func__
<< " existing session " << s
<< " con " << s
->con
7263 << " entity " << s
->entity_name
7264 << " addr " << con
->get_peer_addrs() << dendl
;
7267 AuthCapsInfo
&caps_info
= con
->get_peer_caps_info();
7268 if (caps_info
.allow_all
) {
7269 s
->caps
.set_allow_all();
7270 } else if (caps_info
.caps
.length() > 0) {
7271 bufferlist::const_iterator p
= caps_info
.caps
.cbegin();
7276 catch (ceph::buffer::error
& e
) {
7277 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7278 << " failed to decode caps string" << dendl
;
7282 bool success
= s
->caps
.parse(str
);
7284 dout(10) << __func__
<< " session " << s
7285 << " " << s
->entity_name
7286 << " has caps " << s
->caps
<< " '" << str
<< "'" << dendl
;
7289 dout(10) << __func__
<< " session " << s
<< " " << s
->entity_name
7290 << " failed to parse caps '" << str
<< "'" << dendl
;
7298 void OSD::do_waiters()
7300 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7302 dout(10) << "do_waiters -- start" << dendl
;
7303 while (!finished
.empty()) {
7304 OpRequestRef next
= finished
.front();
7305 finished
.pop_front();
7308 dout(10) << "do_waiters -- finish" << dendl
;
7311 void OSD::dispatch_op(OpRequestRef op
)
7313 switch (op
->get_req()->get_type()) {
7315 case MSG_OSD_PG_CREATE
:
7316 handle_pg_create(op
);
7321 void OSD::_dispatch(Message
*m
)
7323 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7324 dout(20) << "_dispatch " << m
<< " " << *m
<< dendl
;
7326 switch (m
->get_type()) {
7327 // -- don't need OSDMap --
7329 // map and replication
7330 case CEPH_MSG_OSD_MAP
:
7331 handle_osd_map(static_cast<MOSDMap
*>(m
));
7333 case MSG_MON_GET_PURGED_SNAPS_REPLY
:
7334 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply
*>(m
));
7339 handle_scrub(static_cast<MOSDScrub
*>(m
));
7343 handle_command(static_cast<MCommand
*>(m
));
7346 // -- need OSDMap --
7348 case MSG_OSD_PG_CREATE
:
7350 OpRequestRef op
= op_tracker
.create_request
<OpRequest
, Message
*>(m
);
7352 op
->osd_trace
.init("osd op", &trace_endpoint
, &m
->trace
);
7353 // no map? starting up?
7354 if (!get_osdmap()) {
7355 dout(7) << "no OSDMap, not booted" << dendl
;
7356 logger
->inc(l_osd_waiting_for_map
);
7357 waiting_for_osdmap
.push_back(op
);
7358 op
->mark_delayed("no osdmap");
7368 // remove me post-nautilus
7369 void OSD::handle_scrub(MOSDScrub
*m
)
7371 dout(10) << "handle_scrub " << *m
<< dendl
;
7372 if (!require_mon_or_mgr_peer(m
)) {
7376 if (m
->fsid
!= monc
->get_fsid()) {
7377 dout(0) << "handle_scrub fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7386 if (!m
->scrub_pgs
.empty()) {
7388 for (auto pgid
: m
->scrub_pgs
) {
7390 if (get_osdmap()->get_primary_shard(pgid
, &pcand
) &&
7391 std::find(spgs
.begin(), spgs
.end(), pcand
) != spgs
.end()) {
7398 for (auto pgid
: spgs
) {
7399 enqueue_peering_evt(
7402 std::make_shared
<PGPeeringEvent
>(
7405 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7411 void OSD::handle_fast_scrub(MOSDScrub2
*m
)
7413 dout(10) << __func__
<< " " << *m
<< dendl
;
7414 if (!require_mon_or_mgr_peer(m
)) {
7418 if (m
->fsid
!= monc
->get_fsid()) {
7419 dout(0) << __func__
<< " fsid " << m
->fsid
<< " != " << monc
->get_fsid()
7424 for (auto pgid
: m
->scrub_pgs
) {
7425 enqueue_peering_evt(
7428 std::make_shared
<PGPeeringEvent
>(
7431 PeeringState::RequestScrub(m
->deep
, m
->repair
))));
7436 bool OSD::scrub_random_backoff()
7438 bool coin_flip
= (rand() / (double)RAND_MAX
>=
7439 cct
->_conf
->osd_scrub_backoff_ratio
);
7441 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl
;
7447 OSDService::ScrubJob::ScrubJob(CephContext
* cct
,
7448 const spg_t
& pg
, const utime_t
& timestamp
,
7449 double pool_scrub_min_interval
,
7450 double pool_scrub_max_interval
, bool must
)
7453 sched_time(timestamp
),
7456 // if not explicitly requested, postpone the scrub with a random delay
7458 double scrub_min_interval
= pool_scrub_min_interval
> 0 ?
7459 pool_scrub_min_interval
: cct
->_conf
->osd_scrub_min_interval
;
7460 double scrub_max_interval
= pool_scrub_max_interval
> 0 ?
7461 pool_scrub_max_interval
: cct
->_conf
->osd_scrub_max_interval
;
7463 sched_time
+= scrub_min_interval
;
7464 double r
= rand() / (double)RAND_MAX
;
7466 scrub_min_interval
* cct
->_conf
->osd_scrub_interval_randomize_ratio
* r
;
7467 if (scrub_max_interval
== 0) {
7468 deadline
= utime_t();
7470 deadline
+= scrub_max_interval
;
7476 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob
& rhs
) const {
7477 if (sched_time
< rhs
.sched_time
)
7479 if (sched_time
> rhs
.sched_time
)
7481 return pgid
< rhs
.pgid
;
7484 void OSDService::dumps_scrub(ceph::Formatter
*f
)
7486 ceph_assert(f
!= nullptr);
7487 std::lock_guard
l(sched_scrub_lock
);
7489 f
->open_array_section("scrubs");
7490 for (const auto &i
: sched_scrub_pg
) {
7491 f
->open_object_section("scrub");
7492 f
->dump_stream("pgid") << i
.pgid
;
7493 f
->dump_stream("sched_time") << i
.sched_time
;
7494 f
->dump_stream("deadline") << i
.deadline
;
7495 f
->dump_bool("forced", i
.sched_time
== PgScrubber::scrub_must_stamp());
7501 double OSD::scrub_sleep_time(bool must_scrub
)
7504 return cct
->_conf
->osd_scrub_sleep
;
7506 utime_t now
= ceph_clock_now();
7507 if (scrub_time_permit(now
)) {
7508 return cct
->_conf
->osd_scrub_sleep
;
7510 double normal_sleep
= cct
->_conf
->osd_scrub_sleep
;
7511 double extended_sleep
= cct
->_conf
->osd_scrub_extended_sleep
;
7512 return std::max(extended_sleep
, normal_sleep
);
7515 bool OSD::scrub_time_permit(utime_t now
)
7518 time_t tt
= now
.sec();
7519 localtime_r(&tt
, &bdt
);
7521 bool day_permit
= false;
7522 if (cct
->_conf
->osd_scrub_begin_week_day
< cct
->_conf
->osd_scrub_end_week_day
) {
7523 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
&& bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7527 if (bdt
.tm_wday
>= cct
->_conf
->osd_scrub_begin_week_day
|| bdt
.tm_wday
< cct
->_conf
->osd_scrub_end_week_day
) {
7533 dout(20) << __func__
<< " should run between week day " << cct
->_conf
->osd_scrub_begin_week_day
7534 << " - " << cct
->_conf
->osd_scrub_end_week_day
7535 << " now " << bdt
.tm_wday
<< " = no" << dendl
;
7539 bool time_permit
= false;
7540 if (cct
->_conf
->osd_scrub_begin_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7541 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
&& bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7545 if (bdt
.tm_hour
>= cct
->_conf
->osd_scrub_begin_hour
|| bdt
.tm_hour
< cct
->_conf
->osd_scrub_end_hour
) {
7550 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7551 << " - " << cct
->_conf
->osd_scrub_end_hour
7552 << " now " << bdt
.tm_hour
<< " = yes" << dendl
;
7554 dout(20) << __func__
<< " should run between " << cct
->_conf
->osd_scrub_begin_hour
7555 << " - " << cct
->_conf
->osd_scrub_end_hour
7556 << " now " << bdt
.tm_hour
<< " = no" << dendl
;
7561 bool OSD::scrub_load_below_threshold()
7564 if (getloadavg(loadavgs
, 3) != 3) {
7565 dout(10) << __func__
<< " couldn't read loadavgs\n" << dendl
;
7569 // allow scrub if below configured threshold
7570 long cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
7571 double loadavg_per_cpu
= cpus
> 0 ? loadavgs
[0] / cpus
: loadavgs
[0];
7572 if (loadavg_per_cpu
< cct
->_conf
->osd_scrub_load_threshold
) {
7573 dout(20) << __func__
<< " loadavg per cpu " << loadavg_per_cpu
7574 << " < max " << cct
->_conf
->osd_scrub_load_threshold
7575 << " = yes" << dendl
;
7579 // allow scrub if below daily avg and currently decreasing
7580 if (loadavgs
[0] < daily_loadavg
&& loadavgs
[0] < loadavgs
[2]) {
7581 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7582 << " < daily_loadavg " << daily_loadavg
7583 << " and < 15m avg " << loadavgs
[2]
7584 << " = yes" << dendl
;
7588 dout(20) << __func__
<< " loadavg " << loadavgs
[0]
7589 << " >= max " << cct
->_conf
->osd_scrub_load_threshold
7590 << " and ( >= daily_loadavg " << daily_loadavg
7591 << " or >= 15m avg " << loadavgs
[2]
7592 << ") = no" << dendl
;
7596 void OSD::sched_scrub()
7598 dout(20) << __func__
<< " sched_scrub starts" << dendl
;
7600 // if not permitted, fail fast
7601 if (!service
.can_inc_scrubs()) {
7602 dout(20) << __func__
<< ": OSD cannot inc scrubs" << dendl
;
7605 bool allow_requested_repair_only
= false;
7606 if (service
.is_recovery_active() && !cct
->_conf
->osd_scrub_during_recovery
) {
7607 if (!cct
->_conf
->osd_repair_during_recovery
) {
7608 dout(15) << __func__
<< ": not scheduling scrubs due to active recovery" << dendl
;
7611 dout(10) << __func__
7612 << " will only schedule explicitly requested repair due to active recovery"
7614 allow_requested_repair_only
= true;
7617 utime_t now
= ceph_clock_now();
7618 bool time_permit
= scrub_time_permit(now
);
7619 bool load_is_low
= scrub_load_below_threshold();
7620 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low
<< dendl
;
7622 OSDService::ScrubJob scrub_job
;
7623 if (service
.first_scrub_stamp(&scrub_job
)) {
7625 dout(30) << "sched_scrub examine " << scrub_job
.pgid
<< " at " << scrub_job
.sched_time
<< dendl
;
7627 if (scrub_job
.sched_time
> now
) {
7628 // save ourselves some effort
7629 dout(20) << "sched_scrub " << scrub_job
.pgid
<< " scheduled at " << scrub_job
.sched_time
7630 << " > " << now
<< dendl
;
7634 if ((scrub_job
.deadline
.is_zero() || scrub_job
.deadline
>= now
) && !(time_permit
&& load_is_low
)) {
7635 dout(15) << __func__
<< " not scheduling scrub for " << scrub_job
.pgid
<< " due to "
7636 << (!time_permit
? "time not permit" : "high load") << dendl
;
7640 PGRef pg
= _lookup_lock_pg(scrub_job
.pgid
);
7642 dout(20) << __func__
<< " pg " << scrub_job
.pgid
<< " not found" << dendl
;
7646 // This has already started, so go on to the next scrub job
7647 if (pg
->is_scrub_active()) {
7649 dout(20) << __func__
<< ": already in progress pgid " << scrub_job
.pgid
<< dendl
;
7652 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7653 if (allow_requested_repair_only
&& !pg
->m_planned_scrub
.must_repair
) {
7655 dout(10) << __func__
<< " skip " << scrub_job
.pgid
7656 << " because repairing is not explicitly requested on it"
7661 // If it is reserving, let it resolve before going to the next scrub job
7662 if (pg
->m_scrubber
->is_reserving()) {
7664 dout(10) << __func__
<< ": reserve in progress pgid " << scrub_job
.pgid
<< dendl
;
7667 dout(15) << "sched_scrub scrubbing " << scrub_job
.pgid
<< " at " << scrub_job
.sched_time
7668 << (pg
->get_must_scrub() ? ", explicitly requested" :
7669 (load_is_low
? ", load_is_low" : " deadline < now"))
7671 if (pg
->sched_scrub()) {
7673 dout(10) << __func__
<< " scheduled a scrub!" << " (~" << scrub_job
.pgid
<< "~)" << dendl
;
7677 } while (service
.next_scrub_stamp(scrub_job
, &scrub_job
));
7679 dout(20) << "sched_scrub done" << dendl
;
7682 void OSD::resched_all_scrubs()
7684 dout(10) << __func__
<< ": start" << dendl
;
7685 const vector
<spg_t
> pgs
= [this] {
7687 OSDService::ScrubJob job
;
7688 if (service
.first_scrub_stamp(&job
)) {
7690 pgs
.push_back(job
.pgid
);
7691 } while (service
.next_scrub_stamp(job
, &job
));
7695 for (auto& pgid
: pgs
) {
7696 dout(20) << __func__
<< ": examine " << pgid
<< dendl
;
7697 PGRef pg
= _lookup_lock_pg(pgid
);
7700 if (!pg
->m_planned_scrub
.must_scrub
&& !pg
->m_planned_scrub
.need_auto
) {
7701 dout(15) << __func__
<< ": reschedule " << pgid
<< dendl
;
7702 pg
->on_info_history_change();
7706 dout(10) << __func__
<< ": done" << dendl
;
7709 MPGStats
* OSD::collect_pg_stats()
7711 // This implementation unconditionally sends every is_primary PG's
7712 // stats every time we're called. This has equivalent cost to the
7713 // previous implementation's worst case where all PGs are busy and
7714 // their stats are always enqueued for sending.
7715 std::shared_lock l
{map_lock
};
7717 osd_stat_t cur_stat
= service
.get_osd_stat();
7718 cur_stat
.os_perf_stat
= store
->get_cur_stats();
7720 auto m
= new MPGStats(monc
->get_fsid(), get_osdmap_epoch());
7721 m
->osd_stat
= cur_stat
;
7723 std::lock_guard lec
{min_last_epoch_clean_lock
};
7724 min_last_epoch_clean
= get_osdmap_epoch();
7725 min_last_epoch_clean_pgs
.clear();
7727 std::set
<int64_t> pool_set
;
7730 for (auto& pg
: pgs
) {
7731 auto pool
= pg
->pg_id
.pgid
.pool();
7732 pool_set
.emplace((int64_t)pool
);
7733 if (!pg
->is_primary()) {
7736 pg
->get_pg_stats([&](const pg_stat_t
& s
, epoch_t lec
) {
7737 m
->pg_stat
[pg
->pg_id
.pgid
] = s
;
7738 min_last_epoch_clean
= std::min(min_last_epoch_clean
, lec
);
7739 min_last_epoch_clean_pgs
.push_back(pg
->pg_id
.pgid
);
7743 bool per_pool_stats
= false;
7744 bool per_pool_omap_stats
= false;
7745 for (auto p
: pool_set
) {
7746 int r
= store
->pool_statfs(p
, &st
, &per_pool_omap_stats
);
7747 if (r
== -ENOTSUP
) {
7751 m
->pool_stat
[p
] = st
;
7752 per_pool_stats
= true;
7756 // indicate whether we are reporting per-pool stats
7757 m
->osd_stat
.num_osds
= 1;
7758 m
->osd_stat
.num_per_pool_osds
= per_pool_stats
? 1 : 0;
7759 m
->osd_stat
.num_per_pool_omap_osds
= per_pool_omap_stats
? 1 : 0;
7764 vector
<DaemonHealthMetric
> OSD::get_health_metrics()
7766 vector
<DaemonHealthMetric
> metrics
;
7768 utime_t oldest_secs
;
7769 const utime_t now
= ceph_clock_now();
7771 too_old
-= cct
->_conf
.get_val
<double>("osd_op_complaint_time");
7773 TrackedOpRef oldest_op
;
7774 auto count_slow_ops
= [&](TrackedOp
& op
) {
7775 if (op
.get_initiated() < too_old
) {
7777 ss
<< "slow request " << op
.get_desc()
7779 << op
.get_initiated()
7781 << op
.state_string();
7782 lgeneric_subdout(cct
,osd
,20) << ss
.str() << dendl
;
7783 clog
->warn() << ss
.str();
7785 if (!oldest_op
|| op
.get_initiated() < oldest_op
->get_initiated()) {
7793 if (op_tracker
.visit_ops_in_flight(&oldest_secs
, count_slow_ops
)) {
7795 derr
<< __func__
<< " reporting " << slow
<< " slow ops, oldest is "
7796 << oldest_op
->get_desc() << dendl
;
7798 metrics
.emplace_back(daemon_metric::SLOW_OPS
, slow
, oldest_secs
);
7800 // no news is not good news.
7801 metrics
.emplace_back(daemon_metric::SLOW_OPS
, 0, 0);
7805 std::lock_guard
l(pending_creates_lock
);
7806 auto n_primaries
= pending_creates_from_mon
;
7807 for (const auto& create
: pending_creates_from_osd
) {
7808 if (create
.second
) {
7812 metrics
.emplace_back(daemon_metric::PENDING_CREATING_PGS
, n_primaries
);
7817 // =====================================================
7820 void OSD::wait_for_new_map(OpRequestRef op
)
7823 if (waiting_for_osdmap
.empty()) {
7824 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7827 logger
->inc(l_osd_waiting_for_map
);
7828 waiting_for_osdmap
.push_back(op
);
7829 op
->mark_delayed("wait for new map");
7834 * assimilate new OSDMap(s). scan pgs, etc.
7837 void OSD::note_down_osd(int peer
)
7839 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7840 cluster_messenger
->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer
));
7842 std::lock_guard l
{heartbeat_lock
};
7843 failure_queue
.erase(peer
);
7844 failure_pending
.erase(peer
);
7845 map
<int,HeartbeatInfo
>::iterator p
= heartbeat_peers
.find(peer
);
7846 if (p
!= heartbeat_peers
.end()) {
7847 p
->second
.clear_mark_down();
7848 heartbeat_peers
.erase(p
);
7852 void OSD::note_up_osd(int peer
)
7854 heartbeat_set_peers_need_update();
7857 struct C_OnMapCommit
: public Context
{
7859 epoch_t first
, last
;
7861 C_OnMapCommit(OSD
*o
, epoch_t f
, epoch_t l
, MOSDMap
*m
)
7862 : osd(o
), first(f
), last(l
), msg(m
) {}
7863 void finish(int r
) override
{
7864 osd
->_committed_osd_maps(first
, last
, msg
);
7869 void OSD::osdmap_subscribe(version_t epoch
, bool force_request
)
7871 std::lock_guard
l(osdmap_subscribe_lock
);
7872 if (latest_subscribed_epoch
>= epoch
&& !force_request
)
7875 latest_subscribed_epoch
= std::max
<uint64_t>(epoch
, latest_subscribed_epoch
);
7877 if (monc
->sub_want_increment("osdmap", epoch
, CEPH_SUBSCRIBE_ONETIME
) ||
7883 void OSD::trim_maps(epoch_t oldest
, int nreceived
, bool skip_maps
)
7885 epoch_t min
= std::min(oldest
, service
.map_cache
.cached_key_lower_bound());
7886 if (min
<= superblock
.oldest_map
)
7890 ObjectStore::Transaction t
;
7891 for (epoch_t e
= superblock
.oldest_map
; e
< min
; ++e
) {
7892 dout(20) << " removing old osdmap epoch " << e
<< dendl
;
7893 t
.remove(coll_t::meta(), get_osdmap_pobject_name(e
));
7894 t
.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e
));
7895 superblock
.oldest_map
= e
+ 1;
7897 if (num
>= cct
->_conf
->osd_target_transaction_size
&& num
>= nreceived
) {
7898 service
.publish_superblock(superblock
);
7899 write_superblock(t
);
7900 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7901 ceph_assert(tr
== 0);
7904 // skip_maps leaves us with a range of old maps if we fail to remove all
7905 // of them before moving superblock.oldest_map forward to the first map
7906 // in the incoming MOSDMap msg. so we should continue removing them in
7907 // this case, even we could do huge series of delete transactions all at
7914 service
.publish_superblock(superblock
);
7915 write_superblock(t
);
7916 int tr
= store
->queue_transaction(service
.meta_ch
, std::move(t
), nullptr);
7917 ceph_assert(tr
== 0);
7919 // we should not remove the cached maps
7920 ceph_assert(min
<= service
.map_cache
.cached_key_lower_bound());
7923 void OSD::handle_osd_map(MOSDMap
*m
)
7925 // wait for pgs to catch up
7927 // we extend the map cache pins to accomodate pgs slow to consume maps
7928 // for some period, until we hit the max_lag_factor bound, at which point
7929 // we block here to stop injesting more maps than they are able to keep
7931 epoch_t max_lag
= cct
->_conf
->osd_map_cache_size
*
7932 m_osd_pg_epoch_max_lag_factor
;
7933 ceph_assert(max_lag
> 0);
7934 epoch_t osd_min
= 0;
7935 for (auto shard
: shards
) {
7936 epoch_t min
= shard
->get_min_pg_epoch();
7937 if (osd_min
== 0 || min
< osd_min
) {
7941 epoch_t osdmap_epoch
= get_osdmap_epoch();
7943 osdmap_epoch
> max_lag
&&
7944 osdmap_epoch
- max_lag
> osd_min
) {
7945 epoch_t need
= osdmap_epoch
- max_lag
;
7946 dout(10) << __func__
<< " waiting for pgs to catch up (need " << need
7947 << " max_lag " << max_lag
<< ")" << dendl
;
7948 for (auto shard
: shards
) {
7949 epoch_t min
= shard
->get_min_pg_epoch();
7951 dout(10) << __func__
<< " waiting for pgs to consume " << need
7952 << " (shard " << shard
->shard_id
<< " min " << min
7953 << ", map cache is " << cct
->_conf
->osd_map_cache_size
7954 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7956 unlock_guard unlock
{osd_lock
};
7957 shard
->wait_min_pg_epoch(need
);
7963 ceph_assert(ceph_mutex_is_locked(osd_lock
));
7964 map
<epoch_t
,OSDMapRef
> added_maps
;
7965 map
<epoch_t
,bufferlist
> added_maps_bl
;
7966 if (m
->fsid
!= monc
->get_fsid()) {
7967 dout(0) << "handle_osd_map fsid " << m
->fsid
<< " != "
7968 << monc
->get_fsid() << dendl
;
7972 if (is_initializing()) {
7973 dout(0) << "ignoring osdmap until we have initialized" << dendl
;
7978 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
7979 if (session
&& !(session
->entity_name
.is_mon() ||
7980 session
->entity_name
.is_osd())) {
7982 dout(10) << "got osd map from Session " << session
7983 << " which we can't take maps from (not a mon or osd)" << dendl
;
7988 // share with the objecter
7990 service
.objecter
->handle_osd_map(m
);
7992 epoch_t first
= m
->get_first();
7993 epoch_t last
= m
->get_last();
7994 dout(3) << "handle_osd_map epochs [" << first
<< "," << last
<< "], i have "
7995 << superblock
.newest_map
7996 << ", src has [" << m
->oldest_map
<< "," << m
->newest_map
<< "]"
7999 logger
->inc(l_osd_map
);
8000 logger
->inc(l_osd_mape
, last
- first
+ 1);
8001 if (first
<= superblock
.newest_map
)
8002 logger
->inc(l_osd_mape_dup
, superblock
.newest_map
- first
+ 1);
8003 if (service
.max_oldest_map
< m
->oldest_map
) {
8004 service
.max_oldest_map
= m
->oldest_map
;
8005 ceph_assert(service
.max_oldest_map
>= superblock
.oldest_map
);
8008 // make sure there is something new, here, before we bother flushing
8009 // the queues and such
8010 if (last
<= superblock
.newest_map
) {
8011 dout(10) << " no new maps here, dropping" << dendl
;
8017 bool skip_maps
= false;
8018 if (first
> superblock
.newest_map
+ 1) {
8019 dout(10) << "handle_osd_map message skips epochs "
8020 << superblock
.newest_map
+ 1 << ".." << (first
-1) << dendl
;
8021 if (m
->oldest_map
<= superblock
.newest_map
+ 1) {
8022 osdmap_subscribe(superblock
.newest_map
+ 1, false);
8026 // always try to get the full range of maps--as many as we can. this
8027 // 1- is good to have
8028 // 2- is at present the only way to ensure that we get a *full* map as
8030 if (m
->oldest_map
< first
) {
8031 osdmap_subscribe(m
->oldest_map
- 1, true);
8038 ObjectStore::Transaction t
;
8039 uint64_t txn_size
= 0;
8041 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> purged_snaps
;
8043 // store new maps: queue for disk and put in the osdmap cache
8044 epoch_t start
= std::max(superblock
.newest_map
+ 1, first
);
8045 for (epoch_t e
= start
; e
<= last
; e
++) {
8046 if (txn_size
>= t
.get_num_bytes()) {
8047 derr
<< __func__
<< " transaction size overflowed" << dendl
;
8048 ceph_assert(txn_size
< t
.get_num_bytes());
8050 txn_size
= t
.get_num_bytes();
8051 map
<epoch_t
,bufferlist
>::iterator p
;
8052 p
= m
->maps
.find(e
);
8053 if (p
!= m
->maps
.end()) {
8054 dout(10) << "handle_osd_map got full map for epoch " << e
<< dendl
;
8055 OSDMap
*o
= new OSDMap
;
8056 bufferlist
& bl
= p
->second
;
8060 purged_snaps
[e
] = o
->get_new_purged_snaps();
8062 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8063 t
.write(coll_t::meta(), fulloid
, 0, bl
.length(), bl
);
8064 added_maps
[e
] = add_map(o
);
8065 added_maps_bl
[e
] = bl
;
8070 p
= m
->incremental_maps
.find(e
);
8071 if (p
!= m
->incremental_maps
.end()) {
8072 dout(10) << "handle_osd_map got inc map for epoch " << e
<< dendl
;
8073 bufferlist
& bl
= p
->second
;
8074 ghobject_t oid
= get_inc_osdmap_pobject_name(e
);
8075 t
.write(coll_t::meta(), oid
, 0, bl
.length(), bl
);
8077 OSDMap
*o
= new OSDMap
;
8080 bool got
= get_map_bl(e
- 1, obl
);
8082 auto p
= added_maps_bl
.find(e
- 1);
8083 ceph_assert(p
!= added_maps_bl
.end());
8089 OSDMap::Incremental inc
;
8090 auto p
= bl
.cbegin();
8093 if (o
->apply_incremental(inc
) < 0) {
8094 derr
<< "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc
.fsid
<< dendl
;
8095 ceph_abort_msg("bad fsid");
8099 o
->encode(fbl
, inc
.encode_features
| CEPH_FEATURE_RESERVED
);
8101 bool injected_failure
= false;
8102 if (cct
->_conf
->osd_inject_bad_map_crc_probability
> 0 &&
8103 (rand() % 10000) < cct
->_conf
->osd_inject_bad_map_crc_probability
*10000.0) {
8104 derr
<< __func__
<< " injecting map crc failure" << dendl
;
8105 injected_failure
= true;
8108 if ((inc
.have_crc
&& o
->get_crc() != inc
.full_crc
) || injected_failure
) {
8109 dout(2) << "got incremental " << e
8110 << " but failed to encode full with correct crc; requesting"
8112 clog
->warn() << "failed to encode map e" << e
<< " with expected crc";
8113 dout(20) << "my encoded map was:\n";
8114 fbl
.hexdump(*_dout
);
8117 request_full_map(e
, last
);
8120 // don't continue committing if we failed to enc the first inc map
8122 dout(10) << __func__
<< " bailing because last < start (" << last
<< "<" << start
<< ")" << dendl
;
8129 purged_snaps
[e
] = o
->get_new_purged_snaps();
8131 ghobject_t fulloid
= get_osdmap_pobject_name(e
);
8132 t
.write(coll_t::meta(), fulloid
, 0, fbl
.length(), fbl
);
8133 added_maps
[e
] = add_map(o
);
8134 added_maps_bl
[e
] = fbl
;
8138 ceph_abort_msg("MOSDMap lied about what maps it had?");
8141 // even if this map isn't from a mon, we may have satisfied our subscription
8142 monc
->sub_got("osdmap", last
);
8144 if (!m
->maps
.empty() && requested_full_first
) {
8145 dout(10) << __func__
<< " still missing full maps " << requested_full_first
8146 << ".." << requested_full_last
<< dendl
;
8147 rerequest_full_maps();
8150 if (superblock
.oldest_map
) {
8151 // make sure we at least keep pace with incoming maps
8152 trim_maps(m
->oldest_map
, last
- first
+ 1, skip_maps
);
8153 pg_num_history
.prune(superblock
.oldest_map
);
8156 if (!superblock
.oldest_map
|| skip_maps
)
8157 superblock
.oldest_map
= first
;
8158 superblock
.newest_map
= last
;
8159 superblock
.current_epoch
= last
;
8161 // note in the superblock that we were clean thru the prior epoch
8162 epoch_t boot_epoch
= service
.get_boot_epoch();
8163 if (boot_epoch
&& boot_epoch
>= superblock
.mounted
) {
8164 superblock
.mounted
= boot_epoch
;
8165 superblock
.clean_thru
= last
;
8168 // check for pg_num changes and deleted pools
8170 for (auto& i
: added_maps
) {
8172 if (!(lastmap
= service
.try_get_map(i
.first
- 1))) {
8173 dout(10) << __func__
<< " can't get previous map " << i
.first
- 1
8174 << " probably first start of this osd" << dendl
;
8178 ceph_assert(lastmap
->get_epoch() + 1 == i
.second
->get_epoch());
8179 for (auto& j
: lastmap
->get_pools()) {
8180 if (!i
.second
->have_pg_pool(j
.first
)) {
8181 pg_num_history
.log_pool_delete(i
.first
, j
.first
);
8182 dout(10) << __func__
<< " recording final pg_pool_t for pool "
8183 << j
.first
<< dendl
;
8184 // this information is needed by _make_pg() if have to restart before
8185 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8186 ghobject_t obj
= make_final_pool_info_oid(j
.first
);
8188 encode(j
.second
, bl
, CEPH_FEATURES_ALL
);
8189 string name
= lastmap
->get_pool_name(j
.first
);
8191 map
<string
,string
> profile
;
8192 if (lastmap
->get_pg_pool(j
.first
)->is_erasure()) {
8193 profile
= lastmap
->get_erasure_code_profile(
8194 lastmap
->get_pg_pool(j
.first
)->erasure_code_profile
);
8196 encode(profile
, bl
);
8197 t
.write(coll_t::meta(), obj
, 0, bl
.length(), bl
);
8198 } else if (unsigned new_pg_num
= i
.second
->get_pg_num(j
.first
);
8199 new_pg_num
!= j
.second
.get_pg_num()) {
8200 dout(10) << __func__
<< " recording pool " << j
.first
<< " pg_num "
8201 << j
.second
.get_pg_num() << " -> " << new_pg_num
<< dendl
;
8202 pg_num_history
.log_pg_num_change(i
.first
, j
.first
, new_pg_num
);
8205 for (auto& j
: i
.second
->get_pools()) {
8206 if (!lastmap
->have_pg_pool(j
.first
)) {
8207 dout(10) << __func__
<< " recording new pool " << j
.first
<< " pg_num "
8208 << j
.second
.get_pg_num() << dendl
;
8209 pg_num_history
.log_pg_num_change(i
.first
, j
.first
,
8210 j
.second
.get_pg_num());
8215 pg_num_history
.epoch
= last
;
8218 ::encode(pg_num_history
, bl
);
8219 t
.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl
.length(), bl
);
8220 dout(20) << __func__
<< " pg_num_history " << pg_num_history
<< dendl
;
8223 // record new purged_snaps
8224 if (superblock
.purged_snaps_last
== start
- 1) {
8225 SnapMapper::record_purged_snaps(cct
, store
, service
.meta_ch
,
8226 make_purged_snaps_oid(), &t
,
8228 superblock
.purged_snaps_last
= last
;
8230 dout(10) << __func__
<< " superblock purged_snaps_last is "
8231 << superblock
.purged_snaps_last
8232 << ", not recording new purged_snaps" << dendl
;
8235 // superblock and commit
8236 write_superblock(t
);
8237 t
.register_on_commit(new C_OnMapCommit(this, start
, last
, m
));
8238 store
->queue_transaction(
8241 service
.publish_superblock(superblock
);
// Called (via C_OnMapCommit) once the transaction persisting maps
// [first..last] has committed.  Advances the in-memory OSDMap epoch by
// epoch, reacts to peers going up/down, and decides whether this OSD
// must restart its boot sequence or shut down entirely.
//
// NOTE(review): this chunk was whitespace-mangled and several physical
// lines were missing; connective code (braces, declarations, returns)
// was restored to match upstream Ceph OSD.cc — verify against the
// canonical source before relying on exact statement order.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have begun while we waited
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blocklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) { // but not the new one
        if (!waited_for_reservations) {
          // wait once, before the first note_down, so reserved maps
          // are not yanked out from under in-flight operations
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // epoch_t _bind_epoch guards against acting on a map older than our
  // last rebind
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // marked down, or one of our advertised addrs no longer matches
      // what we are actually bound to
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
	if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
	  // note that this is best-effort...
	  monc->send_mon_message(
	    new MOSDMarkMeDead(
	      monc->get_fsid(),
	      whoami,
	      osdmap->get_epoch()));
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// we were marked down but are still running: rebind the
	// cluster messenger on new ports and go back to preboot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL, &up_epoch, &bind_epoch);
	do_restart = true;

	// add markdown event to the log; too many in the grace window
	// means we flap and should shut down instead of restarting
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  derr << __func__ << " marked down "
	       << osd_markdown_log.size()
	       << " > osd_max_markdown_count "
	       << cct->_conf->osd_max_markdown_count
	       << " in last " << grace << " seconds, shutting down"
	       << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_meesneger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  derr << __func__ << " marked down:"
	       << " rebind cluster_messenger failed" << dendl;
	}

	hb_back_server_messenger->mark_down_all();
	hb_front_server_messenger->mark_down_all();
	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (do_restart)
    start_boot();

  // NOTE(review): upstream releases the message ref here — confirm
  m->put();
}
// Re-derive messenger feature requirements and on-disk compat flags
// from the current OSDMap (e.g. after a CRUSH change).  Also toggles
// heartbeat authorizer requirements on the nautilus boundary and
// persists require_osd_release when it changes.
//
// NOTE(review): reconstructed from a whitespace-mangled chunk; missing
// braces/declarations restored to match upstream Ceph OSD.cc.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-way upgrade of the on-disk compat set; persisted immediately
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // pre-nautilus peers cannot send authorizers on heartbeat sessions
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
// Completion context queued on a transaction; when the split-related
// transaction commits/applies, it hands the new child PGs back to the
// OSD via _finish_splits().
struct C_FinishSplits : public Context {
  OSD *osd;        // non-owning back-pointer to the owning OSD
  set<PGRef> pgs;  // split children to finish (copied from `in`)
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    // r is ignored; finishing splits is unconditional
    osd->_finish_splits(pgs);
  }
};
// Finish instantiating split-child PGs: initialize each child's
// peering state, queue a null event at its map epoch, and register it
// with its OSDShard so queued work can find it.
//
// NOTE(review): reconstructed from a mangled chunk; lock/unlock calls
// around the per-PG body restored per upstream — verify.
void OSD::_finish_splits(set<PGRef>& pgs)
{
  dout(10) << __func__ << " " << pgs << dendl;
  if (is_stopping())
    return;
  for (set<PGRef>::iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    PG *pg = i->get();

    PeeringCtx rctx = create_context();
    pg->lock();
    dout(10) << __func__ << " " << *pg << dendl;
    epoch_t e = pg->get_osdmap_epoch();
    pg->handle_initialize(rctx);
    // null event pushes the PG through peering at its current epoch
    pg->queue_null(e, e);
    dispatch_context(rctx, pg, service.get_osdmap());
    pg->unlock();

    // route the child to the shard that owns its placement hash
    unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
    shards[shard_index]->register_and_wake_split_child(pg);
  }
};
// Record `src` as a merge source waiting for merge-target `target` at
// the epoch of `nextmap`.  Returns true once all `need` sources for
// that target have been collected (i.e. the merge can proceed).
bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
			   unsigned need)
{
  std::lock_guard l(merge_lock);
  // waiters are keyed by (epoch, target) -> {source pg_id -> PGRef}
  auto& p = merge_waiters[nextmap->get_epoch()][target];
  p[src->pg_id] = src;
  dout(10) << __func__ << " added merge_waiter " << src->pg_id
	   << " for " << target << ", have " << p.size() << "/" << need
	   << dendl;
  return p.size() == need;
}
// Advance a (locked) PG through every map epoch up to osd_epoch,
// handling pool deletion, PG splitting and PG merging along the way.
// Returns true if the caller still holds a live, locked PG; false if
// the PG was consumed (merge source, or deferred merge target) and
// already unlocked.
//
// NOTE(review): reconstructed from a whitespace-mangled chunk with
// many interior lines missing (signature tail, merge-source teardown,
// split/advance plumbing).  Restored to match upstream Ceph OSD.cc —
// verify carefully against the canonical source; conf is low.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;  // already caught up
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;  // gaps are tolerated; we advance over them
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // pg_num changed in this epoch: check for merge participation
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    // all sources present: wake the merge target
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      // all sources have arrived; take them and drop the entry
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split: collect our children in this epoch, if any
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
// Publish the freshly committed OSDMap to the rest of the daemon:
// prime pending splits/merges on each shard, prune bookkeeping, update
// PG counters, and queue null peering events so every PG observes the
// new epoch.  Caller must hold osd_lock.
//
// NOTE(review): reconstructed from a mangled chunk; the _get_pgids /
// _get_pgs fetches and a few braces were missing and were restored per
// upstream Ceph OSD.cc — verify.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // prime_splits consumes its entries; all must be claimed
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
// React to flags in the newly published map that only matter when the
// OSD is active: toggle recovery on the NORECOVER flag and release any
// operations that were parked waiting for this map.  Caller must hold
// osd_lock.
void OSD::activate_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();

  dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;

  // norecover?
  if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
    if (!service.recovery_is_paused()) {
      dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
      service.pause_recovery();
    }
  } else {
    if (service.recovery_is_paused()) {
      dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
      service.unpause_recovery();
    }
  }

  service.activate_map();

  // process waiters
  take_waiters(waiting_for_osdmap);
}
// Return true iff `m` arrived over a connection authenticated as a
// monitor; logs and returns false otherwise.
bool OSD::require_mon_peer(const Message *m)
{
  if (!m->get_connection()->peer_is_mon()) {
    dout(0) << "require_mon_peer received from non-mon "
	    << m->get_connection()->get_peer_addr()
	    << " " << *m << dendl;
    return false;
  }
  return true;
}
// Return true iff `m` arrived from a monitor or a manager; logs and
// returns false otherwise.
bool OSD::require_mon_or_mgr_peer(const Message *m)
{
  if (!m->get_connection()->peer_is_mon() &&
      !m->get_connection()->peer_is_mgr()) {
    dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
	    << m->get_connection()->get_peer_addr()
	    << " " << *m << dendl;
    return false;
  }
  return true;
}
// Return true iff `m` arrived from another OSD; logs and returns
// false otherwise.
bool OSD::require_osd_peer(const Message *m)
{
  if (!m->get_connection()->peer_is_osd()) {
    dout(0) << "require_osd_peer received from non-osd "
	    << m->get_connection()->get_peer_addr()
	    << " " << *m << dendl;
    return false;
  }
  return true;
}
// Return true iff this OSD was already "up" by `epoch` and is active;
// messages referencing an epoch before we came up, or received while
// still booting, are dropped by the caller.
//
// NOTE(review): the active-state check between the two douts was
// missing from the mangled chunk and restored per upstream — verify.
bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
{
  epoch_t up_epoch = service.get_up_epoch();
  if (epoch < up_epoch) {
    dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
    return false;
  }

  if (!is_active()) {
    dout(7) << "still in boot state, dropping message " << *m << dendl;
    return false;
  }

  return true;
}
// Verify the sending OSD is still the same daemon instance in `map`
// (up, and with matching cluster addrs).  If not, tear down the
// connection/session and return false.  `is_fast_dispatch` controls
// whether we may take the session dispatch lock (fast dispatch must
// not block on it).
//
// NOTE(review): con->mark_down() / s->con.reset() between the visible
// fragments were missing from the mangled chunk and restored per
// upstream — verify.
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
/*
 * require that we have same (or newer) map, and that
 * the source is the pg primary.
 */
bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
				    bool is_fast_dispatch)
{
  const Message *m = op->get_req();
  const auto osdmap = get_osdmap();
  dout(15) << "require_same_or_newer_map " << epoch
	   << " (i am " << osdmap->get_epoch() << ") " << m << dendl;

  ceph_assert(ceph_mutex_is_locked(osd_lock));

  // do they have a newer map?
  if (epoch > osdmap->get_epoch()) {
    // park the op until the newer map arrives
    dout(7) << "waiting for newer map epoch " << epoch
	    << " > my " << osdmap->get_epoch() << " with " << m << dendl;
    wait_for_new_map(op);
    return false;
  }

  if (!require_self_aliveness(op->get_req(), epoch)) {
    return false;
  }

  // ok, our map is same or newer.. do they still exist?
  if (m->get_connection()->get_messenger() == cluster_messenger &&
      !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
    return false;
  }

  return true;
}
// ----------------------------------------

// Split `parent` into the given child spg_t's under `nextmap`: create
// each child PG and its collection, split the parent's collection and
// in-memory state into it, and distribute the parent's stats.  New
// children are returned through *out_pgs.
//
// NOTE(review): reconstructed from a mangled chunk; the signature tail,
// child lock/unlock and split_colls/split_into argument lists were
// partially missing and restored per upstream Ceph OSD.cc — verify.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child collection's commit callbacks to its shard
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the final entry holds the parent's post-split stats
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
// Handle a legacy (pre-octopus) MOSDPGCreate from a monitor: for each
// requested pg we are acting primary for, build its initial history and
// queue a creating peering event.
//
// NOTE(review): reconstructed from a mangled chunk; loop bounds,
// `continue`s and the PGCreateInfo argument list were partially missing
// and restored per upstream Ceph OSD.cc — verify.
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    // mkpg and ctimes are parallel maps keyed identically
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
// ----------------------------------------
// peering and recovery

// Build a fresh PeeringCtx bound to the current map's require_osd_release,
// which controls the message encodings used during peering.
PeeringCtx OSD::create_context()
{
  return PeeringCtx(get_osdmap()->require_osd_release);
}
// Deliver the side effects accumulated in a PeeringCtx: send queued
// peering messages to up peers (skipped entirely if we are not up /
// active), then queue the accumulated transaction on `pg`'s collection.
//
// NOTE(review): reconstructed from a mangled chunk; `continue`s, the
// per-connection message loop and the pg->ch transaction argument were
// partially missing and restored per upstream — verify.
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // make sure the peer has at least our map before the messages
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      ls.clear();
    }
  }
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9290 void OSD::handle_fast_pg_create(MOSDPGCreate2
*m
)
9292 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9293 if (!require_mon_peer(m
)) {
9297 for (auto& p
: m
->pgs
) {
9298 spg_t pgid
= p
.first
;
9299 epoch_t created
= p
.second
.first
;
9300 utime_t created_stamp
= p
.second
.second
;
9301 auto q
= m
->pg_extra
.find(pgid
);
9302 if (q
== m
->pg_extra
.end()) {
9303 dout(20) << __func__
<< " " << pgid
<< " e" << created
9304 << "@" << created_stamp
9305 << " (no history or past_intervals)" << dendl
;
9306 // pre-octopus ... no pg history. this can be removed in Q release.
9307 enqueue_peering_evt(
9310 std::make_shared
<PGPeeringEvent
>(
9318 pg_history_t(created
, created_stamp
),
9323 dout(20) << __func__
<< " " << pgid
<< " e" << created
9324 << "@" << created_stamp
9325 << " history " << q
->second
.first
9326 << " pi " << q
->second
.second
<< dendl
;
9327 if (!q
->second
.second
.empty() &&
9328 m
->epoch
< q
->second
.second
.get_bounds().second
) {
9329 clog
->error() << "got pg_create on " << pgid
<< " epoch " << m
->epoch
9330 << " and unmatched past_intervals " << q
->second
.second
9331 << " (history " << q
->second
.first
<< ")";
9333 enqueue_peering_evt(
9336 std::make_shared
<PGPeeringEvent
>(
9353 std::lock_guard
l(pending_creates_lock
);
9354 if (pending_creates_from_mon
== 0) {
9355 last_pg_create_epoch
= m
->epoch
;
9362 void OSD::handle_fast_pg_query(MOSDPGQuery
*m
)
9364 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9365 if (!require_osd_peer(m
)) {
9369 int from
= m
->get_source().num();
9370 for (auto& p
: m
->pg_list
) {
9371 enqueue_peering_evt(
9374 std::make_shared
<PGPeeringEvent
>(
9375 p
.second
.epoch_sent
, p
.second
.epoch_sent
,
9378 pg_shard_t(from
, p
.second
.from
),
9380 p
.second
.epoch_sent
),
9387 void OSD::handle_fast_pg_notify(MOSDPGNotify
* m
)
9389 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9390 if (!require_osd_peer(m
)) {
9394 int from
= m
->get_source().num();
9395 for (auto& p
: m
->get_pg_list()) {
9396 spg_t
pgid(p
.info
.pgid
.pgid
, p
.to
);
9397 enqueue_peering_evt(
9400 std::make_shared
<PGPeeringEvent
>(
9404 pgid
, pg_shard_t(from
, p
.from
),
9406 m
->get_connection()->get_features()),
9419 void OSD::handle_fast_pg_info(MOSDPGInfo
* m
)
9421 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9422 if (!require_osd_peer(m
)) {
9426 int from
= m
->get_source().num();
9427 for (auto& p
: m
->pg_list
) {
9428 enqueue_peering_evt(
9429 spg_t(p
.info
.pgid
.pgid
, p
.to
),
9431 std::make_shared
<PGPeeringEvent
>(
9432 p
.epoch_sent
, p
.query_epoch
,
9434 pg_shard_t(from
, p
.from
),
9442 void OSD::handle_fast_pg_remove(MOSDPGRemove
*m
)
9444 dout(7) << __func__
<< " " << *m
<< " from " << m
->get_source() << dendl
;
9445 if (!require_osd_peer(m
)) {
9449 for (auto& pgid
: m
->pg_list
) {
9450 enqueue_peering_evt(
9453 std::make_shared
<PGPeeringEvent
>(
9454 m
->get_epoch(), m
->get_epoch(),
9455 PeeringState::DeleteStart())));
9460 void OSD::handle_fast_force_recovery(MOSDForceRecovery
*m
)
9462 dout(10) << __func__
<< " " << *m
<< dendl
;
9463 if (!require_mon_or_mgr_peer(m
)) {
9467 epoch_t epoch
= get_osdmap_epoch();
9468 for (auto pgid
: m
->forced_pgs
) {
9469 if (m
->options
& OFR_BACKFILL
) {
9470 if (m
->options
& OFR_CANCEL
) {
9471 enqueue_peering_evt(
9474 std::make_shared
<PGPeeringEvent
>(
9476 PeeringState::UnsetForceBackfill())));
9478 enqueue_peering_evt(
9481 std::make_shared
<PGPeeringEvent
>(
9483 PeeringState::SetForceBackfill())));
9485 } else if (m
->options
& OFR_RECOVERY
) {
9486 if (m
->options
& OFR_CANCEL
) {
9487 enqueue_peering_evt(
9490 std::make_shared
<PGPeeringEvent
>(
9492 PeeringState::UnsetForceRecovery())));
9494 enqueue_peering_evt(
9497 std::make_shared
<PGPeeringEvent
>(
9499 PeeringState::SetForceRecovery())));
9506 void OSD::handle_pg_query_nopg(const MQuery
& q
)
9508 spg_t pgid
= q
.pgid
;
9509 dout(10) << __func__
<< " " << pgid
<< dendl
;
9511 OSDMapRef osdmap
= get_osdmap();
9512 if (!osdmap
->have_pg_pool(pgid
.pool()))
9515 dout(10) << " pg " << pgid
<< " dne" << dendl
;
9516 pg_info_t
empty(spg_t(pgid
.pgid
, q
.query
.to
));
9517 ConnectionRef con
= service
.get_con_osd_cluster(q
.from
.osd
, osdmap
->get_epoch());
9520 if (q
.query
.type
== pg_query_t::LOG
||
9521 q
.query
.type
== pg_query_t::FULLLOG
) {
9523 q
.query
.from
, q
.query
.to
,
9524 osdmap
->get_epoch(), empty
,
9525 q
.query
.epoch_sent
);
9527 vector
<pg_notify_t
> ls
;
9530 q
.query
.from
, q
.query
.to
,
9532 osdmap
->get_epoch(),
9535 m
= new MOSDPGNotify(osdmap
->get_epoch(), std::move(ls
));
9537 service
.maybe_share_map(con
.get(), osdmap
);
9538 con
->send_message(m
);
9542 void OSDService::queue_check_readable(spg_t spgid
,
9544 ceph::signedspan delay
)
9546 if (delay
== ceph::signedspan::zero()) {
9547 osd
->enqueue_peering_evt(
9550 std::make_shared
<PGPeeringEvent
>(
9552 PeeringState::CheckReadable())));
9554 mono_timer
.add_event(
9556 [this, spgid
, lpr
]() {
9557 queue_check_readable(spgid
, lpr
);
9563 // =========================================================
9566 void OSDService::_maybe_queue_recovery() {
9567 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock
));
9568 uint64_t available_pushes
;
9569 while (!awaiting_throttle
.empty() &&
9570 _recover_now(&available_pushes
)) {
9571 uint64_t to_start
= std::min(
9573 cct
->_conf
->osd_recovery_max_single_start
);
9574 _queue_for_recovery(awaiting_throttle
.front(), to_start
);
9575 awaiting_throttle
.pop_front();
9576 dout(10) << __func__
<< " starting " << to_start
9577 << ", recovery_ops_reserved " << recovery_ops_reserved
9578 << " -> " << (recovery_ops_reserved
+ to_start
) << dendl
;
9579 recovery_ops_reserved
+= to_start
;
9583 bool OSDService::_recover_now(uint64_t *available_pushes
)
9585 if (available_pushes
)
9586 *available_pushes
= 0;
9588 if (ceph_clock_now() < defer_recovery_until
) {
9589 dout(15) << __func__
<< " defer until " << defer_recovery_until
<< dendl
;
9593 if (recovery_paused
) {
9594 dout(15) << __func__
<< " paused" << dendl
;
9598 uint64_t max
= osd
->get_recovery_max_active();
9599 if (max
<= recovery_ops_active
+ recovery_ops_reserved
) {
9600 dout(15) << __func__
<< " active " << recovery_ops_active
9601 << " + reserved " << recovery_ops_reserved
9602 << " >= max " << max
<< dendl
;
9606 if (available_pushes
)
9607 *available_pushes
= max
- recovery_ops_active
- recovery_ops_reserved
;
9612 unsigned OSDService::get_target_pg_log_entries() const
9614 auto num_pgs
= osd
->get_num_pgs();
9615 auto target
= cct
->_conf
->osd_target_pg_log_entries_per_osd
;
9616 if (num_pgs
> 0 && target
> 0) {
9617 // target an even spread of our budgeted log entries across all
9618 // PGs. note that while we only get to control the entry count
9619 // for primary PGs, we'll normally be responsible for a mix of
9620 // primary and replica PGs (for the same pool(s) even), so this
9622 return std::max
<unsigned>(
9623 std::min
<unsigned>(target
/ num_pgs
,
9624 cct
->_conf
->osd_max_pg_log_entries
),
9625 cct
->_conf
->osd_min_pg_log_entries
);
9627 // fall back to a per-pg value.
9628 return cct
->_conf
->osd_min_pg_log_entries
;
9632 void OSD::do_recovery(
9633 PG
*pg
, epoch_t queued
, uint64_t reserved_pushes
,
9634 ThreadPool::TPHandle
&handle
)
9636 uint64_t started
= 0;
9639 * When the value of osd_recovery_sleep is set greater than zero, recovery
9640 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9641 * recovery event's schedule time. This is done by adding a
9642 * recovery_requeue_callback event, which re-queues the recovery op using
9643 * queue_recovery_after_sleep.
9645 float recovery_sleep
= get_osd_recovery_sleep();
9647 std::lock_guard
l(service
.sleep_lock
);
9648 if (recovery_sleep
> 0 && service
.recovery_needs_sleep
) {
9650 auto recovery_requeue_callback
= new LambdaContext([this, pgref
, queued
, reserved_pushes
](int r
) {
9651 dout(20) << "do_recovery wake up at "
9653 << ", re-queuing recovery" << dendl
;
9654 std::lock_guard
l(service
.sleep_lock
);
9655 service
.recovery_needs_sleep
= false;
9656 service
.queue_recovery_after_sleep(pgref
.get(), queued
, reserved_pushes
);
9659 // This is true for the first recovery op and when the previous recovery op
9660 // has been scheduled in the past. The next recovery op is scheduled after
9661 // completing the sleep from now.
9663 if (auto now
= ceph::real_clock::now();
9664 service
.recovery_schedule_time
< now
) {
9665 service
.recovery_schedule_time
= now
;
9667 service
.recovery_schedule_time
+= ceph::make_timespan(recovery_sleep
);
9668 service
.sleep_timer
.add_event_at(service
.recovery_schedule_time
,
9669 recovery_requeue_callback
);
9670 dout(20) << "Recovery event scheduled at "
9671 << service
.recovery_schedule_time
<< dendl
;
9678 std::lock_guard
l(service
.sleep_lock
);
9679 service
.recovery_needs_sleep
= true;
9682 if (pg
->pg_has_reset_since(queued
)) {
9686 dout(10) << "do_recovery starting " << reserved_pushes
<< " " << *pg
<< dendl
;
9687 #ifdef DEBUG_RECOVERY_OIDS
9688 dout(20) << " active was " << service
.recovery_oids
[pg
->pg_id
] << dendl
;
9691 bool do_unfound
= pg
->start_recovery_ops(reserved_pushes
, handle
, &started
);
9692 dout(10) << "do_recovery started " << started
<< "/" << reserved_pushes
9693 << " on " << *pg
<< dendl
;
9696 PeeringCtx rctx
= create_context();
9697 rctx
.handle
= &handle
;
9698 pg
->find_unfound(queued
, rctx
);
9699 dispatch_context(rctx
, pg
, pg
->get_osdmap());
9704 ceph_assert(started
<= reserved_pushes
);
9705 service
.release_reserved_pushes(reserved_pushes
);
9708 void OSDService::start_recovery_op(PG
*pg
, const hobject_t
& soid
)
9710 std::lock_guard
l(recovery_lock
);
9711 dout(10) << "start_recovery_op " << *pg
<< " " << soid
9712 << " (" << recovery_ops_active
<< "/"
9713 << osd
->get_recovery_max_active() << " rops)"
9715 recovery_ops_active
++;
9717 #ifdef DEBUG_RECOVERY_OIDS
9718 dout(20) << " active was " << recovery_oids
[pg
->pg_id
] << dendl
;
9719 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
) == 0);
9720 recovery_oids
[pg
->pg_id
].insert(soid
);
9724 void OSDService::finish_recovery_op(PG
*pg
, const hobject_t
& soid
, bool dequeue
)
9726 std::lock_guard
l(recovery_lock
);
9727 dout(10) << "finish_recovery_op " << *pg
<< " " << soid
9728 << " dequeue=" << dequeue
9729 << " (" << recovery_ops_active
<< "/"
9730 << osd
->get_recovery_max_active() << " rops)"
9734 ceph_assert(recovery_ops_active
> 0);
9735 recovery_ops_active
--;
9737 #ifdef DEBUG_RECOVERY_OIDS
9738 dout(20) << " active oids was " << recovery_oids
[pg
->pg_id
] << dendl
;
9739 ceph_assert(recovery_oids
[pg
->pg_id
].count(soid
));
9740 recovery_oids
[pg
->pg_id
].erase(soid
);
9743 _maybe_queue_recovery();
9746 bool OSDService::is_recovery_active()
9748 if (cct
->_conf
->osd_debug_pretend_recovery_active
) {
9751 return local_reserver
.has_reservation() || remote_reserver
.has_reservation();
9754 void OSDService::release_reserved_pushes(uint64_t pushes
)
9756 std::lock_guard
l(recovery_lock
);
9757 dout(10) << __func__
<< "(" << pushes
<< "), recovery_ops_reserved "
9758 << recovery_ops_reserved
<< " -> " << (recovery_ops_reserved
-pushes
)
9760 ceph_assert(recovery_ops_reserved
>= pushes
);
9761 recovery_ops_reserved
-= pushes
;
9762 _maybe_queue_recovery();
9765 // =========================================================
9768 bool OSD::op_is_discardable(const MOSDOp
*op
)
9770 // drop client request if they are not connected and can't get the
9772 if (!op
->get_connection()->is_connected()) {
9778 void OSD::enqueue_op(spg_t pg
, OpRequestRef
&& op
, epoch_t epoch
)
9780 const utime_t stamp
= op
->get_req()->get_recv_stamp();
9781 const utime_t latency
= ceph_clock_now() - stamp
;
9782 const unsigned priority
= op
->get_req()->get_priority();
9783 const int cost
= op
->get_req()->get_cost();
9784 const uint64_t owner
= op
->get_req()->get_source().num();
9785 const int type
= op
->get_req()->get_type();
9787 dout(15) << "enqueue_op " << op
<< " prio " << priority
9790 << " latency " << latency
9791 << " epoch " << epoch
9792 << " " << *(op
->get_req()) << dendl
;
9793 op
->osd_trace
.event("enqueue op");
9794 op
->osd_trace
.keyval("priority", priority
);
9795 op
->osd_trace
.keyval("cost", cost
);
9797 if (op
->osd_parent_span
) {
9798 auto enqueue_span
= jaeger_tracing::child_span(__func__
, op
->osd_parent_span
);
9800 {"priority", priority
},
9808 op
->mark_queued_for_pg();
9809 logger
->tinc(l_osd_op_before_queue_op_lat
, latency
);
9810 if (type
== MSG_OSD_PG_PUSH
||
9811 type
== MSG_OSD_PG_PUSH_REPLY
) {
9814 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGRecoveryMsg(pg
, std::move(op
))),
9815 cost
, priority
, stamp
, owner
, epoch
));
9819 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(pg
, std::move(op
))),
9820 cost
, priority
, stamp
, owner
, epoch
));
9824 void OSD::enqueue_peering_evt(spg_t pgid
, PGPeeringEventRef evt
)
9826 dout(15) << __func__
<< " " << pgid
<< " " << evt
->get_desc() << dendl
;
9829 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGPeeringItem(pgid
, evt
)),
9831 cct
->_conf
->osd_peering_op_priority
,
9834 evt
->get_epoch_sent()));
9838 * NOTE: dequeue called in worker thread, with pg lock
9840 void OSD::dequeue_op(
9841 PGRef pg
, OpRequestRef op
,
9842 ThreadPool::TPHandle
&handle
)
9844 const Message
*m
= op
->get_req();
9847 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_BEGIN", false);
9849 utime_t now
= ceph_clock_now();
9850 op
->set_dequeued_time(now
);
9852 utime_t latency
= now
- m
->get_recv_stamp();
9853 dout(10) << "dequeue_op " << op
<< " prio " << m
->get_priority()
9854 << " cost " << m
->get_cost()
9855 << " latency " << latency
9857 << " pg " << *pg
<< dendl
;
9859 logger
->tinc(l_osd_op_before_dequeue_op_lat
, latency
);
9861 service
.maybe_share_map(m
->get_connection().get(),
9865 if (pg
->is_deleting())
9868 op
->mark_reached_pg();
9869 op
->osd_trace
.event("dequeue_op");
9871 pg
->do_request(op
, handle
);
9874 dout(10) << "dequeue_op " << op
<< " finish" << dendl
;
9875 OID_EVENT_TRACE_WITH_MSG(m
, "DEQUEUE_OP_END", false);
9879 void OSD::dequeue_peering_evt(
9882 PGPeeringEventRef evt
,
9883 ThreadPool::TPHandle
& handle
)
9885 PeeringCtx rctx
= create_context();
9886 auto curmap
= sdata
->get_osdmap();
9887 bool need_up_thru
= false;
9888 epoch_t same_interval_since
= 0;
9890 if (const MQuery
*q
= dynamic_cast<const MQuery
*>(evt
->evt
.get())) {
9891 handle_pg_query_nopg(*q
);
9893 derr
<< __func__
<< " unrecognized pg-less event " << evt
->get_desc() << dendl
;
9896 } else if (advance_pg(curmap
->get_epoch(), pg
, handle
, rctx
)) {
9897 pg
->do_peering_event(evt
, rctx
);
9898 if (pg
->is_deleted()) {
9902 dispatch_context(rctx
, pg
, curmap
, &handle
);
9903 need_up_thru
= pg
->get_need_up_thru();
9904 same_interval_since
= pg
->get_same_interval_since();
9909 queue_want_up_thru(same_interval_since
);
9912 service
.send_pg_temp();
9915 void OSD::dequeue_delete(
9919 ThreadPool::TPHandle
& handle
)
9921 dequeue_peering_evt(
9925 std::make_shared
<PGPeeringEvent
>(
9927 PeeringState::DeleteSome())),
9933 // --------------------------------
9935 const char** OSD::get_tracked_conf_keys() const
9937 static const char* KEYS
[] = {
9938 "osd_max_backfills",
9939 "osd_min_recovery_priority",
9940 "osd_max_trimming_pgs",
9941 "osd_op_complaint_time",
9942 "osd_op_log_threshold",
9943 "osd_op_history_size",
9944 "osd_op_history_duration",
9945 "osd_op_history_slow_op_size",
9946 "osd_op_history_slow_op_threshold",
9947 "osd_enable_op_tracker",
9948 "osd_map_cache_size",
9949 "osd_pg_epoch_max_lag_factor",
9950 "osd_pg_epoch_persisted_max_stale",
9951 "osd_recovery_sleep",
9952 "osd_recovery_sleep_hdd",
9953 "osd_recovery_sleep_ssd",
9954 "osd_recovery_sleep_hybrid",
9956 "osd_delete_sleep_hdd",
9957 "osd_delete_sleep_ssd",
9958 "osd_delete_sleep_hybrid",
9959 "osd_snap_trim_sleep",
9960 "osd_snap_trim_sleep_hdd",
9961 "osd_snap_trim_sleep_ssd",
9962 "osd_snap_trim_sleep_hybrid"
9964 "osd_recovery_max_active",
9965 "osd_recovery_max_active_hdd",
9966 "osd_recovery_max_active_ssd",
9967 // clog & admin clog
9970 "clog_to_syslog_facility",
9971 "clog_to_syslog_level",
9972 "osd_objectstore_fuse",
9974 "clog_to_graylog_host",
9975 "clog_to_graylog_port",
9978 "osd_recovery_delay_start",
9979 "osd_client_message_size_cap",
9980 "osd_client_message_cap",
9981 "osd_heartbeat_min_size",
9982 "osd_heartbeat_interval",
9983 "osd_object_clean_region_max_num_intervals",
9984 "osd_scrub_min_interval",
9985 "osd_scrub_max_interval",
9991 void OSD::handle_conf_change(const ConfigProxy
& conf
,
9992 const std::set
<std::string
> &changed
)
9994 std::lock_guard l
{osd_lock
};
9996 if (changed
.count("osd_max_backfills") ||
9997 changed
.count("osd_delete_sleep") ||
9998 changed
.count("osd_delete_sleep_hdd") ||
9999 changed
.count("osd_delete_sleep_ssd") ||
10000 changed
.count("osd_delete_sleep_hybrid") ||
10001 changed
.count("osd_snap_trim_sleep") ||
10002 changed
.count("osd_snap_trim_sleep_hdd") ||
10003 changed
.count("osd_snap_trim_sleep_ssd") ||
10004 changed
.count("osd_snap_trim_sleep_hybrid") ||
10005 changed
.count("osd_scrub_sleep") ||
10006 changed
.count("osd_recovery_sleep") ||
10007 changed
.count("osd_recovery_sleep_hdd") ||
10008 changed
.count("osd_recovery_sleep_ssd") ||
10009 changed
.count("osd_recovery_sleep_hybrid") ||
10010 changed
.count("osd_recovery_max_active") ||
10011 changed
.count("osd_recovery_max_active_hdd") ||
10012 changed
.count("osd_recovery_max_active_ssd")) {
10013 if (!maybe_override_options_for_qos() &&
10014 changed
.count("osd_max_backfills")) {
10015 // Scheduler is not "mclock". Fallback to earlier behavior
10016 service
.local_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10017 service
.remote_reserver
.set_max(cct
->_conf
->osd_max_backfills
);
10020 if (changed
.count("osd_min_recovery_priority")) {
10021 service
.local_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10022 service
.remote_reserver
.set_min_priority(cct
->_conf
->osd_min_recovery_priority
);
10024 if (changed
.count("osd_max_trimming_pgs")) {
10025 service
.snap_reserver
.set_max(cct
->_conf
->osd_max_trimming_pgs
);
10027 if (changed
.count("osd_op_complaint_time") ||
10028 changed
.count("osd_op_log_threshold")) {
10029 op_tracker
.set_complaint_and_threshold(cct
->_conf
->osd_op_complaint_time
,
10030 cct
->_conf
->osd_op_log_threshold
);
10032 if (changed
.count("osd_op_history_size") ||
10033 changed
.count("osd_op_history_duration")) {
10034 op_tracker
.set_history_size_and_duration(cct
->_conf
->osd_op_history_size
,
10035 cct
->_conf
->osd_op_history_duration
);
10037 if (changed
.count("osd_op_history_slow_op_size") ||
10038 changed
.count("osd_op_history_slow_op_threshold")) {
10039 op_tracker
.set_history_slow_op_size_and_threshold(cct
->_conf
->osd_op_history_slow_op_size
,
10040 cct
->_conf
->osd_op_history_slow_op_threshold
);
10042 if (changed
.count("osd_enable_op_tracker")) {
10043 op_tracker
.set_tracking(cct
->_conf
->osd_enable_op_tracker
);
10045 if (changed
.count("osd_map_cache_size")) {
10046 service
.map_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10047 service
.map_bl_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10048 service
.map_bl_inc_cache
.set_size(cct
->_conf
->osd_map_cache_size
);
10050 if (changed
.count("clog_to_monitors") ||
10051 changed
.count("clog_to_syslog") ||
10052 changed
.count("clog_to_syslog_level") ||
10053 changed
.count("clog_to_syslog_facility") ||
10054 changed
.count("clog_to_graylog") ||
10055 changed
.count("clog_to_graylog_host") ||
10056 changed
.count("clog_to_graylog_port") ||
10057 changed
.count("host") ||
10058 changed
.count("fsid")) {
10059 update_log_config();
10061 if (changed
.count("osd_pg_epoch_max_lag_factor")) {
10062 m_osd_pg_epoch_max_lag_factor
= conf
.get_val
<double>(
10063 "osd_pg_epoch_max_lag_factor");
10066 #ifdef HAVE_LIBFUSE
10067 if (changed
.count("osd_objectstore_fuse")) {
10069 enable_disable_fuse(false);
10074 if (changed
.count("osd_recovery_delay_start")) {
10075 service
.defer_recovery(cct
->_conf
->osd_recovery_delay_start
);
10076 service
.kick_recovery_queue();
10079 if (changed
.count("osd_client_message_cap")) {
10080 uint64_t newval
= cct
->_conf
->osd_client_message_cap
;
10081 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10082 if (pol
.throttler_messages
&& newval
> 0) {
10083 pol
.throttler_messages
->reset_max(newval
);
10086 if (changed
.count("osd_client_message_size_cap")) {
10087 uint64_t newval
= cct
->_conf
->osd_client_message_size_cap
;
10088 Messenger::Policy pol
= client_messenger
->get_policy(entity_name_t::TYPE_CLIENT
);
10089 if (pol
.throttler_bytes
&& newval
> 0) {
10090 pol
.throttler_bytes
->reset_max(newval
);
10093 if (changed
.count("osd_object_clean_region_max_num_intervals")) {
10094 ObjectCleanRegions::set_max_num_intervals(cct
->_conf
->osd_object_clean_region_max_num_intervals
);
10097 if (changed
.count("osd_scrub_min_interval") ||
10098 changed
.count("osd_scrub_max_interval")) {
10099 resched_all_scrubs();
10100 dout(0) << __func__
<< ": scrub interval change" << dendl
;
10103 if (changed
.count("osd_asio_thread_count")) {
10104 service
.poolctx
.stop();
10105 service
.poolctx
.start(conf
.get_val
<std::uint64_t>("osd_asio_thread_count"));
10109 void OSD::maybe_override_max_osd_capacity_for_qos()
10111 // If the scheduler enabled is mclock, override the default
10112 // osd capacity with the value obtained from running the
10113 // osd bench test. This is later used to setup mclock.
10114 if ((cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") &&
10115 (cct
->_conf
.get_val
<bool>("osd_mclock_skip_benchmark") == false)) {
10116 std::string max_capacity_iops_config
;
10117 bool force_run_benchmark
=
10118 cct
->_conf
.get_val
<bool>("osd_mclock_force_run_benchmark_on_init");
10120 if (store_is_rotational
) {
10121 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_hdd";
10123 max_capacity_iops_config
= "osd_mclock_max_capacity_iops_ssd";
10126 if (!force_run_benchmark
) {
10127 double default_iops
= 0.0;
10129 // Get the current osd iops capacity
10130 double cur_iops
= cct
->_conf
.get_val
<double>(max_capacity_iops_config
);
10132 // Get the default max iops capacity
10133 auto val
= cct
->_conf
.get_val_default(max_capacity_iops_config
);
10134 if (!val
.has_value()) {
10135 derr
<< __func__
<< " Unable to determine default value of "
10136 << max_capacity_iops_config
<< dendl
;
10137 // Cannot determine default iops. Force a run of the OSD benchmark.
10138 force_run_benchmark
= true;
10141 default_iops
= std::stod(val
.value());
10144 // Determine if we really need to run the osd benchmark
10145 if (!force_run_benchmark
&& (default_iops
!= cur_iops
)) {
10146 dout(1) << __func__
<< std::fixed
<< std::setprecision(2)
10147 << " default_iops: " << default_iops
10148 << " cur_iops: " << cur_iops
10149 << ". Skip OSD benchmark test." << dendl
;
10154 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
10155 int64_t count
= 12288000; // Count of bytes to write
10156 int64_t bsize
= 4096; // Block size
10157 int64_t osize
= 4194304; // Object size
10158 int64_t onum
= 100; // Count of objects to write
10159 double elapsed
= 0.0; // Time taken to complete the test
10162 int ret
= run_osd_bench_test(count
, bsize
, osize
, onum
, &elapsed
, ss
);
10165 << " osd bench err: " << ret
10166 << " osd bench errstr: " << ss
.str()
10171 double rate
= count
/ elapsed
;
10172 iops
= rate
/ bsize
;
10173 dout(1) << __func__
10174 << " osd bench result -"
10175 << std::fixed
<< std::setprecision(3)
10176 << " bandwidth (MiB/sec): " << rate
/ (1024 * 1024)
10177 << " iops: " << iops
10178 << " elapsed_sec: " << elapsed
10181 // Persist iops to the MON store
10182 ret
= mon_cmd_set_config(max_capacity_iops_config
, std::to_string(iops
));
10184 // Fallback to setting the config within the in-memory "values" map.
10185 cct
->_conf
.set_val(max_capacity_iops_config
, std::to_string(iops
));
10188 // Override the max osd capacity for all shards
10189 for (auto& shard
: shards
) {
10190 shard
->update_scheduler_config();
10195 bool OSD::maybe_override_options_for_qos()
10197 // If the scheduler enabled is mclock, override the recovery, backfill
10198 // and sleep options so that mclock can meet the QoS goals.
10199 if (cct
->_conf
.get_val
<std::string
>("osd_op_queue") == "mclock_scheduler") {
10200 dout(1) << __func__
10201 << ": Changing recovery/backfill/sleep settings for QoS" << dendl
;
10203 // Set high value for recovery max active
10204 uint32_t rec_max_active
= 1000;
10205 cct
->_conf
.set_val(
10206 "osd_recovery_max_active", std::to_string(rec_max_active
));
10207 cct
->_conf
.set_val(
10208 "osd_recovery_max_active_hdd", std::to_string(rec_max_active
));
10209 cct
->_conf
.set_val(
10210 "osd_recovery_max_active_ssd", std::to_string(rec_max_active
));
10212 // Set high value for osd_max_backfill
10213 uint32_t max_backfills
= 1000;
10214 cct
->_conf
.set_val("osd_max_backfills", std::to_string(max_backfills
));
10215 service
.local_reserver
.set_max(max_backfills
);
10216 service
.remote_reserver
.set_max(max_backfills
);
10218 // Disable recovery sleep
10219 cct
->_conf
.set_val("osd_recovery_sleep", std::to_string(0));
10220 cct
->_conf
.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10221 cct
->_conf
.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10222 cct
->_conf
.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10224 // Disable delete sleep
10225 cct
->_conf
.set_val("osd_delete_sleep", std::to_string(0));
10226 cct
->_conf
.set_val("osd_delete_sleep_hdd", std::to_string(0));
10227 cct
->_conf
.set_val("osd_delete_sleep_ssd", std::to_string(0));
10228 cct
->_conf
.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10230 // Disable snap trim sleep
10231 cct
->_conf
.set_val("osd_snap_trim_sleep", std::to_string(0));
10232 cct
->_conf
.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10233 cct
->_conf
.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10234 cct
->_conf
.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10236 // Disable scrub sleep
10237 cct
->_conf
.set_val("osd_scrub_sleep", std::to_string(0));
10243 int OSD::mon_cmd_set_config(const std::string
&key
, const std::string
&val
)
10247 "\"prefix\": \"config set\", "
10248 "\"who\": \"osd." + std::to_string(whoami
) + "\", "
10249 "\"name\": \"" + key
+ "\", "
10250 "\"value\": \"" + val
+ "\""
10253 vector
<std::string
> vcmd
{cmd
};
10257 monc
->start_mon_command(vcmd
, inbl
, nullptr, &outs
, &cond
);
10258 int r
= cond
.wait();
10260 derr
<< __func__
<< " Failed to set config key " << key
10261 << " err: " << cpp_strerror(r
)
10262 << " errstr: " << outs
<< dendl
;
10269 void OSD::update_log_config()
10271 map
<string
,string
> log_to_monitors
;
10272 map
<string
,string
> log_to_syslog
;
10273 map
<string
,string
> log_channel
;
10274 map
<string
,string
> log_prio
;
10275 map
<string
,string
> log_to_graylog
;
10276 map
<string
,string
> log_to_graylog_host
;
10277 map
<string
,string
> log_to_graylog_port
;
10281 if (parse_log_client_options(cct
, log_to_monitors
, log_to_syslog
,
10282 log_channel
, log_prio
, log_to_graylog
,
10283 log_to_graylog_host
, log_to_graylog_port
,
10285 clog
->update_config(log_to_monitors
, log_to_syslog
,
10286 log_channel
, log_prio
, log_to_graylog
,
10287 log_to_graylog_host
, log_to_graylog_port
,
10289 derr
<< "log_to_monitors " << log_to_monitors
<< dendl
;
10292 void OSD::check_config()
10294 // some sanity checks
10295 if (cct
->_conf
->osd_map_cache_size
<= (int)cct
->_conf
->osd_pg_epoch_persisted_max_stale
+ 2) {
10296 clog
->warn() << "osd_map_cache_size (" << cct
->_conf
->osd_map_cache_size
<< ")"
10297 << " is not > osd_pg_epoch_persisted_max_stale ("
10298 << cct
->_conf
->osd_pg_epoch_persisted_max_stale
<< ")";
10300 if (cct
->_conf
->osd_object_clean_region_max_num_intervals
< 0) {
10301 clog
->warn() << "osd_object_clean_region_max_num_intervals ("
10302 << cct
->_conf
->osd_object_clean_region_max_num_intervals
10307 // --------------------------------
10309 void OSD::get_latest_osdmap()
10311 dout(10) << __func__
<< " -- start" << dendl
;
10313 boost::system::error_code ec
;
10314 service
.objecter
->wait_for_latest_osdmap(ceph::async::use_blocked
[ec
]);
10316 dout(10) << __func__
<< " -- finish" << dendl
;
10319 // --------------------------------
10321 void OSD::set_perf_queries(const ConfigPayload
&config_payload
) {
10322 const OSDConfigPayload
&osd_config_payload
= boost::get
<OSDConfigPayload
>(config_payload
);
10323 const std::map
<OSDPerfMetricQuery
, OSDPerfMetricLimits
> &queries
= osd_config_payload
.config
;
10324 dout(10) << "setting " << queries
.size() << " queries" << dendl
;
10326 std::list
<OSDPerfMetricQuery
> supported_queries
;
10327 for (auto &it
: queries
) {
10328 auto &query
= it
.first
;
10329 if (!query
.key_descriptor
.empty()) {
10330 supported_queries
.push_back(query
);
10333 if (supported_queries
.size() < queries
.size()) {
10334 dout(1) << queries
.size() - supported_queries
.size()
10335 << " unsupported queries" << dendl
;
10338 std::lock_guard locker
{m_perf_queries_lock
};
10339 m_perf_queries
= supported_queries
;
10340 m_perf_limits
= queries
;
10342 std::vector
<PGRef
> pgs
;
10344 for (auto& pg
: pgs
) {
10345 std::scoped_lock l
{*pg
};
10346 pg
->set_dynamic_perf_stats_queries(supported_queries
);
10350 MetricPayload
OSD::get_perf_reports() {
10351 OSDMetricPayload payload
;
10352 std::map
<OSDPerfMetricQuery
, OSDPerfMetricReport
> &reports
= payload
.report
;
10354 std::vector
<PGRef
> pgs
;
10356 DynamicPerfStats dps
;
10357 for (auto& pg
: pgs
) {
10358 // m_perf_queries can be modified only in set_perf_queries by mgr client
10359 // request, and it is protected by by mgr client's lock, which is held
10360 // when set_perf_queries/get_perf_reports are called, so we may not hold
10361 // m_perf_queries_lock here.
10362 DynamicPerfStats
pg_dps(m_perf_queries
);
10364 pg
->get_dynamic_perf_stats(&pg_dps
);
10368 dps
.add_to_reports(m_perf_limits
, &reports
);
10369 dout(20) << "reports for " << reports
.size() << " queries" << dendl
;
10374 // =============================================================
10376 #undef dout_context
10377 #define dout_context cct
10379 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10381 void OSDShard::_attach_pg(OSDShardPGSlot
*slot
, PG
*pg
)
10383 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
10385 pg
->osd_shard
= this;
10386 pg
->pg_slot
= slot
;
10387 osd
->inc_num_pgs();
10389 slot
->epoch
= pg
->get_osdmap_epoch();
10390 pg_slots_by_epoch
.insert(*slot
);
10393 void OSDShard::_detach_pg(OSDShardPGSlot
*slot
)
10395 dout(10) << slot
->pg
->pg_id
<< " " << slot
->pg
<< dendl
;
10396 slot
->pg
->osd_shard
= nullptr;
10397 slot
->pg
->pg_slot
= nullptr;
10398 slot
->pg
= nullptr;
10399 osd
->dec_num_pgs();
10401 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10403 if (waiting_for_min_pg_epoch
) {
10404 min_pg_epoch_cond
.notify_all();
// Move a slot to a new epoch position in pg_slots_by_epoch: erase, update,
// re-insert, then wake wait_min_pg_epoch() waiters in case the minimum moved.
// NOTE(review): the statement assigning the new epoch to the slot (between
// the erase at 10413 and the insert at 10416) appears elided by extraction --
// confirm `slot->epoch = e;` exists upstream.
10408 void OSDShard::update_pg_epoch(OSDShardPGSlot
*slot
, epoch_t e
)
10410 std::lock_guard
l(shard_lock
);
10411 dout(30) << "min was " << pg_slots_by_epoch
.begin()->epoch
10412 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
// Intrusive-set members must be removed before their key (epoch) changes.
10413 pg_slots_by_epoch
.erase(pg_slots_by_epoch
.iterator_to(*slot
));
10414 dout(20) << slot
->pg
->pg_id
<< " " << slot
->epoch
<< " -> " << e
<< dendl
;
10416 pg_slots_by_epoch
.insert(*slot
);
10417 dout(30) << "min is now " << pg_slots_by_epoch
.begin()->epoch
10418 << " on " << pg_slots_by_epoch
.begin()->pg
->pg_id
<< dendl
;
// Re-insertion may have raised the minimum epoch; notify waiters.
10419 if (waiting_for_min_pg_epoch
) {
10420 min_pg_epoch_cond
.notify_all();
// Return the smallest osdmap epoch among this shard's attached PG slots
// (front of the epoch-ordered pg_slots_by_epoch set), under shard_lock.
// NOTE(review): the return statements (empty-set case and the normal
// `return p->epoch;`) are elided by extraction -- confirm upstream.
10424 epoch_t
OSDShard::get_min_pg_epoch()
10426 std::lock_guard
l(shard_lock
);
10427 auto p
= pg_slots_by_epoch
.begin();
// Empty set means no PGs are attached; upstream presumably returns 0 here.
10428 if (p
== pg_slots_by_epoch
.end()) {
// Block until every PG slot on this shard has advanced to at least `need`,
// i.e. until the minimum slot epoch >= need (or there are no slots at all).
// Uses a predicate wait on min_pg_epoch_cond; _detach_pg/update_pg_epoch
// notify it. The waiting_for_min_pg_epoch counter gates those notifies.
// NOTE(review): the lambda's `return true/false` lines are elided by
// extraction; the visible branches imply empty-set and epoch>=need both
// satisfy the wait -- confirm upstream.
10434 void OSDShard::wait_min_pg_epoch(epoch_t need
)
10436 std::unique_lock l
{shard_lock
};
10437 ++waiting_for_min_pg_epoch
;
10438 min_pg_epoch_cond
.wait(l
, [need
, this] {
10439 if (pg_slots_by_epoch
.empty()) {
10441 } else if (pg_slots_by_epoch
.begin()->epoch
>= need
) {
10444 dout(10) << need
<< " waiting on "
10445 << pg_slots_by_epoch
.begin()->epoch
<< dendl
;
10449 --waiting_for_min_pg_epoch
;
// Return the largest epoch any slot on this shard is waiting for in its
// waiting_peering map (rbegin()->first is the max key of each std::map).
// NOTE(review): the declaration/initialization of `r` and the final
// `return r;` are elided by extraction -- confirm upstream.
10452 epoch_t
OSDShard::get_max_waiting_epoch()
10454 std::lock_guard
l(shard_lock
);
// Scan every slot; waiting_peering is keyed by epoch, so its last key is
// the highest epoch that slot is blocked on.
10456 for (auto& i
: pg_slots
) {
10457 if (!i
.second
->waiting_peering
.empty()) {
10458 r
= std::max(r
, i
.second
->waiting_peering
.rbegin()->first
);
// Install a new osdmap on this shard and reconcile every PG slot with it:
//  - requeue waiting_peering items whose epoch the new map now satisfies,
//  - for slots whose pgid still maps to this OSD, keep waiting items,
//  - otherwise drop waiting items at or below the new epoch, crediting their
//    reserved recovery pushes back via *pushes_to_free,
//  - prune slots that are completely idle,
// then poke one worker thread. Slots mid-split or mid-merge are left alone.
// NOTE(review): extraction elided several lines here (e.g. `continue`s,
// closing braces, part of the "stale/misdirected" ternary at 10526); compare
// with upstream before editing.
10464 void OSDShard::consume_map(
10465 const OSDMapRef
& new_osdmap
,
10466 unsigned *pushes_to_free
)
10468 std::lock_guard
l(shard_lock
);
10469 OSDMapRef old_osdmap
;
// Swap in the new map under the dedicated osdmap_lock (readers take only
// that lock); keep the old ref so its epoch can be logged below.
10471 std::lock_guard
l(osdmap_lock
);
10472 old_osdmap
= std::move(shard_osdmap
);
10473 shard_osdmap
= new_osdmap
;
10475 dout(10) << new_osdmap
->get_epoch()
10476 << " (was " << (old_osdmap
? old_osdmap
->get_epoch() : 0) << ")"
10478 bool queued
= false;
// Walk all slots; the loop erases as it goes, so it uses iterator style.
10481 auto p
= pg_slots
.begin();
10482 while (p
!= pg_slots
.end()) {
10483 OSDShardPGSlot
*slot
= p
->second
.get();
10484 const spg_t
& pgid
= p
->first
;
10485 dout(20) << __func__
<< " " << pgid
<< dendl
;
// Slots still waiting on a split are skipped entirely.
10486 if (!slot
->waiting_for_split
.empty()) {
10487 dout(20) << __func__
<< " " << pgid
10488 << " waiting for split " << slot
->waiting_for_split
<< dendl
;
// Likewise slots waiting on a merge that the new map hasn't reached yet.
10492 if (slot
->waiting_for_merge_epoch
> new_osdmap
->get_epoch()) {
10493 dout(20) << __func__
<< " " << pgid
10494 << " waiting for merge by epoch " << slot
->waiting_for_merge_epoch
// Requeue peering waiters whose first wanted epoch the new map satisfies.
10499 if (!slot
->waiting_peering
.empty()) {
10500 epoch_t first
= slot
->waiting_peering
.begin()->first
;
10501 if (first
<= new_osdmap
->get_epoch()) {
10502 dout(20) << __func__
<< " " << pgid
10503 << " pending_peering first epoch " << first
10504 << " <= " << new_osdmap
->get_epoch() << ", requeueing" << dendl
;
10505 _wake_pg_slot(pgid
, slot
);
10511 if (!slot
->waiting
.empty()) {
// If the pgid still maps to this OSD shard, the waiters stay queued.
10512 if (new_osdmap
->is_up_acting_osd_shard(pgid
, osd
->get_nodeid())) {
10513 dout(20) << __func__
<< " " << pgid
<< " maps to us, keeping"
// Otherwise drop every waiter already covered by the new epoch, returning
// its reserved pushes to the caller's tally.
10518 while (!slot
->waiting
.empty() &&
10519 slot
->waiting
.front().get_map_epoch() <= new_osdmap
->get_epoch()) {
10520 auto& qi
= slot
->waiting
.front();
10521 dout(20) << __func__
<< " " << pgid
10522 << " waiting item " << qi
10523 << " epoch " << qi
.get_map_epoch()
10524 << " <= " << new_osdmap
->get_epoch()
10526 << (qi
.get_map_epoch() < new_osdmap
->get_epoch() ? "stale" :
10528 << ", dropping" << dendl
;
10529 *pushes_to_free
+= qi
.get_reserved_pushes();
10530 slot
->waiting
.pop_front();
// Slot is fully idle (no waiters, nothing running, no split pending):
// erase it to keep pg_slots tight.
10533 if (slot
->waiting
.empty() &&
10534 slot
->num_running
== 0 &&
10535 slot
->waiting_for_split
.empty() &&
10537 dout(20) << __func__
<< " " << pgid
<< " empty, pruning" << dendl
;
10538 p
= pg_slots
.erase(p
);
// Wake one worker (presumably only if something was requeued -- the `queued`
// check line appears elided); notify_one suffices for a single wake-up.
10545 std::lock_guard l
{sdata_wait_lock
};
10546 sdata_cond
.notify_one();
// Requeue everything parked on a slot back onto the shard's scheduler:
// to_process, waiting, and waiting_peering are all drained via
// enqueue_front. Iteration is in reverse (rbegin/rend) so that after the
// repeated push-to-front the original ordering is preserved. Finally bumps
// requeue_seq so racing _process() calls can detect they were preempted.
// NOTE(review): extraction-mangled (loop increments/braces elided); the
// pgid parameter line is also missing from this view.
10550 void OSDShard::_wake_pg_slot(
10552 OSDShardPGSlot
*slot
)
10554 dout(20) << __func__
<< " " << pgid
10555 << " to_process " << slot
->to_process
10556 << " waiting " << slot
->waiting
10557 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
// Drain to_process first (reverse order + enqueue_front keeps FIFO order).
10558 for (auto i
= slot
->to_process
.rbegin();
10559 i
!= slot
->to_process
.rend();
10561 scheduler
->enqueue_front(std::move(*i
));
10563 slot
->to_process
.clear();
// Then the map-epoch waiters.
10564 for (auto i
= slot
->waiting
.rbegin();
10565 i
!= slot
->waiting
.rend();
10567 scheduler
->enqueue_front(std::move(*i
));
10569 slot
->waiting
.clear();
// Then all peering waiters, for every epoch bucket.
10570 for (auto i
= slot
->waiting_peering
.rbegin();
10571 i
!= slot
->waiting_peering
.rend();
10573 // this is overkill; we requeue everything, even if some of these
10574 // items are waiting for maps we don't have yet. FIXME, maybe,
10575 // someday, if we decide this inefficiency matters
10576 for (auto j
= i
->second
.rbegin(); j
!= i
->second
.rend(); ++j
) {
10577 scheduler
->enqueue_front(std::move(*j
));
10580 slot
->waiting_peering
.clear();
// Invalidate any in-flight _process() snapshot of this slot.
10581 ++slot
->requeue_seq
;
// For every slot on this shard, ask the OSD service which PGs will split or
// merge between the shard's current osdmap and as_of_osdmap. Slots with an
// attached PG report both splits and merges; slots that only exist because
// a split is pending report splits only (merge_pgs passed as nullptr).
// NOTE(review): the `if (slot->pg)` branch condition at original line 10594
// is elided by this extraction -- the else-if chain implies it; confirm
// upstream.
10584 void OSDShard::identify_splits_and_merges(
10585 const OSDMapRef
& as_of_osdmap
,
10586 set
<pair
<spg_t
,epoch_t
>> *split_pgs
,
10587 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10589 std::lock_guard
l(shard_lock
);
// Nothing to compare against until the shard has consumed a map.
10590 if (shard_osdmap
) {
10591 for (auto& i
: pg_slots
) {
10592 const spg_t
& pgid
= i
.first
;
10593 auto *slot
= i
.second
.get();
// Attached PG: report both split and merge candidates for this pgid.
10595 osd
->service
.identify_splits_and_merges(
10596 shard_osdmap
, as_of_osdmap
, pgid
,
10597 split_pgs
, merge_pgs
);
// Slot primed for a split but with no PG yet: splits only.
10598 } else if (!slot
->waiting_for_split
.empty()) {
10599 osd
->service
.identify_splits_and_merges(
10600 shard_osdmap
, as_of_osdmap
, pgid
,
10601 split_pgs
, nullptr);
10603 dout(20) << __func__
<< " slot " << pgid
10604 << " has no pg and waiting_for_split " << dendl
;
// Public entry for priming split children on this shard. Takes shard_lock,
// primes the given children, and -- if this shard's osdmap is already ahead
// of as_of_osdmap -- additionally computes grandchildren that appear between
// the two epochs and primes those too, so no split generation is missed.
// NOTE(review): extraction-mangled; trailing lines of the explanatory
// comment (after 10633) are cut off in this view.
10610 void OSDShard::prime_splits(const OSDMapRef
& as_of_osdmap
,
10611 set
<pair
<spg_t
,epoch_t
>> *pgids
)
10613 std::lock_guard
l(shard_lock
);
10614 _prime_splits(pgids
);
// Shard map newer than the caller's reference map: children may themselves
// have split again in the interim; find and prime those as well.
10615 if (shard_osdmap
->get_epoch() > as_of_osdmap
->get_epoch()) {
10616 set
<pair
<spg_t
,epoch_t
>> newer_children
;
10617 for (auto i
: *pgids
) {
10618 osd
->service
.identify_splits_and_merges(
10619 as_of_osdmap
, shard_osdmap
, i
.first
,
10620 &newer_children
, nullptr);
// Merge the original children in so one _prime_splits pass covers both.
10622 newer_children
.insert(pgids
->begin(), pgids
->end());
10623 dout(10) << "as_of_osdmap " << as_of_osdmap
->get_epoch() << " < shard "
10624 << shard_osdmap
->get_epoch() << ", new children " << newer_children
10626 _prime_splits(&newer_children
);
10627 // note: we don't care what is left over here for other shards.
10628 // if this shard is ahead of us and one isn't, e.g., one thread is
10629 // calling into prime_splits via _process (due to a newly created
10630 // pg) and this shard has a newer map due to a racing consume_map,
10631 // then any grandchildren left here will be identified (or were
10632 // identified) when the slower shard's osdmap is advanced.
10633 // _prime_splits() will tolerate the case where the pgid is
// Prime the subset of split children that hash to THIS shard: create (or
// reuse) a pg slot for each and record the split epoch in its
// waiting_for_split set, then erase the entry from *pgids. Entries that
// hash to other shards are left in *pgids for the caller to distribute.
// Presumably requires shard_lock held (prime_splits takes it before
// calling) -- confirm. NOTE(review): extraction elided the emplace-failure
// branch around 10650-10651 (where `q` is obtained) and loop braces.
10638 void OSDShard::_prime_splits(set
<pair
<spg_t
,epoch_t
>> *pgids
)
10640 dout(10) << *pgids
<< dendl
;
10641 auto p
= pgids
->begin();
10642 while (p
!= pgids
->end()) {
// Route by hash: only children belonging to this shard are handled here.
10643 unsigned shard_index
= p
->first
.hash_to_shard(osd
->num_shards
);
10644 if (shard_index
== shard_id
) {
10645 auto r
= pg_slots
.emplace(p
->first
, nullptr);
// New slot inserted: allocate it and mark the pending split epoch.
10647 dout(10) << "priming slot " << p
->first
<< " e" << p
->second
<< dendl
;
10648 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10649 r
.first
->second
->waiting_for_split
.insert(p
->second
);
// Existing slot: just add this split epoch to its pending set.
10652 ceph_assert(q
!= pg_slots
.end());
10653 dout(10) << "priming (existing) slot " << p
->first
<< " e" << p
->second
10655 q
->second
->waiting_for_split
.insert(p
->second
);
// Consumed: remove from the caller's set.
10657 p
= pgids
->erase(p
);
// For each (pgid, merge-epoch) pair that hashes to this shard, ensure there
// is a merge participant: reuse an existing attached PG, defer if a split
// for an earlier epoch is still pending, or fabricate an empty PG (history
// left zeroed for PG::merge_from() to fill) so the merge can proceed. The
// slot is stamped with waiting_for_merge_epoch and the entry is erased from
// *merge_pgs; pairs belonging to other shards are skipped for the caller.
// NOTE(review): extraction-mangled -- the branch conditions around
// 10686/10689 (`if (slot->pg)` etc.) and several braces are elided; confirm
// the control flow upstream before editing.
10664 void OSDShard::prime_merges(const OSDMapRef
& as_of_osdmap
,
10665 set
<pair
<spg_t
,epoch_t
>> *merge_pgs
)
10667 std::lock_guard
l(shard_lock
);
10668 dout(20) << __func__
<< " checking shard " << shard_id
10669 << " for remaining merge pgs " << merge_pgs
<< dendl
;
10670 auto p
= merge_pgs
->begin();
10671 while (p
!= merge_pgs
->end()) {
10672 spg_t pgid
= p
->first
;
10673 epoch_t epoch
= p
->second
;
// Entries for other shards are left in the set for their owners.
10674 unsigned shard_index
= pgid
.hash_to_shard(osd
->num_shards
);
10675 if (shard_index
!= shard_id
) {
10679 OSDShardPGSlot
*slot
;
10680 auto r
= pg_slots
.emplace(pgid
, nullptr);
10682 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10684 slot
= r
.first
->second
.get();
// Case 1: a PG is already attached -- it participates as-is.
10687 dout(20) << __func__
<< " have merge participant pg " << pgid
10688 << " " << slot
->pg
<< dendl
;
// Case 2: a split older than the merge epoch is still pending; wait for it.
10689 } else if (!slot
->waiting_for_split
.empty() &&
10690 *slot
->waiting_for_split
.begin() < epoch
) {
10691 dout(20) << __func__
<< " pending split on merge participant pg " << pgid
10692 << " " << slot
->waiting_for_split
<< dendl
;
// Case 3: no PG -- create an empty placeholder so the merge has a target.
10694 dout(20) << __func__
<< " creating empty merge participant " << pgid
10695 << " for merge in " << epoch
<< dendl
;
10696 // leave history zeroed; PG::merge_from() will fill it in.
10697 pg_history_t history
;
10698 PGCreateInfo
cinfo(pgid
, epoch
- 1,
10699 history
, PastIntervals(), false);
10700 PGRef pg
= osd
->handle_pg_create_info(shard_osdmap
, &cinfo
);
10701 _attach_pg(r
.first
->second
.get(), pg
.get());
10702 _wake_pg_slot(pgid
, slot
);
10705 // mark slot for merge
10706 dout(20) << __func__
<< " marking merge participant " << pgid
<< dendl
;
10707 slot
->waiting_for_merge_epoch
= epoch
;
10708 p
= merge_pgs
->erase(p
);
// Bind a freshly split child PG to the slot that _prime_splits() created for
// it. The slot must exist, have no PG attached, and be expecting a split at
// exactly this PG's osdmap epoch. Once the last expected split epoch is
// erased, the slot's parked work is requeued, a peering event is enqueued so
// the child catches up to the latest osdmap, and a worker is poked.
// NOTE(review): extraction elided several lines (the `epoch` declaration at
// 10726, the NullEvt payload of the PGPeeringEvent around 10740-10746, and
// assorted braces); compare with upstream.
10712 void OSDShard::register_and_wake_split_child(PG
*pg
)
10716 std::lock_guard
l(shard_lock
);
10717 dout(10) << pg
->pg_id
<< " " << pg
<< dendl
;
// The slot was pre-created by split priming; it must be found.
10718 auto p
= pg_slots
.find(pg
->pg_id
);
10719 ceph_assert(p
!= pg_slots
.end());
10720 auto *slot
= p
->second
.get();
10721 dout(20) << pg
->pg_id
<< " waiting_for_split " << slot
->waiting_for_split
// Invariants: no PG attached yet, and a split genuinely pending.
10723 ceph_assert(!slot
->pg
);
10724 ceph_assert(!slot
->waiting_for_split
.empty());
10725 _attach_pg(slot
, pg
);
// This child's creation epoch must be one of the expected split epochs.
10727 epoch
= pg
->get_osdmap_epoch();
10728 ceph_assert(slot
->waiting_for_split
.count(epoch
));
10729 slot
->waiting_for_split
.erase(epoch
);
// All expected splits done: release the slot's parked work.
10730 if (slot
->waiting_for_split
.empty()) {
10731 _wake_pg_slot(pg
->pg_id
, slot
);
10733 dout(10) << __func__
<< " still waiting for split on "
10734 << slot
->waiting_for_split
<< dendl
;
10738 // kick child to ensure it pulls up to the latest osdmap
10739 osd
->enqueue_peering_evt(
10742 std::make_shared
<PGPeeringEvent
>(
// Wake one worker to pick up the requeued/enqueued work.
10747 std::lock_guard l
{sdata_wait_lock
};
10748 sdata_cond
.notify_one();
// Undo split priming for all children of `parent` (as computed against
// old_pg_num): requeue whatever was parked on each child slot, then erase
// the slots. Deletion is deferred to a second loop because _wake_pg_slot is
// called while iterating pg_slots and erasing in-loop would invalidate the
// range-for iterator.
// NOTE(review): extraction-mangled (braces elided); verify upstream.
10751 void OSDShard::unprime_split_children(spg_t parent
, unsigned old_pg_num
)
10753 std::lock_guard
l(shard_lock
);
10754 vector
<spg_t
> to_delete
;
10755 for (auto& i
: pg_slots
) {
// A child is any pgid (other than the parent itself) whose ancestor at
// old_pg_num is the parent.
10756 if (i
.first
!= parent
&&
10757 i
.first
.get_ancestor(old_pg_num
) == parent
) {
10758 dout(10) << __func__
<< " parent " << parent
<< " clearing " << i
.first
10760 _wake_pg_slot(i
.first
, i
.second
.get());
10761 to_delete
.push_back(i
.first
);
// Second pass: safe to erase now that iteration is done.
10764 for (auto pgid
: to_delete
) {
10765 pg_slots
.erase(pgid
);
// Propagate a runtime configuration change to this shard's op scheduler,
// serialized against the shard's other scheduler users via shard_lock.
10769 void OSDShard::update_scheduler_config()
10771 std::lock_guard
l(shard_lock
);
10772 scheduler
->update_configuration();
// OSDShard constructor: derives the shard's lock/queue names from its id,
// builds the named mutexes, constructs the op scheduler (tuned by shard
// count and whether the backing store is rotational), and wires the context
// queue to this shard's wait lock/condvar. Logs the chosen scheduler at
// level 0 so the active scheduler is always visible in the log.
// NOTE(review): the parameter list (original lines 10776-10781) is elided by
// this extraction -- the initializer list references id, cct, osd; confirm
// the signature upstream.
10775 OSDShard::OSDShard(
10782 shard_name(string("OSDShard.") + stringify(id
)),
10783 sdata_wait_lock_name(shard_name
+ "::sdata_wait_lock"),
10784 sdata_wait_lock
{make_mutex(sdata_wait_lock_name
)},
10785 osdmap_lock
{make_mutex(shard_name
+ "::osdmap_lock")},
10786 shard_lock_name(shard_name
+ "::shard_lock"),
10787 shard_lock
{make_mutex(shard_lock_name
)},
10788 scheduler(ceph::osd::scheduler::make_scheduler(
10789 cct
, osd
->num_shards
, osd
->store
->is_rotational())),
10790 context_queue(sdata_wait_lock
, sdata_cond
)
10792 dout(0) << "using op scheduler " << *scheduler
<< dendl
;
10796 // =============================================================
10798 #undef dout_context
10799 #define dout_context osd->cct
10801 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
// Park a queued item on its pg slot until a newer osdmap arrives: peering
// items go into waiting_peering bucketed by the epoch they need (so
// consume_map can release exactly the ready buckets); everything else goes
// onto the slot's plain `waiting` list.
// NOTE(review): the pgid parameter line (original 10804) and the else/brace
// lines are elided by this extraction.
10803 void OSD::ShardedOpWQ::_add_slot_waiter(
10805 OSDShardPGSlot
*slot
,
10806 OpSchedulerItem
&& qi
)
10808 if (qi
.is_peering()) {
10809 dout(20) << __func__
<< " " << pgid
10810 << " peering, item epoch is "
10811 << qi
.get_map_epoch()
10812 << ", will wait on " << qi
<< dendl
;
// Bucket by required epoch so release can be per-epoch.
10813 slot
->waiting_peering
[qi
.get_map_epoch()].push_back(std::move(qi
));
// Non-peering items wait on the flat list.
10815 dout(20) << __func__
<< " " << pgid
10816 << " item epoch is "
10817 << qi
.get_map_epoch()
10818 << ", will wait on " << qi
<< dendl
;
10819 slot
->waiting
.push_back(std::move(qi
));
10824 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread main step for the sharded op queue. One invocation:
//  1. maps the thread to its shard and (for the lowest-indexed thread per
//     shard only, to keep oncommit ordering) drains the context queue;
//  2. sleeps on sdata_cond while the scheduler is empty, managing the
//     heartbeat timeout around the wait;
//  3. dequeues a WorkItem, handling the "scheduled in the future" variant
//     (a double wake-up time) by waiting until that time;
//  4. creates/looks up the pg slot for the item's ordering token, records
//     requeue_seq, releases shard_lock, (re)locks, and re-validates the slot
//     against races with pg removal / _wake_pg_slot / consume_map;
//  5. if the slot has no PG: either waits (split pending, newer map needed,
//     or pg should exist here), creates the PG for a peering create event
//     (then primes split children across all shards), or drops the item and
//     returns its reserved pushes / shares the map with the client;
//  6. otherwise runs the item under a TPHandle with tracepoints around it,
//     then processes any accumulated oncommit contexts.
// NOTE(review): this extraction has dropped many lines (braces, `return`s,
// `continue`s, pg->lock() around 10955, the `while(slot->to_process.empty())
// `/requeue handling, the OSDMapRef/reqid declarations, parts of the debug
// formatter dump). Treat the flow description above as reconstructed from
// what is visible -- verify against the pristine file before editing.
10826 void OSD::ShardedOpWQ::_process(uint32_t thread_index
, heartbeat_handle_d
*hb
)
10828 uint32_t shard_index
= thread_index
% osd
->num_shards
;
10829 auto& sdata
= osd
->shards
[shard_index
];
10830 ceph_assert(sdata
);
10832 // If all threads of shards do oncommits, there is a out-of-order
10833 // problem. So we choose the thread which has the smallest
10834 // thread_index(thread_index < num_shards) of shard to do oncommit
10836 bool is_smallest_thread_index
= thread_index
< osd
->num_shards
;
// Idle path: nothing schedulable and (for the designated thread) no
// contexts either -- wait on the shard's condvar.
10839 sdata
->shard_lock
.lock();
10840 if (sdata
->scheduler
->empty() &&
10841 (!is_smallest_thread_index
|| sdata
->context_queue
.empty())) {
10842 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10843 if (is_smallest_thread_index
&& !sdata
->context_queue
.empty()) {
10844 // we raced with a context_queue addition, don't wait
10845 wait_lock
.unlock();
10846 } else if (!sdata
->stop_waiting
) {
10847 dout(20) << __func__
<< " empty q, waiting" << dendl
;
// Suspend the heartbeat timeout while blocked -- an idle wait is not a hang.
10848 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10849 sdata
->shard_lock
.unlock();
10850 sdata
->sdata_cond
.wait(wait_lock
);
10851 wait_lock
.unlock();
10852 sdata
->shard_lock
.lock();
10853 if (sdata
->scheduler
->empty() &&
10854 !(is_smallest_thread_index
&& !sdata
->context_queue
.empty())) {
10855 sdata
->shard_lock
.unlock();
10858 // found a work item; reapply default wq timeouts
10859 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10860 timeout_interval
, suicide_interval
);
// stop_waiting set (shutdown/drain): bail out of this pass immediately.
10862 dout(20) << __func__
<< " need return immediately" << dendl
;
10863 wait_lock
.unlock();
10864 sdata
->shard_lock
.unlock();
// Only the designated lowest-index thread drains oncommit contexts, which
// preserves commit ordering across the shard.
10869 list
<Context
*> oncommits
;
10870 if (is_smallest_thread_index
) {
10871 sdata
->context_queue
.move_to(oncommits
);
// Dequeue until we hold a real OpSchedulerItem (the variant may instead
// carry a future-ready time as a double).
10874 WorkItem work_item
;
10875 while (!std::get_if
<OpSchedulerItem
>(&work_item
)) {
10876 if (sdata
->scheduler
->empty()) {
10877 if (osd
->is_stopping()) {
10878 sdata
->shard_lock
.unlock();
10879 for (auto c
: oncommits
) {
10880 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10883 return; // OSD shutdown, discard.
10885 sdata
->shard_lock
.unlock();
10886 handle_oncommits(oncommits
);
10890 work_item
= sdata
->scheduler
->dequeue();
10891 if (osd
->is_stopping()) {
10892 sdata
->shard_lock
.unlock();
10893 for (auto c
: oncommits
) {
10894 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10897 return; // OSD shutdown, discard.
10900 // If the work item is scheduled in the future, wait until
10901 // the time returned in the dequeue response before retrying.
10902 if (auto when_ready
= std::get_if
<double>(&work_item
)) {
10903 if (is_smallest_thread_index
) {
10904 sdata
->shard_lock
.unlock();
10905 handle_oncommits(oncommits
);
10908 std::unique_lock wait_lock
{sdata
->sdata_wait_lock
};
10909 auto future_time
= ceph::real_clock::from_double(*when_ready
);
10910 dout(10) << __func__
<< " dequeue future request at " << future_time
<< dendl
;
10911 // Disable heartbeat timeout until we find a non-future work item to process.
10912 osd
->cct
->get_heartbeat_map()->clear_timeout(hb
);
10913 sdata
->shard_lock
.unlock();
// waiting_threads lets _enqueue know someone can be woken early.
10914 ++sdata
->waiting_threads
;
10915 sdata
->sdata_cond
.wait_until(wait_lock
, future_time
);
10916 --sdata
->waiting_threads
;
10917 wait_lock
.unlock();
10918 sdata
->shard_lock
.lock();
10919 // Reapply default wq timeouts
10920 osd
->cct
->get_heartbeat_map()->reset_timeout(hb
,
10921 timeout_interval
, suicide_interval
);
10925 // Access the stored item
10926 auto item
= std::move(std::get
<OpSchedulerItem
>(work_item
));
10927 if (osd
->is_stopping()) {
10928 sdata
->shard_lock
.unlock();
10929 for (auto c
: oncommits
) {
10930 dout(10) << __func__
<< " discarding in-flight oncommit " << c
<< dendl
;
10933 return; // OSD shutdown, discard.
// All items with the same ordering token (pgid) serialize through one slot.
10936 const auto token
= item
.get_ordering_token();
10937 auto r
= sdata
->pg_slots
.emplace(token
, nullptr);
10939 r
.first
->second
= make_unique
<OSDShardPGSlot
>();
10941 OSDShardPGSlot
*slot
= r
.first
->second
.get();
10942 dout(20) << __func__
<< " " << token
10943 << (r
.second
? " (new)" : "")
10944 << " to_process " << slot
->to_process
10945 << " waiting " << slot
->waiting
10946 << " waiting_peering " << slot
->waiting_peering
10948 slot
->to_process
.push_back(std::move(item
));
10949 dout(20) << __func__
<< " " << slot
->to_process
.back()
10950 << " queued" << dendl
;
10953 PGRef pg
= slot
->pg
;
10955 // lock pg (if we have it)
// Snapshot requeue_seq before dropping shard_lock: if _wake_pg_slot runs
// while we hold only the pg lock, the seq changes and we must restart.
10957 // note the requeue seq now...
10958 uint64_t requeue_seq
= slot
->requeue_seq
;
10959 ++slot
->num_running
;
10961 sdata
->shard_lock
.unlock();
10962 osd
->service
.maybe_inject_dispatch_delay();
10964 osd
->service
.maybe_inject_dispatch_delay();
10965 sdata
->shard_lock
.lock();
// Re-validate everything after re-acquiring shard_lock.
10967 auto q
= sdata
->pg_slots
.find(token
);
10968 if (q
== sdata
->pg_slots
.end()) {
10969 // this can happen if we race with pg removal.
10970 dout(20) << __func__
<< " slot " << token
<< " no longer there" << dendl
;
10972 sdata
->shard_lock
.unlock();
10973 handle_oncommits(oncommits
);
10976 slot
= q
->second
.get();
10977 --slot
->num_running
;
10979 if (slot
->to_process
.empty()) {
10980 // raced with _wake_pg_slot or consume_map
10981 dout(20) << __func__
<< " " << token
10982 << " nothing queued" << dendl
;
10984 sdata
->shard_lock
.unlock();
10985 handle_oncommits(oncommits
);
10988 if (requeue_seq
!= slot
->requeue_seq
) {
10989 dout(20) << __func__
<< " " << token
10990 << " requeue_seq " << slot
->requeue_seq
<< " > our "
10991 << requeue_seq
<< ", we raced with _wake_pg_slot"
10994 sdata
->shard_lock
.unlock();
10995 handle_oncommits(oncommits
);
10998 if (slot
->pg
!= pg
) {
10999 // this can happen if we race with pg removal.
11000 dout(20) << __func__
<< " slot " << token
<< " no longer attached to "
11007 dout(20) << __func__
<< " " << token
11008 << " to_process " << slot
->to_process
11009 << " waiting " << slot
->waiting
11010 << " waiting_peering " << slot
->waiting_peering
<< dendl
;
11012 ThreadPool::TPHandle
tp_handle(osd
->cct
, hb
, timeout_interval
,
11016 auto qi
= std::move(slot
->to_process
.front());
11017 slot
->to_process
.pop_front();
11018 dout(20) << __func__
<< " " << qi
<< " pg " << pg
<< dendl
;
11019 set
<pair
<spg_t
,epoch_t
>> new_children
;
// ----- no PG attached: decide wait / create / drop -----
11023 // should this pg shard exist on this osd in this (or a later) epoch?
11024 osdmap
= sdata
->shard_osdmap
;
11025 const PGCreateInfo
*create_info
= qi
.creates_pg();
11026 if (!slot
->waiting_for_split
.empty()) {
11027 dout(20) << __func__
<< " " << token
11028 << " splitting " << slot
->waiting_for_split
<< dendl
;
11029 _add_slot_waiter(token
, slot
, std::move(qi
));
11030 } else if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11031 dout(20) << __func__
<< " " << token
11032 << " map " << qi
.get_map_epoch() << " > "
11033 << osdmap
->get_epoch() << dendl
;
11034 _add_slot_waiter(token
, slot
, std::move(qi
));
11035 } else if (qi
.is_peering()) {
11036 if (!qi
.peering_requires_pg()) {
11037 // for pg-less events, we run them under the ordering lock, since
11038 // we don't have the pg lock to keep them ordered.
11039 qi
.run(osd
, sdata
, pg
, tp_handle
);
11040 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
// Mon-initiated creates are only honored while we are still primary.
11042 if (create_info
->by_mon
&&
11043 osdmap
->get_pg_acting_primary(token
.pgid
) != osd
->whoami
) {
11044 dout(20) << __func__
<< " " << token
11045 << " no pg, no longer primary, ignoring mon create on "
11048 dout(20) << __func__
<< " " << token
11049 << " no pg, should create on " << qi
<< dendl
;
11050 pg
= osd
->handle_pg_create_info(osdmap
, create_info
);
11052 // we created the pg! drop out and continue "normally"!
11053 sdata
->_attach_pg(slot
, pg
.get());
11054 sdata
->_wake_pg_slot(token
, slot
);
11056 // identify split children between create epoch and shard epoch.
11057 osd
->service
.identify_splits_and_merges(
11058 pg
->get_osdmap(), osdmap
, pg
->pg_id
, &new_children
, nullptr);
11059 sdata
->_prime_splits(&new_children
);
11060 // distribute remaining split children to other shards below!
11063 dout(20) << __func__
<< " ignored create on " << qi
<< dendl
;
11066 dout(20) << __func__
<< " " << token
11067 << " no pg, peering, !create, discarding " << qi
<< dendl
;
11070 dout(20) << __func__
<< " " << token
11071 << " no pg, peering, doesn't map here e" << osdmap
->get_epoch()
11072 << ", discarding " << qi
11075 } else if (osdmap
->is_up_acting_osd_shard(token
, osd
->whoami
)) {
11076 dout(20) << __func__
<< " " << token
11077 << " no pg, should exist e" << osdmap
->get_epoch()
11078 << ", will wait on " << qi
<< dendl
;
11079 _add_slot_waiter(token
, slot
, std::move(qi
));
11081 dout(20) << __func__
<< " " << token
11082 << " no pg, shouldn't exist e" << osdmap
->get_epoch()
11083 << ", dropping " << qi
<< dendl
;
11084 // share map with client?
11085 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11086 osd
->service
.maybe_share_map((*_op
)->get_req()->get_connection().get(),
11087 sdata
->shard_osdmap
,
11088 (*_op
)->sent_epoch
);
// Dropped item: return its reserved recovery pushes to the service.
11090 unsigned pushes_to_free
= qi
.get_reserved_pushes();
11091 if (pushes_to_free
> 0) {
11092 sdata
->shard_lock
.unlock();
11093 osd
->service
.release_reserved_pushes(pushes_to_free
);
11094 handle_oncommits(oncommits
);
11098 sdata
->shard_lock
.unlock();
11099 handle_oncommits(oncommits
);
// ----- PG attached: re-check peering items against the current map -----
11102 if (qi
.is_peering()) {
11103 OSDMapRef osdmap
= sdata
->shard_osdmap
;
11104 if (qi
.get_map_epoch() > osdmap
->get_epoch()) {
11105 _add_slot_waiter(token
, slot
, std::move(qi
));
11106 sdata
->shard_lock
.unlock();
11108 handle_oncommits(oncommits
);
11112 sdata
->shard_lock
.unlock();
// Hand any split children discovered during pg creation to their shards.
11114 if (!new_children
.empty()) {
11115 for (auto shard
: osd
->shards
) {
11116 shard
->prime_splits(osdmap
, &new_children
);
11118 ceph_assert(new_children
.empty());
11121 // osd_opwq_process marks the point at which an operation has been dequeued
11122 // and will begin to be handled by a worker thread.
11126 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11127 reqid
= (*_op
)->get_reqid();
11130 tracepoint(osd
, opwq_process_start
, reqid
.name
._type
,
11131 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
// Level-30 diagnostic dump of queue state as JSON.
11134 lgeneric_subdout(osd
->cct
, osd
, 30) << "dequeue status: ";
11135 Formatter
*f
= Formatter::create("json");
11136 f
->open_object_section("q");
11138 f
->close_section();
// Execute the item (op, peering event, recovery work, ...).
11143 qi
.run(osd
, sdata
, pg
, tp_handle
);
11148 if (std::optional
<OpRequestRef
> _op
= qi
.maybe_get_op()) {
11149 reqid
= (*_op
)->get_reqid();
11152 tracepoint(osd
, opwq_process_finish
, reqid
.name
._type
,
11153 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
11156 handle_oncommits(oncommits
);
// Route an item to its shard (by hashing the ordering token over the shard
// count) and enqueue it on that shard's scheduler. If the queue was empty,
// every sleeping worker is woken (notify_all); otherwise only one is poked,
// and only if some thread is actually parked (waiting_threads > 0).
// NOTE(review): the `empty` declaration and the `if (empty)` line around the
// notify choice are elided by this extraction; the visible else-if implies
// them -- confirm upstream.
11159 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem
&& item
) {
11160 uint32_t shard_index
=
11161 item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11163 dout(20) << __func__
<< " " << item
<< dendl
;
11165 OSDShard
* sdata
= osd
->shards
[shard_index
];
11166 assert (NULL
!= sdata
);
// Record emptiness before enqueueing so the wake-up policy below can tell
// whether workers might all be asleep.
11170 std::lock_guard l
{sdata
->shard_lock
};
11171 empty
= sdata
->scheduler
->empty();
11172 sdata
->scheduler
->enqueue(std::move(item
));
11176 std::lock_guard l
{sdata
->sdata_wait_lock
};
11178 sdata
->sdata_cond
.notify_all();
11179 } else if (sdata
->waiting_threads
) {
11180 sdata
->sdata_cond
.notify_one();
// Requeue an item at the FRONT of its shard's queue (used for retries that
// must not lose their place). If the slot already has items in to_process
// (a racing _process snapshot), swap with the back of that list so this
// older item is ordered before the newer one, then push the displaced item
// to the scheduler front. Always wakes exactly one worker.
11185 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem
&& item
)
11187 auto shard_index
= item
.get_ordering_token().hash_to_shard(osd
->shards
.size());
11188 auto& sdata
= osd
->shards
[shard_index
];
11189 ceph_assert(sdata
);
11190 sdata
->shard_lock
.lock();
11191 auto p
= sdata
->pg_slots
.find(item
.get_ordering_token());
11192 if (p
!= sdata
->pg_slots
.end() &&
11193 !p
->second
->to_process
.empty()) {
11194 // we may be racing with _process, which has dequeued a new item
11195 // from scheduler, put it on to_process, and is now busy taking the
11196 // pg lock. ensure this old requeued item is ordered before any
11197 // such newer item in to_process.
11198 p
->second
->to_process
.push_front(std::move(item
));
11199 item
= std::move(p
->second
->to_process
.back());
11200 p
->second
->to_process
.pop_back();
11201 dout(20) << __func__
11202 << " " << p
->second
->to_process
.front()
11203 << " shuffled w/ " << item
<< dendl
;
11205 dout(20) << __func__
<< " " << item
<< dendl
;
// `item` is now either the original (no race) or the displaced newer item.
11207 sdata
->scheduler
->enqueue_front(std::move(item
));
11208 sdata
->shard_lock
.unlock();
11209 std::lock_guard l
{sdata
->sdata_wait_lock
};
11210 sdata
->sdata_cond
.notify_one();
// Admin-socket helper namespace: `heap` dispatches tcmalloc heap-profiler
// commands. Rejects with -EOPNOTSUPP when not built/running with tcmalloc,
// extracts "heapcmd" (and optional "value") from the cmdmap, and hands the
// resulting argv to ceph_heap_profiler_handle_command.
// NOTE(review): the trailing signature parameter(s) (after `Formatter& f,`),
// the `cmd`/`val` declarations, and the return statements after the error
// branches and at the end are elided by this extraction -- confirm upstream.
11213 namespace ceph::osd_cmds
{
11215 int heap(CephContext
& cct
, const cmdmap_t
& cmdmap
, Formatter
& f
,
// Heap profiling is a tcmalloc feature; bail out cleanly otherwise.
11218 if (!ceph_using_tcmalloc()) {
11219 os
<< "could not issue heap profiler command -- not using tcmalloc!";
11220 return -EOPNOTSUPP
;
11224 if (!cmd_getval(cmdmap
, "heapcmd", cmd
)) {
11225 os
<< "unable to get value for command \"" << cmd
<< "\"";
// Split the command string into an argv-style vector.
11229 std::vector
<std::string
> cmd_vec
;
11230 get_str_vec(cmd
, cmd_vec
);
// Optional extra argument (e.g. a dump filename).
11233 if (cmd_getval(cmdmap
, "value", val
)) {
11234 cmd_vec
.push_back(val
);
11237 ceph_heap_profiler_handle_command(cmd_vec
, os
);
11242 } // namespace ceph::osd_cmds